# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import os

# Load dataset
data = pd.read_csv('/content/retail_price.csv')

# --- Data Preprocessing ---
top_10_features = [
    'comp_1', 'product_category_name', 'comp_2', 'comp_3', 'freight_price',
    'product_weight_g', 'qty', 'product_score', 'month', 'volume'
]
# Columns that are label-encoded below instead of being treated as continuous.
categorical_columns = ['product_category_name', 'month']

# Keep only the modelling columns. Rows without a target are dropped rather
# than mean-filled (an imputed label is a fabricated label), and the explicit
# copy makes the later column assignments act on an independent frame.
data = data[top_10_features + ['unit_price']].dropna(subset=['unit_price']).copy()

# Continuous feature columns eligible for mean imputation. The target and the
# categorical columns are excluded: a mean-filled 'month' would be encoded as
# its own (meaningless) category.
numeric_cols = data.select_dtypes(include=[np.number]).columns.drop(
    categorical_columns + ['unit_price'], errors='ignore'
)

# Convert categorical variables into numeric values using label encoding.
# Values are cast to str first, so a missing entry becomes the category 'nan'.
# NOTE(review): LabelEncoder orders classes by their string form, so if 'month'
# is stored as integers the codes follow lexicographic order ('1', '10', '11',
# ...) rather than calendar order -- confirm this is acceptable.
label_encoders = {}
original_values = {}
for col in categorical_columns:
    le = LabelEncoder()
    col_as_text = data[col].astype(str)
    original_values[col] = col_as_text.unique()  # shown to the user as valid choices
    data[col] = le.fit_transform(col_as_text)
    label_encoders[col] = le

# Define features and target
X = data.drop(columns=['unit_price'])
y = data['unit_price']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: the fill values are learned from the training split
# only and then applied to both splits, so no test-set statistics leak into
# training. fillna with a Series fills each column named in its index.
train_means = X_train[numeric_cols].mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# --- Hyperparameter Tuning with GridSearchCV ---
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Define parameter grid (3 values for each of 6 parameters = 729 candidates,
# each evaluated with 3-fold cross-validation).
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_lambda': [1, 2, 3]
}

# Perform grid search. refit=True (the default, spelled out because the code
# below relies on it) retrains the best configuration on all of X_train once
# the search finishes.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    refit=True,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Get best parameters and the final model
best_params = grid_search.best_params_
print("Best parameters found:", best_params)

# best_estimator_ is a clone of the estimator above (same objective and
# random_state) with best_params applied and already fitted on X_train, so
# building and fitting a second identical model would only repeat that work.
xgb_model = grid_search.best_estimator_

# Calculate RMSE on test set
y_pred = xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE on test set with tuned parameters: {rmse:.2f}")

# --- User Input for Prediction ---
def get_user_input():
    """Interactively collect one row of feature values for prediction.

    Categorical features are prompted first; each answer must be one of the
    values recorded in ``original_values`` and is converted with the matching
    fitted encoder from ``label_encoders``. Every remaining column of ``X`` is
    then prompted for as a finite number. Each prompt repeats until the answer
    is valid.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns are in the same order
        as ``X.columns``.
    """
    print("\nEnter values for the following 10 features to predict the unit price:")

    user_data = {}

    for col in categorical_columns:
        valid_values = original_values[col]
        allowed = set(valid_values)
        print(f"\nValid values for {col}: {valid_values}")
        while True:
            raw = input(f"Enter {col}: ")
            # Try the text exactly as typed first (a category may itself
            # contain surrounding whitespace), then with whitespace trimmed.
            value = raw if raw in allowed else raw.strip()
            if value in allowed:
                break
            print(f"Invalid input. Choose from: {valid_values}")
        user_data[col] = label_encoders[col].transform([value])[0]

    # Numerical features: derived from the training columns so the prompts
    # cannot drift out of sync with what the model was fitted on.
    numerical_columns = [col for col in X.columns if col not in categorical_columns]
    for col in numerical_columns:
        while True:
            try:
                value = float(input(f"Enter {col}: "))
            except ValueError:
                print("Invalid input. Please enter a number.")
                continue
            # float() also parses 'nan', 'inf' and '-inf'; none of them is a
            # usable feature value, so ask again.
            if not np.isfinite(value):
                print("Invalid input. Please enter a number.")
                continue
            user_data[col] = value
            break

    # Create DataFrame with correct column order
    user_df = pd.DataFrame([user_data], columns=X.columns)
    return user_df

# Get user input and predict
# Runs at module level: prompts for one row of feature values, scores it with
# the tuned model and prints the result.
print("\nPredicting price for user input...")
user_input = get_user_input()  # single-row DataFrame ordered like X.columns
predicted_price = xgb_model.predict(user_input)
# Only one row was passed in, so element 0 is the prediction for that row.
print(f"\nPredicted Unit Price: {predicted_price[0]:.2f}")