In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
scaler = MinMaxScaler()
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, LarsCV
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

def evaluate(model,xtrain,xtest,ytrain,ytest):
    """Fit ``model`` on the training split, print test-set metrics, and return it.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    xtrain, ytrain
        Training features and targets handed to ``model.fit``.
    xtest, ytest
        Held-out features and targets used for scoring.

    Returns
    -------
    tuple
        ``(rmse, model)``: the test-set root mean squared error and the
        fitted estimator.
    """
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)

    # ``mean_squared_error(..., squared=False)`` was deprecated in scikit-learn 1.4
    # and removed in 1.6, so RMSE is derived from the per-target MSE instead.
    # Averaging after the square root matches what ``squared=False`` returned,
    # and averaging the raw values matches the default ``uniform_average`` MSE.
    mse_per_target = mean_squared_error(ytest, ypred, multioutput='raw_values')
    mse = mse_per_target.mean()
    rmse = np.sqrt(mse_per_target).mean()
    mae = mean_absolute_error(ytest, ypred)
    r2 = r2_score(ytest, ypred)

    print(f'Mean Squared Error (MSE): {mse}')
    print(f'Root Mean Squared Error (RMSE): {rmse}')
    print(f'Mean Absolute Error (MAE): {mae}')
    print(f'R-squared (R²): {r2}')
    return (rmse,model)

## first approach

In [None]:
# Load the training data; the relative path is resolved against the kernel's working directory.
data = pd.read_csv('train.csv')

In [None]:
# Preview the first two rows of the loaded frame.
data.head(2)

In [None]:
from tqdm.auto import tqdm

# Replace each categorical column with integer codes. The fitted encoder for
# every column is kept in `label_encoders` so the mapping can be looked up later.
categorical_columns = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years']
label_encoders = {}
for column in tqdm(categorical_columns):
    encoder = label_encoders[column] = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])

# Fill missing values in the two optional product-category columns with the
# per-column median.
imputed_columns = ['Product_Category_2', 'Product_Category_3']
imputer = SimpleImputer(strategy='median')
data[imputed_columns] = imputer.fit_transform(data[imputed_columns])



In [None]:

def _purchase_stats(frame, key, prefix):
    """Return per-``key`` count/mean/sum of ``Purchase`` with ``prefix``-ed column names."""
    stats = frame.groupby(key).agg(
        {'Purchase': ['count', 'mean', 'sum']}).reset_index()
    stats.columns = [key, f'{prefix}_Purchase_Count',
                     f'{prefix}_Purchase_Mean', f'{prefix}_Purchase_Sum']
    return stats


# These aggregates are built from the target column, so they may only see rows
# that end up in the training split. Computing them over the whole frame leaks
# the `Purchase` values of test rows into the features used to predict them.
# train_test_split picks rows by position from (n_samples, train_size,
# random_state), so repeating the arguments of the later x/y split selects the
# same training rows. Keep these arguments in sync with that split.
train_positions, _ = train_test_split(
    np.arange(len(data)), train_size=0.8, random_state=42)
train_rows = data.iloc[train_positions]

user_agg = _purchase_stats(train_rows, 'User_ID', 'User')
product_agg = _purchase_stats(train_rows, 'Product_ID', 'Product')

data = data.merge(user_agg, on='User_ID', how='left')
data = data.merge(product_agg, on='Product_ID', how='left')

# Users/products absent from the training rows get no match in the left merge.
# Give them a zero count and sum, and fall back to the overall training mean.
overall_train_mean = train_rows['Purchase'].mean()
for prefix in ('User', 'Product'):
    total_columns = [f'{prefix}_Purchase_Count', f'{prefix}_Purchase_Sum']
    data[total_columns] = data[total_columns].fillna(0)
    data[f'{prefix}_Purchase_Mean'] = data[f'{prefix}_Purchase_Mean'].fillna(
        overall_train_mean)


In [None]:
# The raw identifiers are not used as model inputs; remove them from the frame.
data = data.drop(columns=['Product_ID', 'User_ID'])

In [None]:
# `Purchase` is the regression target; every other remaining column is a feature.
y = data['Purchase']
x = data.drop(columns='Purchase')

In [None]:
# Hold out 20% of the rows with a fixed seed, then min-max scale both parts
# using ranges learned from the training part only.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size=0.8, random_state=42)
scaler.fit(xtrain)
xtrain, xtest = scaler.transform(xtrain), scaler.transform(xtest)

In [None]:
# Fit and score every candidate regressor on the first-approach features.
# Each entry of `l` is the (rmse, fitted model) pair returned by evaluate().
l = []
for estimator_cls in (LinearRegression, GradientBoostingRegressor, CatBoostRegressor,
                      XGBRegressor, Ridge, LarsCV):
    model = estimator_cls()
    l.append(evaluate(model, xtrain, xtest, ytrain, ytest))

## second approach

In [None]:
# Reload the CSV so this approach starts from unmodified data; the first
# approach encoded, imputed, and dropped columns on the previous `data` frame.
data = pd.read_csv('train.csv')

In [None]:
# Fill missing values in the optional product-category columns with each
# column's most frequent value.
for category_col in ('Product_Category_2', 'Product_Category_3'):
    data[category_col] = data[category_col].fillna(data[category_col].mode()[0])

# One-hot encode the nominal columns, dropping the first level of each so the
# dummy columns are not perfectly collinear.
data = pd.get_dummies(data, columns=['Gender','Age','City_Category'], drop_first=True, dtype=int, sparse=False)

# Keep only the leading run of digits as an integer (presumably to turn values
# such as '4+' into 4; confirm against the data). The pattern is a raw string
# because '\d' is an invalid escape in a normal literal, and expand=False makes
# extract() return a Series rather than a one-column DataFrame.
data['Stay_In_Current_City_Years'] = (
    data['Stay_In_Current_City_Years']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# The raw identifiers are not used as model inputs in this approach.
data = data.drop(columns=['Product_ID', 'User_ID'])

In [None]:
# Separate the `Purchase` target from the remaining feature columns.
y = data['Purchase']
x = data.drop(columns='Purchase')

In [None]:
# Same seeded 80/20 hold-out as before; the shared scaler is refitted on this
# approach's training features and then applied to both parts.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
scaler.fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

In [None]:
# Fit and score the same set of regressors on the second-approach features.
# Each entry of `m` is the (rmse, fitted model) pair returned by evaluate().
m = []
for estimator_cls in (LinearRegression, GradientBoostingRegressor, CatBoostRegressor,
                      XGBRegressor, Ridge, LarsCV):
    model = estimator_cls()
    m.append(evaluate(model, xtrain, xtest, ytrain, ytest))

In [None]:
# (rmse, fitted model) pairs collected for the first approach.
l

In [None]:
# (rmse, fitted model) pairs collected for the second approach.
m