In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import warnings
from sklearn.neighbors import LocalOutlierFactor

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("./customer_shopping_data.csv")

In [4]:
df.head()

Unnamed: 0,gender,age,category,price,payment_method,shopping_mall
0,Female,28,Clothing,1500.4,Credit Card,Kanyon
1,Male,21,Shoes,1800.51,Debit Card,Forum Istanbul
2,Male,20,Clothing,300.08,Cash,Metrocity
3,Female,66,Shoes,3000.85,Credit Card,Metropol AVM
4,Female,53,Books,60.6,Cash,Kanyon


In [5]:
df.drop_duplicates()

Unnamed: 0,gender,age,category,price,payment_method,shopping_mall
0,Female,28,Clothing,1500.40,Credit Card,Kanyon
1,Male,21,Shoes,1800.51,Debit Card,Forum Istanbul
2,Male,20,Clothing,300.08,Cash,Metrocity
3,Female,66,Shoes,3000.85,Credit Card,Metropol AVM
4,Female,53,Books,60.60,Cash,Kanyon
...,...,...,...,...,...,...
99444,Male,24,Toys,35.84,Cash,Mall of Istanbul
99447,Female,37,Toys,107.52,Cash,Metropol AVM
99450,Female,28,Books,15.15,Cash,Zorlu Center
99451,Male,50,Toys,179.20,Cash,Metropol AVM


In [6]:
df.to_csv("customer_shopping_data.csv", sep=',', index=False, encoding='utf-8')

# Data Preparation

### Variable Transformation

In [7]:
df['gender'] = LabelEncoder().fit_transform(df['gender'])

In [8]:
df = pd.get_dummies(df, prefix=['payment_method', 'category', 'shopping_mall'],
                    columns=['payment_method', 'category', 'shopping_mall'], drop_first=True)

In [9]:
df.head()

Unnamed: 0,gender,age,price,payment_method_Credit Card,payment_method_Debit Card,category_Clothing,category_Cosmetics,category_Food & Beverage,category_Shoes,category_Souvenir,...,category_Toys,shopping_mall_Emaar Square Mall,shopping_mall_Forum Istanbul,shopping_mall_Istinye Park,shopping_mall_Kanyon,shopping_mall_Mall of Istanbul,shopping_mall_Metrocity,shopping_mall_Metropol AVM,shopping_mall_Viaport Outlet,shopping_mall_Zorlu Center
0,0,28,1500.4,True,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,1,21,1800.51,False,True,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,False
2,1,20,300.08,False,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,0,66,3000.85,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
4,0,53,60.6,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


### Missing Data

In [10]:
df.isnull().sum()

gender                             0
age                                0
price                              0
payment_method_Credit Card         0
payment_method_Debit Card          0
category_Clothing                  0
category_Cosmetics                 0
category_Food & Beverage           0
category_Shoes                     0
category_Souvenir                  0
category_Technology                0
category_Toys                      0
shopping_mall_Emaar Square Mall    0
shopping_mall_Forum Istanbul       0
shopping_mall_Istinye Park         0
shopping_mall_Kanyon               0
shopping_mall_Mall of Istanbul     0
shopping_mall_Metrocity            0
shopping_mall_Metropol AVM         0
shopping_mall_Viaport Outlet       0
shopping_mall_Zorlu Center         0
dtype: int64

### Outliers

In [11]:
clf = LocalOutlierFactor(n_neighbors=10, contamination='auto')
clf.fit_predict(df)

array([1, 1, 1, ..., 1, 1, 1])

In [12]:
df_scores = clf.negative_outlier_factor_
df_scores

array([-1.        , -1.04908584, -1.        , ..., -1.05298166,
       -1.04588532, -0.97764587])

In [13]:
np.sort(df_scores[:])

array([-1.00000000e+10, -9.00000000e+09, -9.00000000e+09, ...,
       -8.54303670e-01, -8.54059689e-01, -8.38079354e-01])

In [14]:
tolerance = np.sort(df_scores[:])[10]

In [15]:
outlier_boolean = df_scores < tolerance
not_outlier_boolean = df_scores > tolerance

In [16]:
df[outlier_boolean]

Unnamed: 0,gender,age,price,payment_method_Credit Card,payment_method_Debit Card,category_Clothing,category_Cosmetics,category_Food & Beverage,category_Shoes,category_Souvenir,...,category_Toys,shopping_mall_Emaar Square Mall,shopping_mall_Forum Istanbul,shopping_mall_Istinye Park,shopping_mall_Kanyon,shopping_mall_Mall of Istanbul,shopping_mall_Metrocity,shopping_mall_Metropol AVM,shopping_mall_Viaport Outlet,shopping_mall_Zorlu Center
22299,0,69,1500.4,True,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
26490,1,38,1500.4,False,False,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
28026,0,61,20.92,False,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
33876,0,61,20.92,False,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
46661,0,64,900.24,False,True,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
51514,0,61,20.92,True,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
56406,1,61,20.92,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
72425,1,61,20.92,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
81637,1,61,20.92,False,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
88310,0,45,1500.4,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
df = df[not_outlier_boolean]

### Df

In [18]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gender,99438.0,0.401919,0.490288,0.0,0.0,0.0,1.0,1.0
age,99438.0,43.425984,14.989547,18.0,30.0,43.0,56.0,69.0
price,99438.0,689.214745,941.229503,5.23,45.45,203.3,1200.32,5250.0


# Models

In [19]:
results = []
alg_names = []

In [20]:
def implementMlAlgorithm(alg, Xtr, ytr, Xte, yte):
    model = alg().fit(Xtr, ytr)
    score = np.sqrt(mean_squared_error(yte, model.predict(Xte)))

    results.append(score)
    alg_names.append(alg.__name__)

    # print("For " + alg.__name__, " pre RMSE: ", score)

In [21]:
algorithms = [Ridge, Lasso, ElasticNet, LinearRegression, LGBMRegressor,
              XGBRegressor, GradientBoostingRegressor, RandomForestRegressor,
              DecisionTreeRegressor, MLPRegressor, KNeighborsRegressor, CatBoostRegressor]
# SVR took lots of time

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['price'], axis=1), df['price'], test_size=0.25, random_state=5)

In [24]:
for algorithm in algorithms:
    implementMlAlgorithm(algorithm, X_train, y_train, X_test, y_test)

Learning rate set to 0.080919
0:	learn: 887.4013969	total: 147ms	remaining: 2m 27s
1:	learn: 838.8229679	total: 152ms	remaining: 1m 15s
2:	learn: 795.6170397	total: 157ms	remaining: 52.3s
3:	learn: 757.1493504	total: 163ms	remaining: 40.5s
4:	learn: 723.0524310	total: 168ms	remaining: 33.4s
5:	learn: 692.9241372	total: 173ms	remaining: 28.6s
6:	learn: 666.3170548	total: 178ms	remaining: 25.2s
7:	learn: 643.0394596	total: 182ms	remaining: 22.6s
8:	learn: 622.7468579	total: 188ms	remaining: 20.7s
9:	learn: 604.9632460	total: 194ms	remaining: 19.2s
10:	learn: 589.5678461	total: 198ms	remaining: 17.8s
11:	learn: 576.2448822	total: 204ms	remaining: 16.8s
12:	learn: 564.7412429	total: 208ms	remaining: 15.8s
13:	learn: 554.7711455	total: 214ms	remaining: 15.1s
14:	learn: 546.2032320	total: 220ms	remaining: 14.4s
15:	learn: 538.8842385	total: 226ms	remaining: 13.9s
16:	learn: 532.5632807	total: 231ms	remaining: 13.4s
17:	learn: 527.2294004	total: 236ms	remaining: 12.9s
18:	learn: 522.6136769	t

In [25]:
result_df = pd.DataFrame({
    "Algorithm": alg_names,
    "Scores": results
})

In [26]:
result_df

Unnamed: 0,Algorithm,Scores
0,Ridge,491.669017
1,Lasso,491.884973
2,ElasticNet,849.086229
3,LinearRegression,491.669536
4,LGBMRegressor,495.629768
5,XGBRegressor,505.427566
6,GradientBoostingRegressor,492.020658
7,RandomForestRegressor,537.480827
8,DecisionTreeRegressor,581.243661
9,MLPRegressor,491.55634


In [27]:
result_df['Scores'].mean()

540.1789618122681

In [28]:
result_df[result_df['Scores'] == result_df['Scores'].min()]

Unnamed: 0,Algorithm,Scores
9,MLPRegressor,491.55634


## MLP Regressor

In [27]:
mlp = MLPRegressor()

In [28]:
?mlp

In [29]:
scaler = StandardScaler()

scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

scaler.fit(X_test)
X_test_scaled = scaler.transform(X_test)

In [30]:
params = {"solver": ["lbfgs", "adam"],
          "learning_rate": ["constant", "adaptive"],
          "alpha": [1, 0.1, 0.01, 0.001],
          "hidden_layer_sizes": [(10, 10, 10), (10, 10), (3, 5)]}

In [31]:
mlpc_cv_model = GridSearchCV(MLPRegressor(), params, cv=10, n_jobs=-1, verbose=2).fit(X_train_scaled, y_train)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [32]:
mlpc_cv_model.best_params_

{'alpha': 0.1,
 'hidden_layer_sizes': (10, 10),
 'learning_rate': 'constant',
 'solver': 'adam'}

In [45]:
tuned_model = MLPRegressor(alpha=0.1, hidden_layer_sizes=(10,10), learning_rate="constant", solver="adam").fit(X_train, y_train)

In [47]:
np.sqrt(mean_squared_error(y_test, tuned_model.predict(X_test)))

491.81314476330226