In [1]:
import pandas as pd
from sklearn.preprocessing import scale, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Read the pre-processed shopping data and keep only rows without missing values.
df = (
    pd.read_csv("./processed_shopping_data.csv")
    .dropna()
)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,gender,age,category,price
0,Female,28,Clothing,300.08
1,Male,21,Shoes,600.17
2,Male,20,Clothing,300.08
3,Female,66,Shoes,600.17
4,Female,53,Books,15.15


In [5]:
# First occurrence of each distinct category label, keeping its original row index.
df.loc[~df['category'].duplicated(keep='first'), 'category']

0            Clothing
1               Shoes
4               Books
6           Cosmetics
10    Food & Beverage
12               Toys
23         Technology
67           Souvenir
Name: category, dtype: object

In [6]:
# One-hot encode `gender`; drop_first=True keeps a single indicator column
# (shown as `gender_Male` in the frame below) instead of one per level.
df = pd.get_dummies(
    data=df,
    columns=['gender'],
    prefix=['gender'],
    drop_first=True,
)

In [7]:
# Display the frame after one-hot encoding: the string `gender` column has been
# replaced by the boolean `gender_Male` indicator.
df

Unnamed: 0,age,category,price,gender_Male
0,28,Clothing,300.08,False
1,21,Shoes,600.17,True
2,20,Clothing,300.08,True
3,66,Shoes,600.17,False
4,53,Books,15.15,False
...,...,...,...,...
99451,27,Food & Beverage,5.23,True
99452,23,Food & Beverage,5.23,False
99453,63,Food & Beverage,5.23,True
99454,56,Technology,1050.00,True


In [8]:
# Fit the encoder as a named object rather than a throwaway instance, so that
# `category_encoder.classes_` / `category_encoder.inverse_transform(...)` can map
# the integer codes (and later model predictions) back to category names.
category_encoder = LabelEncoder()

# Keep the original string labels; the column is overwritten on the next line.
# NOTE: re-running this cell would capture the already-encoded integers here,
# so re-execute from the data-loading cell if the labels are needed again.
_category = df['category']

# Replace the string labels with their integer codes (the model target).
df['category'] = category_encoder.fit_transform(_category)

In [9]:
# Features are every column except the encoded target; the target is that column.
X = df.drop(columns='category')
y = df['category']

In [10]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
    test_size=0.25,
)

In [11]:
# Parallel accumulators filled by compute_ml: results[i] is the accuracy
# recorded for the estimator class named alg_names[i].
results, alg_names = [], []

In [12]:
def compute_ml(alg):
    """Fit ``alg`` with its default settings and record its hold-out accuracy.

    The estimator class is instantiated with no arguments, fitted on the
    notebook-level ``X_train`` / ``y_train`` and scored with ``accuracy_score``
    on ``X_test`` / ``y_test``. The score is stored in the notebook-level
    ``results`` list and the class name in ``alg_names`` at the same position.

    Calling the function again for a class that is already recorded replaces
    its stored score instead of appending a second entry, so the comparison
    loop can be re-executed without first resetting the two lists.

    Parameters
    ----------
    alg : type
        Estimator class; it must be constructible without arguments and its
        instances must provide ``fit(X, y)`` (returning the fitted estimator)
        and ``predict(X)``.

    Returns
    -------
    float
        Accuracy of the fitted estimator on the hold-out split.
    """
    model = alg().fit(X_train, y_train)
    score = accuracy_score(y_test, model.predict(X_test))
    name = alg.__name__

    if name in alg_names:
        # Re-run for a known estimator: overwrite so the summary keeps one row per class.
        results[alg_names.index(name)] = score
    else:
        results.append(score)
        alg_names.append(name)
    return score

In [13]:
# Estimator classes to compare, each evaluated with its default settings.
# The order here is the row order of the results table built afterwards.
algorithms = [
    # boosted / tree-based models
    LGBMClassifier,
    XGBClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    DecisionTreeClassifier,
    # neural network, nearest neighbours, kernel and linear models
    MLPClassifier,
    KNeighborsClassifier,
    SVC,
    LogisticRegression,
    # CatBoost goes last
    CatBoostClassifier,
]

In [14]:
# Fit and score every candidate; compute_ml stores each score in `results`
# and the matching class name in `alg_names`.
for algorithm in algorithms:
    compute_ml(alg=algorithm)

Learning rate set to 0.098589
0:	learn: 1.4687775	total: 152ms	remaining: 2m 31s
1:	learn: 1.1824203	total: 177ms	remaining: 1m 28s
2:	learn: 0.9767925	total: 201ms	remaining: 1m 6s
3:	learn: 0.8289758	total: 226ms	remaining: 56.3s
4:	learn: 0.7153663	total: 251ms	remaining: 50s
5:	learn: 0.6211169	total: 278ms	remaining: 46s
6:	learn: 0.5440861	total: 307ms	remaining: 43.5s
7:	learn: 0.4789444	total: 332ms	remaining: 41.1s
8:	learn: 0.4233577	total: 352ms	remaining: 38.8s
9:	learn: 0.3762622	total: 375ms	remaining: 37.1s
10:	learn: 0.3355444	total: 397ms	remaining: 35.7s
11:	learn: 0.2995442	total: 418ms	remaining: 34.4s
12:	learn: 0.2681458	total: 440ms	remaining: 33.4s
13:	learn: 0.2406616	total: 466ms	remaining: 32.8s
14:	learn: 0.2162713	total: 492ms	remaining: 32.3s
15:	learn: 0.1947321	total: 517ms	remaining: 31.8s
16:	learn: 0.1756543	total: 538ms	remaining: 31.1s
17:	learn: 0.1584886	total: 557ms	remaining: 30.4s
18:	learn: 0.1431499	total: 577ms	remaining: 29.8s
19:	learn: 0.

In [15]:
# Collect the recorded names and accuracies into one summary table.
result_df = pd.DataFrame(
    data={"Scores": results, "Algorithm": alg_names},
    columns=["Algorithm", "Scores"],
)

In [16]:
# Rank the estimators from lowest to highest hold-out accuracy.
result_df.sort_values(by=['Scores'], ascending=True)

Unnamed: 0,Algorithm,Scores
8,LogisticRegression,0.692849
7,SVC,0.950129
0,LGBMClassifier,1.0
1,XGBClassifier,1.0
2,GradientBoostingClassifier,1.0
3,RandomForestClassifier,1.0
4,DecisionTreeClassifier,1.0
5,MLPClassifier,1.0
6,KNeighborsClassifier,1.0
9,CatBoostClassifier,1.0


In [17]:
# Train a default-configured LightGBM classifier to use for the predictions below.
lgbm = LGBMClassifier()
lgbm = lgbm.fit(X_train, y_train)

In [18]:
# Hold-out accuracy of the LightGBM model fitted above.
accuracy_score(y_true=y_test, y_pred=lgbm.predict(X_test))

1.0

In [19]:
# Re-attach the original string labels saved before encoding, so each integer
# code in `category` can be read next to its name in `_category`.
df['_category'] = _category

In [20]:
# Display the frame with the integer-encoded `category` alongside the original
# label in `_category`.
df

Unnamed: 0,age,category,price,gender_Male,_category
0,28,1,300.08,False,Clothing
1,21,4,600.17,True,Shoes
2,20,1,300.08,True,Clothing
3,66,4,600.17,False,Shoes
4,53,0,15.15,False,Books
...,...,...,...,...,...
99451,27,3,5.23,True,Food & Beverage
99452,23,3,5.23,False,Food & Beverage
99453,63,3,5.23,True,Food & Beverage
99454,56,6,1050.00,True,Technology


In [22]:
# Keep only the first occurrence of each distinct row (all columns compared).
df.loc[~df.duplicated(keep='first')]

Unnamed: 0,age,category,price,gender_Male,_category
0,28,1,300.08,False,Clothing
1,21,4,600.17,True,Shoes
2,20,1,300.08,True,Clothing
3,66,4,600.17,False,Shoes
4,53,0,15.15,False,Books
...,...,...,...,...,...
10295,56,0,15.15,False,Books
10396,18,6,1050.00,False,Technology
11525,55,5,11.73,True,Souvenir
15840,62,0,15.15,False,Books


In [23]:
# Hand-made customers to classify. The values are given by column name rather
# than as anonymous positional lists, so it is clear which number is the age
# and which is the price.
new_customers = pd.DataFrame({
    'age': [35, 50, 15],
    'price': [80, 1000, 500],
    'gender_Male': [False, True, False],
})

# Align to the exact column order the model was fitted on; this raises a
# KeyError if a feature name is missing, instead of silently feeding the model
# values in the wrong positions.
new_customers = new_customers[list(X_train.columns)]

# The result holds the integer category codes produced by the label encoding,
# not the category names.
lgbm.predict(new_customers)

array([2, 6, 4])