In [1]:
import numpy as np
import pandas as pd
from lazypredict.Supervised import LazyClassifier

In [2]:
train_data = pd.read_csv(
    "e-commerce-shoppers-behaviour-understanding/train_data_v2.csv"
)

In [3]:
cat_features = [col for col in train_data.columns if train_data[col].dtype == object]
print("Total categorical features:", len(cat_features))
num_features = [
    col for col in train_data.columns if train_data[col].dtype not in (object, bool)
]
print("Total numerical features:", len(num_features))

Total categorical features: 6
Total numerical features: 15


In [4]:
test_data = pd.read_csv("e-commerce-shoppers-behaviour-understanding/test_data_v2.csv")

In [5]:
# from sklearn.preprocessing import OrdinalEncoder

# enc = OrdinalEncoder(categories=[[np.nan, "Not Specified", "Others", "Diploma", "Graduate"]])
# data = enc.fit_transform(df.Education.values.reshape(-1, 1))
# # df.Education.values.reshape(-1, 1)

In [6]:
feature = train_data.iloc[:, :-1]
label = train_data.iloc[:, -1]

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(feature, label, test_size=0.25)

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [9]:
num_pipeline = Pipeline([("imputer", KNNImputer()), ("scaler", StandardScaler())])

cat_pipeline = Pipeline(
    [
        ("encoder", OneHotEncoder()),
    ]
)

In [10]:
preprocessing_pipe = ColumnTransformer(
    [("num", num_pipeline, num_features), ("cat", cat_pipeline, cat_features)]
)

In [17]:
X_train_tr = preprocessing_pipe.fit_transform(X_train)
X_val_tr = preprocessing_pipe.fit_transform(X_val)
y_train_pre = LabelEncoder().fit_transform(y_train)
y_val_pre = LabelEncoder().fit_transform(y_val)

In [18]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

In [19]:
models, predictions = clf.fit(X_train_tr, X_val_tr, y_train_pre, y_val_pre)

100%|██████████| 29/29 [01:06<00:00,  2.29s/it]


In [20]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,0.67,0.6,0.6,0.64,0.64
BernoulliNB,0.63,0.58,0.58,0.62,0.04
LGBMClassifier,0.66,0.58,0.58,0.62,0.18
LogisticRegression,0.67,0.57,0.57,0.6,0.04
GaussianNB,0.64,0.57,0.57,0.61,0.03
CalibratedClassifierCV,0.67,0.57,0.57,0.6,6.39
LinearSVC,0.66,0.57,0.57,0.6,1.63
SGDClassifier,0.64,0.57,0.57,0.6,0.19
LinearDiscriminantAnalysis,0.66,0.57,0.57,0.59,0.09
NearestCentroid,0.58,0.57,0.57,0.59,0.06


In [77]:
from sklearn.linear_model import LogisticRegression

clf = Pipeline([("preprocessing", preprocessing_pipe), ("logit", LogisticRegression())])

In [131]:
from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'logit__penalty': ['l1', 'l2'],
#     'logit__C': [0.1, 1, 10],
#     'logit__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# }

param_grid = {
    "logit__C": [0.1, 1, 10, 100],
    "logit__penalty": ["l1", "l2"],
    "logit__solver": ["liblinear", "saga", "newton-cg"],
    "logit__class_weight": ["balanced", None],
    "logit__multi_class": ["auto", "ovr", "multinomial"],
}

grid_clf = GridSearchCV(
    clf,
    param_grid=param_grid,
    cv=5,
    #     refit=True,
    n_jobs=2,
    verbose=2,
    return_train_score=True,
)

In [132]:
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [133]:
grid_clf.best_params_

{'logit__C': 0.1,
 'logit__class_weight': None,
 'logit__multi_class': 'multinomial',
 'logit__penalty': 'l1',
 'logit__solver': 'saga'}

In [134]:
grid_clf.best_estimator_.score(X_val, y_val)

0.6611458050502308

In [135]:
grid_clf.best_estimator_

In [136]:
# clf = Pipeline(
#     [
#         ("preprocessing", preprocessing_pipe),
#         (
#             "logit",
#             LogisticRegression(C=1, class_weight=None, penalty="l1", solver="saga"),
#         ),
#     ]
# )
clf = Pipeline(
    [
        ("preprocessing", preprocessing_pipe),
        ("logit", LogisticRegression(solver="saga", C=0.1, penalty="l1", multi_class="multinomial")),
    ]
)

In [137]:
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25)

scores = cross_validate(clf, X_train, y_train, cv=cv)

In [138]:
np.mean(scores["test_score"])

0.656118754525706

In [143]:
clf.fit(feature, label)

In [144]:
clf.score(X_val, y_val)

0.6633179473255498

In [142]:
from sklearn.metrics import classification_report

print(classification_report(y_val, clf.predict(X_val)))

              precision    recall  f1-score   support

       False       0.65      0.96      0.78      2273
        True       0.73      0.18      0.29      1410

    accuracy                           0.66      3683
   macro avg       0.69      0.57      0.53      3683
weighted avg       0.68      0.66      0.59      3683



In [102]:
# cat_features_1 = [
#     col
#     for col in train_data.columns
#     if train_data[col].dtype == object and col != "Education"
# ]
# cat_features_2 = ["Education"]
# # print("Total categorical features:", len(cat_features))
# num_features = [
#     col for col in train_data.columns if train_data[col].dtype not in (object, bool)
# ]

In [104]:
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OrdinalEncoder

# cat_pipeline_edu = Pipeline(
#     [
#         (
#             "encoder_edu",
#             OrdinalEncoder(
#                 categories=[[np.nan, "Not Specified", "Others", "Diploma", "Graduate"]]
#             ),
#         ),
#         ("impute_edu", SimpleImputer(strategy="constant", fill_value=0)),
#     ]
# )

In [109]:
# from sklearn.compose import ColumnTransformer

# preprocessing_pipe2 = ColumnTransformer(
#     [
#         ("num", num_pipeline, num_features),
#         ("cat", cat_pipeline, cat_features_1),
#         ("cat_edu", cat_pipeline_edu, cat_features_2),
#     ]
# )

In [115]:
# clf_ord = Pipeline(
#     [
#         ("preprocessing", preprocessing_pipe2),
#         (
#             "logit",
#             LogisticRegression(C=1, class_weight=None, penalty="l1", solver="saga"),
#         ),
#     ]
# )

In [116]:
# cv = StratifiedShuffleSplit(n_splits=5, test_size=0.25)

# scores = cross_validate(clf_ord, X_train, y_train, cv=cv)

In [117]:
# np.mean(scores["test_score"])

0.6524257784214337