In [1]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

import warnings
warnings.simplefilter("ignore")

In [2]:
# Set a seed value
seed_value = 42
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

In [3]:
DATASET_PATH = os.getenv("DATASET_PATH")

sample = pd.read_csv(DATASET_PATH)
sample.shape

(38000, 15)

In [4]:
numerical_columns = sample.select_dtypes("number").columns.tolist()[2:]

In [5]:
X = sample[numerical_columns]
y = sample["category"]

## Modeling

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed_value, test_size=.2)

In [7]:
cv = StratifiedKFold(n_splits=5)

In [23]:
def search_pipeline(pipeline, search_space, cv, random_state, X, y):
    # create a randomsearch for pipeline, fits and returns the best model
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=search_space,
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=random_state,
        scoring="roc_auc"
    )

    search.fit(X_train, y_train)

    best_pipe = search.best_estimator_
    print(best_pipe)
    
    return best_pipe

### Logistic Regression

In [36]:
%%time
# Create a pipeline
pipe_log = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(random_state=seed_value))
    ]
)

search_space_log = [
    {
        "classifier__penalty": ['l2', 'l1'],
        "classifier__C": np.logspace(0, 4, 10),
        "classifier__solver":['newton-cg', 'saga', 'liblinear']
    }
]

best_pipe_log = search_pipeline(pipe_log, search_space_log, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(random_state=42, solver='newton-cg'))])
CPU times: user 11.5 s, sys: 12.8 s, total: 24.3 s
Wall time: 16.8 s


### Decision Tree

In [25]:
%%time
# Create a pipeline
pipe_tree = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("classifier", DecisionTreeClassifier(random_state=seed_value))
    ]
)

search_space_tree = [
    {
        "classifier__criterion": ['gini', 'entropy'],
        "classifier__max_depth": [2, 4, 6, 8, 10, 12]
    }
]

best_pipe_tree = search_pipeline(pipe_tree, search_space_tree, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('classifier',
                 DecisionTreeClassifier(criterion='entropy', max_depth=10,
                                        random_state=42))])
CPU times: user 581 ms, sys: 120 ms, total: 701 ms
Wall time: 2.25 s


### Random Forest

In [26]:
%%time
# Create a pipeline
pipe_rf = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("classifier", RandomForestClassifier(random_state=seed_value))
    ]
)

search_space_rf = [
    {
        "classifier__n_estimators": [10, 100, 1000],
        "classifier__max_depth":[5, 8, 15, 25, 30, None],
        "classifier__min_samples_leaf":[1, 2, 5, 10, 15, 100],
        "classifier__max_leaf_nodes": [2, 5, 10]
    }
]

best_pipe_rf = search_pipeline(pipe_rf, search_space_rf, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('classifier',
                 RandomForestClassifier(max_depth=15, max_leaf_nodes=5,
                                        min_samples_leaf=5, n_estimators=10,
                                        random_state=42))])
CPU times: user 503 ms, sys: 91.9 ms, total: 595 ms
Wall time: 28.2 s


### K-Nearest Neighbors

In [27]:
%%time
# Create a pipeline
pipe_knn = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=6))
    ]
)

search_space_knn = [
    {
        'classifier__n_neighbors': [3, 7, 11],
        'classifier__weights': ['uniform', 'distance']
    }
]

best_pipe_knn = search_pipeline(pipe_knn, search_space_knn, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier', KNeighborsClassifier(n_neighbors=3))])
CPU times: user 353 ms, sys: 99.7 ms, total: 453 ms
Wall time: 1.38 s


### Multilayer Perceptron

In [28]:
%%time
# Create a pipeline
pipe_mlp = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", MLPClassifier(max_iter=100, random_state=seed_value))
    ]
)

search_space_mlp = [
    {
        'classifier__hidden_layer_sizes': [(10, 30, 10),(20,)],
        'classifier__activation': ['tanh', 'relu'],
        'classifier__solver': ['sgd', 'adam'],
        'classifier__alpha': [0.0001, 0.05],
        'classifier__learning_rate': ['constant', 'adaptive']
    }
]

best_pipe_mlp = search_pipeline(pipe_mlp, search_space_mlp, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
                ('classifier',
                 MLPClassifier(alpha=0.05, hidden_layer_sizes=(20,),
                               max_iter=100, random_state=42))])
CPU times: user 7.16 s, sys: 108 ms, total: 7.27 s
Wall time: 1min 51s


### LightGBM

In [31]:
%%time
# Create a pipeline
pipe_lgb = Pipeline(
    [
        ("classifier", lgb.LGBMClassifier(n_jobs=-1, random_state=seed_value))
    ]
)

search_space_lgb = [
    {
        'classifier__max_depth'        : [1, 2, 3, 4, 5, 6, 7],
        'classifier__gamma'            : [0, 0.5, 1],
        'classifier__learning_rate'    : [0.1, 0.01, 0.001],
        'classifier__subsample'        : [0.2, 0.4, 0.5, 0.6, 0.7],
        'classifier__reg_alpha'        : [0, 0.5, 1],
        'classifier__reg_lambda'       : [1, 1.5, 2, 3, 4.5]
    }
]

best_pipe_lgb = search_pipeline(pipe_lgb, search_space_lgb, cv, seed_value, X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Pipeline(steps=[('classifier',
                 LGBMClassifier(gamma=0, learning_rate=0.01, max_depth=5,
                                random_state=42, reg_alpha=0.5, reg_lambda=2,
                                subsample=0.2))])
CPU times: user 9.19 s, sys: 199 ms, total: 9.39 s
Wall time: 18.5 s


In [32]:
def model_evaluation_folds(pipe, cv):
    print("=" * 100)
    model_name = pipe["classifier"].__class__.__name__
    print(f"{model_name} evaluation\n")
    i = 1
    for train_ix, val_ix in cv.split(X_train, y_train):
        
        # select rows
        train_X, val_X = X_train.iloc[train_ix, :], X_train.iloc[val_ix, :]
        train_y, val_y = y_train.iloc[train_ix], y_train.iloc[val_ix]
        
        y_train_pred = pipe.predict(train_X)
        y_train_proba = pipe.predict_proba(train_X)
        
        y_val_pred = pipe.predict(val_X)
        y_val_proba = pipe.predict_proba(val_X)
        
        train_auc = roc_auc_score(train_y, y_train_proba, multi_class="ovr")
        val_auc = roc_auc_score(val_y, y_val_proba, multi_class="ovr")
        
        train_precision = precision_score(train_y, y_train_pred, average='weighted')
        val_precision = precision_score(val_y, y_val_pred, average='weighted')
        
        train_recall = recall_score(train_y, y_train_pred, average='weighted')
        val_recall = recall_score(val_y, y_val_pred, average='weighted')
        
        train_f1 = f1_score(train_y, y_train_pred, average="weighted")
        val_f1 = f1_score(val_y, y_val_pred, average="weighted")
        
        print("FOLD", i)
        print(f"\t - Train Accuracy: {accuracy_score(train_y, y_train_pred)} ; Validation Accuracy {accuracy_score(val_y, y_val_pred)}")
        print(f"\t - Train ROC AUC: {train_auc} ; Validation ROC AUC {val_auc}")
        print(f"\t - Train Precision-Score: {train_precision} ; Validation Precision-Score {val_precision}")
        print(f"\t - Train Recall-Score: {train_recall} ; Validation Recall-Score {val_recall}")
        print(f"\t - Train F1-Score: {train_f1} ; Validation F1-Score {val_f1}")
        
        i += 1

In [37]:
for model in [best_pipe_log, best_pipe_tree, best_pipe_rf, best_pipe_knn, best_pipe_mlp, best_pipe_lgb]:
    model_evaluation_folds(model, cv)

LogisticRegression evaluation

FOLD 1
	 - Train Accuracy: 0.5999588815789474 ; Validation Accuracy 0.5988486842105263
	 - Train ROC AUC: 0.7690675025937189 ; Validation ROC AUC 0.7695119534906971
	 - Train Precision-Score: 0.5728078893639454 ; Validation Precision-Score 0.5631096836368839
	 - Train Recall-Score: 0.5999588815789474 ; Validation Recall-Score 0.5988486842105263
	 - Train F1-Score: 0.5271827858680367 ; Validation F1-Score 0.5249586295644985
FOLD 2
	 - Train Accuracy: 0.6007401315789473 ; Validation Accuracy 0.5957236842105263
	 - Train ROC AUC: 0.7686341761807314 ; Validation ROC AUC 0.7713637243165214
	 - Train Precision-Score: 0.5647873833449684 ; Validation Precision-Score 0.5778439478125508
	 - Train Recall-Score: 0.6007401315789473 ; Validation Recall-Score 0.5957236842105263
	 - Train F1-Score: 0.5281905696290425 ; Validation F1-Score 0.520813932748198
FOLD 3
	 - Train Accuracy: 0.5987253289473684 ; Validation Accuracy 0.6037828947368421
	 - Train ROC AUC: 0.76908451

FOLD 5
	 - Train Accuracy: 0.7390625 ; Validation Accuracy 0.7379934210526315
	 - Train ROC AUC: 0.9537719459022562 ; Validation ROC AUC 0.9542209478589033
	 - Train Precision-Score: 0.7449762832611293 ; Validation Precision-Score 0.7443507126335953
	 - Train Recall-Score: 0.7390625 ; Validation Recall-Score 0.7379934210526315
	 - Train F1-Score: 0.7271643421448605 ; Validation F1-Score 0.7273916769052291
MLPClassifier evaluation

FOLD 1
	 - Train Accuracy: 0.6249588815789474 ; Validation Accuracy 0.6199013157894737
	 - Train ROC AUC: 0.8040032608539548 ; Validation ROC AUC 0.804599987351625
	 - Train Precision-Score: 0.6299355696591509 ; Validation Precision-Score 0.6186131525010128
	 - Train Recall-Score: 0.6249588815789474 ; Validation Recall-Score 0.6199013157894737
	 - Train F1-Score: 0.5809705747254766 ; Validation F1-Score 0.5764355048741563
FOLD 2
	 - Train Accuracy: 0.6242598684210526 ; Validation Accuracy 0.6226973684210526
	 - Train ROC AUC: 0.8051291256508496 ; Validation R

In [44]:
def evaluate_models(pipes):
    names = []
    train_acc = []
    test_acc = []
    train_roc = []
    test_roc = []
    train_precision = []
    test_precision = []
    train_recall = []
    test_recall = []
    train_f1 = []
    test_f1 = []
    for pipe in pipes:
        names.append(pipe["classifier"].__class__.__name__)
        train_acc.append(accuracy_score(y_train, pipe.predict(X_train)))
        test_acc.append(accuracy_score(y_test, pipe.predict(X_test)))
        train_roc.append(roc_auc_score(y_train, pipe.predict_proba(X_train), multi_class="ovo"))
        test_roc.append(roc_auc_score(y_test, pipe.predict_proba(X_test), multi_class="ovo"))
        train_precision.append(precision_score(y_train, pipe.predict(X_train), average='weighted'))
        test_precision.append(precision_score(y_test, pipe.predict(X_test), average='weighted'))
        train_recall.append(recall_score(y_train, pipe.predict(X_train), average='weighted'))
        test_recall.append(recall_score(y_test, pipe.predict(X_test), average='weighted'))
        train_f1.append(f1_score(y_train, pipe.predict(X_train), average='weighted'))
        test_f1.append(f1_score(y_test, pipe.predict(X_test), average='weighted'))
    
    return pd.DataFrame(
    {
        "model_name": names,
        "train_accuracy": train_acc,
        "test_accuracy": test_acc,
        "train_roc_auc": train_roc,
        "test_roc_auc": test_roc,
        "train_precision": train_precision,
        "test_precision": test_precision,
        "train_recall": train_recall,
        "test_recall": test_recall,
        "train_f1_score": train_f1,
        "test_f1_score": test_f1,
    }
).sort_values("test_roc_auc", ascending=False)

In [45]:
evaluate_models(pipes=[best_pipe_log, best_pipe_tree, best_pipe_rf, best_pipe_knn, best_pipe_mlp, best_pipe_lgb])

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_roc_auc,test_roc_auc,train_precision,test_precision,train_recall,test_recall,train_f1_score,test_f1_score
5,LGBMClassifier,0.648224,0.638421,0.806469,0.78808,0.641995,0.609126,0.648224,0.638421,0.599273,0.587963
1,DecisionTreeClassifier,0.681776,0.636184,0.85541,0.756832,0.670812,0.59857,0.681776,0.636184,0.653209,0.601525
4,MLPClassifier,0.623947,0.612763,0.758622,0.751123,0.62777,0.594475,0.623947,0.612763,0.580068,0.56792
0,LogisticRegression,0.599737,0.586711,0.735037,0.727704,0.569167,0.568354,0.599737,0.586711,0.526736,0.510897
2,RandomForestClassifier,0.604243,0.595921,0.728565,0.721768,0.538798,0.53262,0.604243,0.595921,0.525339,0.517669
3,KNeighborsClassifier,0.738849,0.566184,0.941436,0.64915,0.74483,0.545938,0.738849,0.566184,0.727215,0.551732


In [46]:
def calculate_feature_importance(pipe):
    x = pd.DataFrame({
        "features": X_train.columns.tolist(),
        "importances": pipe["classifier"].feature_importances_
    }).sort_values("importances", ascending=False)
    
    x["cum_sum"] = np.cumsum(x["importances"])
    
    return x

In [49]:
calculate_feature_importance(best_pipe_lgb)

Unnamed: 0,features,importances,cum_sum
2,price,4296,4296
3,weight,3875,8171
5,minimum_quantity,2354,10525
6,view_counts,2353,12878
7,order_counts,1977,14855
0,search_page,832,15687
4,express_delivery,615,16302
1,position,570,16872


In [47]:
def save_model(model, path):
    pd.to_pickle(model, path)

In [48]:
save_model(best_pipe_lgb, os.getenv("MODEL_PATH"))