In [1]:
import time

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn import pipeline
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import impute
from sklearn import compose
from sklearn import metrics
from sklearn import set_config
import math

set_config(display='diagram')

In [2]:
df = pd.read_csv("data/Mall_Customers.csv")


In [3]:
# Change Gender to categorical using OrdinalEncoder
df.dtypes

CustomerID                 int64
Genre                     object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object

In [4]:
df.columns

Index(['CustomerID', 'Genre', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')

In [5]:
# Feature Generation : SpendingLabel (Our Target)
# Will add the label column based on Spending Score: If SS>50=1 else 0

df['SpendingLabel'] = df['Spending Score (1-100)'].apply(lambda row: 1 if row>50  else 0)

In [6]:
# Data Enhancement Function
#Based on Gender


def income_RNG_on_gender(df):
    #Copying to a dummy dataframe
    gen_data = df.copy() 
    
    #Localizing standard deviation based on gender
    for gender in gen_data['Genre'].unique():
        gen_std = gen_data[gen_data['Genre']==gender]
        income_std = gen_std ['Annual Income (k$)'].std()

        #Altering the data based on std
        for i in range (gen_data[gen_data['Genre']==gender].shape[0]):
            if np.random.randint(2)==1:
                gen_data['Annual Income (k$)'].values[i] += income_std/10
            else:
                gen_data['Annual Income (k$)'].values[i] -= income_std/10

    return gen_data



In [7]:
# Generate the Generated Data
generated_data = income_RNG_on_gender(df)

# Take 30% of the information
extra_sample = generated_data.sample(math.floor(generated_data.shape[0] * 30 / 100))

In [8]:
extra_sample

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100),SpendingLabel
9,10,Female,30,23,72,1
165,166,Female,36,85,75,1
17,18,Male,20,25,66,1
187,188,Male,28,101,68,1
121,122,Female,38,67,40,0
79,80,Female,49,53,42,0
14,15,Male,37,14,13,0
189,190,Female,36,103,85,1
5,6,Female,22,11,76,1
80,81,Male,57,53,51,1


In [9]:
df['Spending Score (1-100)']

0      39
1      81
2       6
3      77
4      40
       ..
195    79
196    28
197    74
198    18
199    83
Name: Spending Score (1-100), Length: 200, dtype: int64

In [10]:
cat_vars = ['Genre']

num_vars = ['Age', 'Annual Income (k$)']

In [24]:
df.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100),SpendingLabel
0,1,Male,19,15,39,0
1,2,Male,21,15,81,1
2,3,Female,20,16,6,0
3,4,Female,23,16,77,1
4,5,Female,31,17,40,0


In [11]:
num_preproc1 = pipeline.Pipeline(steps=[('imputer', impute.SimpleImputer(strategy='mean')),
                                        ('scaler', preprocessing.StandardScaler()),
                                        ('normalizer', preprocessing.QuantileTransformer(n_quantiles=100))])

cat_preproc1 = pipeline.Pipeline(steps=[('imputer', impute.SimpleImputer(strategy='constant', fill_value=-1)),
                                        ('encoder', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1))])

In [12]:
tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preproc1, num_vars),
    ('cat', cat_preproc1, cat_vars)
    ], remainder = 'drop')

tree_prepro

In [13]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier
# from sklearn.svm           import SVC

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":   ExtraTreesClassifier(n_estimators=100),
  "Random Forest": RandomForestClassifier(n_estimators=100),
  "AdaBoost":      AdaBoostClassifier(n_estimators=100),
  "Skl GBM":       GradientBoostingClassifier(n_estimators=100),
  "Skl HistGBM":   HistGradientBoostingClassifier(max_iter=100),
  "XGBoost":       XGBClassifier(n_estimators=100),
  "LightGBM":      LGBMClassifier(n_estimators=100),
  "CatBoost":      CatBoostClassifier(n_estimators=100),
#   "SVM":           SVC(kernel='linear')
}

In [14]:
df.sample(10)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100),SpendingLabel
109,110,Male,66,63,48,0
142,143,Female,28,76,40,0
159,160,Female,30,78,73,1
47,48,Female,27,40,47,0
28,29,Female,40,29,31,0
156,157,Male,37,78,1,0
173,174,Male,36,87,92,1
5,6,Female,22,17,76,1
36,37,Female,42,34,17,0
30,31,Male,60,30,4,0


In [15]:
X = df.drop(['SpendingLabel'],axis=1)
y = df['SpendingLabel']

In [16]:
tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}


X_train, x_test, Y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    stratify = y,   # ALWAYS RECOMMENDED FOR BETTER VALIDATION
    random_state=42  # Recommended for reproducibility
)

In [17]:
X_train.shape

(180, 5)

In [18]:
## ADDITION OF DATA ENHANCEMENT

## COMMENT THIS OUT IF YOU WANT TO ADD THE ENHANCED DATA

## Concatenate train dataset with extra_sample from generated data

X_train = pd.concat([X_train, extra_sample.drop(['SpendingLabel'],axis=1)])
Y_train = pd.concat([Y_train, extra_sample['SpendingLabel']])

In [19]:
# Second Train_Test_Split as an Alternative for One-Time Cross Validation

x_train, x_val, y_train, y_val = model_selection.train_test_split(
    X_train, Y_train,
    test_size=0.2,
    stratify = Y_train,   # ALWAYS RECOMMENDED FOR BETTER VALIDATION
    random_state=42  # Recommended for reproducibility
)

In [20]:
X_train.shape

(240, 5)

In [21]:
results = pd.DataFrame({'Model': [], 'Heart Att. Acc.': [], 'Healthy Acc.': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

for model_name, model in tree_classifiers.items():

    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_val)
    
    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Heart Att. Acc.": metrics.recall_score(y_val,pred)*100,
                              "Healthy Acc.": metrics.precision_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)

Learning rate set to 0.042063
0:	learn: 0.6879785	total: 48.2ms	remaining: 4.77s
1:	learn: 0.6821839	total: 48.8ms	remaining: 2.39s
2:	learn: 0.6774496	total: 58.2ms	remaining: 1.88s
3:	learn: 0.6726361	total: 58.7ms	remaining: 1.41s
4:	learn: 0.6664821	total: 59.2ms	remaining: 1.13s
5:	learn: 0.6605322	total: 59.8ms	remaining: 937ms
6:	learn: 0.6576819	total: 60.3ms	remaining: 801ms
7:	learn: 0.6530968	total: 62.1ms	remaining: 714ms
8:	learn: 0.6479368	total: 63.1ms	remaining: 638ms
9:	learn: 0.6439859	total: 63.8ms	remaining: 574ms
10:	learn: 0.6402890	total: 65ms	remaining: 526ms
11:	learn: 0.6355052	total: 66.3ms	remaining: 486ms
12:	learn: 0.6318228	total: 67.5ms	remaining: 452ms
13:	learn: 0.6281449	total: 68.8ms	remaining: 423ms
14:	learn: 0.6237945	total: 70.1ms	remaining: 397ms
15:	learn: 0.6208015	total: 87.6ms	remaining: 460ms
16:	learn: 0.6171829	total: 89.1ms	remaining: 435ms
17:	learn: 0.6145206	total: 90.3ms	remaining: 411ms
18:	learn: 0.6093397	total: 91.4ms	remaining: 

In [22]:
results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Unnamed: 0,Model,Heart Att. Acc.,Healthy Acc.,Accuracy,Bal Acc.,Time
1,Extra Trees,95.652174,75.862069,83.333333,83.826087,0.281653
2,Skl GBM,91.304348,75.0,81.25,81.652174,0.135492
3,XGBoost,86.956522,76.923077,81.25,81.478261,9.806614
4,Random Forest,86.956522,74.074074,79.166667,79.478261,0.291544
5,LightGBM,86.956522,74.074074,79.166667,79.478261,4.30599
6,Decision Tree,78.26087,75.0,77.083333,77.130435,0.022384
7,Skl HistGBM,86.956522,66.666667,72.916667,73.478261,60.524705
8,CatBoost,78.26087,66.666667,70.833333,71.130435,0.302696
9,AdaBoost,82.608696,61.290323,66.666667,67.304348,0.240287


In [23]:
best_model = tree_classifiers[results_ord.iloc[0].Model]
best_model.fit(X_train,Y_train)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ae2e0bca-572a-4bd6-aa62-1bb6656a18fd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>