In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from pandas_profiling import ProfileReport
from sklearn.metrics import roc_auc_score

## Kaggle Categorical Encoding Challenge

https://www.kaggle.com/c/cat-in-the-dat-ii/overview

In [420]:
data = pd.read_csv('./train.csv')

In [421]:
data.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [146]:
ProfileReport(data)

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=40.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…






## Drop NULLs

In [422]:
data = data.dropna()

## Some Cyclical Feature Engineering

In [423]:
data['sine_day'] = np.sin(2*np.pi*data['day']/30)
data['cosine_day'] = np.cos(2*np.pi*data['day']/30)

data['sine_month'] = np.sin(2*np.pi*data['month']/12)
data['cosine_month'] = np.cos(2*np.pi*data['month']/12)
data.drop(['day','month'],axis = 1,inplace=True)

## Convert all binary categoricals to 0/1

In [424]:
data.bin_4 = data.bin_4.replace('N',0)
data.bin_4 = data.bin_4.replace('Y',1)

In [425]:
data.bin_3 = data.bin_3.replace('F',0)
data.bin_3 = data.bin_3.replace('T',1)

## Check Cardinality

In [426]:
for col in data.columns:
    print(col + ' - Cardinality => ', data[col].nunique())

id - Cardinality =>  298042
bin_0 - Cardinality =>  2
bin_1 - Cardinality =>  2
bin_2 - Cardinality =>  2
bin_3 - Cardinality =>  2
bin_4 - Cardinality =>  2
nom_0 - Cardinality =>  3
nom_1 - Cardinality =>  6
nom_2 - Cardinality =>  6
nom_3 - Cardinality =>  6
nom_4 - Cardinality =>  4
nom_5 - Cardinality =>  1219
nom_6 - Cardinality =>  1519
nom_7 - Cardinality =>  222
nom_8 - Cardinality =>  222
nom_9 - Cardinality =>  2217
ord_0 - Cardinality =>  3
ord_1 - Cardinality =>  5
ord_2 - Cardinality =>  6
ord_3 - Cardinality =>  15
ord_4 - Cardinality =>  26
ord_5 - Cardinality =>  190
target - Cardinality =>  2
sine_day - Cardinality =>  7
cosine_day - Cardinality =>  7
sine_month - Cardinality =>  11
cosine_month - Cardinality =>  12


## Before any analysis, split the dataset

In [509]:
X = data.drop(['id','target'], axis = 1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((208629, 25), (89413, 25))

## Category Encoders Package

* https://contrib.scikit-learn.org/category_encoders/

## Nominal Variables

In [510]:
import category_encoders as ce

encoder1 = ce.JamesSteinEncoder(cols = ['nom_5','nom_6','nom_7','nom_8','nom_9'])
encoder2 = ce.OneHotEncoder(cols = ['nom_0','nom_1','nom_2','nom_3','nom_4'])


encoder1.fit(X_train, y_train)
encoder2.fit(X_train, y_train)

OneHotEncoder(cols=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'])

In [511]:
X_train = encoder1.transform(X_train)

In [512]:
X_train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,sine_day,cosine_day,sine_month,cosine_month
169911,0.0,0.0,0.0,0,0,Blue,Circle,Hamster,India,Bassoon,0.21682,0.189344,0.18327,0.157842,0.173131,1.0,Novice,Warm,k,I,nn,0.207912,0.978148,0.5,0.8660254
220245,0.0,1.0,0.0,0,1,Blue,Circle,Dog,Finland,Oboe,0.219394,0.16341,0.182306,0.166787,0.122966,3.0,Grandmaster,Lava Hot,c,S,sY,0.994522,0.104528,-2.449294e-16,1.0
39669,0.0,0.0,0.0,0,0,Blue,Triangle,Cat,India,Theremin,0.205821,0.188726,0.215943,0.182125,0.210201,1.0,Expert,Freezing,a,O,al,0.866025,0.5,1.0,6.123234000000001e-17
120082,0.0,0.0,1.0,0,0,Blue,Trapezoid,Hamster,Finland,Bassoon,0.200808,0.174956,0.197951,0.174305,0.18583,1.0,Grandmaster,Warm,m,Y,Ay,0.207912,0.978148,1.224647e-16,-1.0
230696,0.0,0.0,0.0,0,0,Blue,Square,Axolotl,Russia,Theremin,0.204595,0.166245,0.173092,0.156943,0.166829,3.0,Grandmaster,Cold,o,U,aA,0.406737,0.913545,1.224647e-16,-1.0


In [513]:
X_train = encoder2.transform(X_train)

In [514]:
X_train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_1,nom_0_2,nom_0_3,nom_1_1,nom_1_2,nom_1_3,nom_1_4,nom_1_5,nom_1_6,nom_2_1,nom_2_2,nom_2_3,nom_2_4,nom_2_5,nom_2_6,nom_3_1,nom_3_2,nom_3_3,nom_3_4,nom_3_5,nom_3_6,nom_4_1,nom_4_2,nom_4_3,nom_4_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,sine_day,cosine_day,sine_month,cosine_month
169911,0.0,0.0,0.0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.21682,0.189344,0.18327,0.157842,0.173131,1.0,Novice,Warm,k,I,nn,0.207912,0.978148,0.5,0.8660254
220245,0.0,1.0,0.0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0.219394,0.16341,0.182306,0.166787,0.122966,3.0,Grandmaster,Lava Hot,c,S,sY,0.994522,0.104528,-2.449294e-16,1.0
39669,0.0,0.0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0.205821,0.188726,0.215943,0.182125,0.210201,1.0,Expert,Freezing,a,O,al,0.866025,0.5,1.0,6.123234000000001e-17
120082,0.0,0.0,1.0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.200808,0.174956,0.197951,0.174305,0.18583,1.0,Grandmaster,Warm,m,Y,Ay,0.207912,0.978148,1.224647e-16,-1.0
230696,0.0,0.0,0.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0.204595,0.166245,0.173092,0.156943,0.166829,3.0,Grandmaster,Cold,o,U,aA,0.406737,0.913545,1.224647e-16,-1.0


## Propagate to Test Set

In [515]:
X_test = encoder1.transform(X_test)
X_test = encoder2.transform(X_test)

In [516]:
X_test.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_1,nom_0_2,nom_0_3,nom_1_1,nom_1_2,nom_1_3,nom_1_4,nom_1_5,nom_1_6,nom_2_1,nom_2_2,nom_2_3,nom_2_4,nom_2_5,nom_2_6,nom_3_1,nom_3_2,nom_3_3,nom_3_4,nom_3_5,nom_3_6,nom_4_1,nom_4_2,nom_4_3,nom_4_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,sine_day,cosine_day,sine_month,cosine_month
330026,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0.182781,0.18143,0.198754,0.16016,0.175483,1.0,Grandmaster,Warm,n,H,cR,0.207912,0.978148,1.224647e-16,-1.0
125941,0.0,0.0,0.0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.188379,0.187901,0.211486,0.165499,0.145652,3.0,Contributor,Boiling Hot,d,R,us,0.207912,0.978148,1.224647e-16,-1.0
330547,0.0,0.0,0.0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0.201567,0.163378,0.203563,0.176523,0.139131,1.0,Grandmaster,Hot,k,M,Fl,0.587785,0.809017,0.8660254,0.5
8260,0.0,0.0,0.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0.211629,0.206237,0.204295,0.172126,0.215238,2.0,Novice,Cold,n,Z,Ro,0.207912,0.978148,-0.5,0.866025
260232,0.0,0.0,0.0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0.188582,0.189904,0.187112,0.191347,0.18367,3.0,Contributor,Freezing,b,B,Pw,0.994522,0.104528,-0.8660254,-0.5


## Ordinal Variables

We will use Label Encoding here first, as a benchmark

In [517]:
encoder3 = ce.OrdinalEncoder(cols=['ord_1','ord_2','ord_3','ord_4','ord_5'])
encoder3.fit(X_train, y_train)
print('Ordinal encoder fit complete')

  elif pd.api.types.is_categorical(cols):


Ordinal encoder fit complete


In [518]:
X_train = encoder3.transform(X_train)

In [519]:
X_train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_1,nom_0_2,nom_0_3,nom_1_1,nom_1_2,nom_1_3,nom_1_4,nom_1_5,nom_1_6,nom_2_1,nom_2_2,nom_2_3,nom_2_4,nom_2_5,nom_2_6,nom_3_1,nom_3_2,nom_3_3,nom_3_4,nom_3_5,nom_3_6,nom_4_1,nom_4_2,nom_4_3,nom_4_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,sine_day,cosine_day,sine_month,cosine_month
169911,0.0,0.0,0.0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.21682,0.189344,0.18327,0.157842,0.173131,1.0,1,1,1,1,1,0.207912,0.978148,0.5,0.8660254
220245,0.0,1.0,0.0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0.219394,0.16341,0.182306,0.166787,0.122966,3.0,2,2,2,2,2,0.994522,0.104528,-2.449294e-16,1.0
39669,0.0,0.0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0.205821,0.188726,0.215943,0.182125,0.210201,1.0,3,3,3,3,3,0.866025,0.5,1.0,6.123234000000001e-17
120082,0.0,0.0,1.0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.200808,0.174956,0.197951,0.174305,0.18583,1.0,2,1,4,4,4,0.207912,0.978148,1.224647e-16,-1.0
230696,0.0,0.0,0.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0.204595,0.166245,0.173092,0.156943,0.166829,3.0,2,4,5,5,5,0.406737,0.913545,1.224647e-16,-1.0


## Propagate to Test Set

In [521]:
X_test = encoder3.transform(X_test)

In [522]:
X_test.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_1,nom_0_2,nom_0_3,nom_1_1,nom_1_2,nom_1_3,nom_1_4,nom_1_5,nom_1_6,nom_2_1,nom_2_2,nom_2_3,nom_2_4,nom_2_5,nom_2_6,nom_3_1,nom_3_2,nom_3_3,nom_3_4,nom_3_5,nom_3_6,nom_4_1,nom_4_2,nom_4_3,nom_4_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,sine_day,cosine_day,sine_month,cosine_month
330026,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0.182781,0.18143,0.198754,0.16016,0.175483,1.0,2,1,6,10,116,0.207912,0.978148,1.224647e-16,-1.0
125941,0.0,0.0,0.0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.188379,0.187901,0.211486,0.165499,0.145652,3.0,5,5,11,17,67,0.207912,0.978148,1.224647e-16,-1.0
330547,0.0,0.0,0.0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0.201567,0.163378,0.203563,0.176523,0.139131,1.0,2,6,1,6,29,0.587785,0.809017,0.8660254,0.5
8260,0.0,0.0,0.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0.211629,0.206237,0.204295,0.172126,0.215238,2.0,1,4,6,24,131,0.207912,0.978148,-0.5,0.866025
260232,0.0,0.0,0.0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0.188582,0.189904,0.187112,0.191347,0.18367,3.0,5,3,7,12,45,0.994522,0.104528,-0.8660254,-0.5


## First Baseline Model

Primary Performance Metric used - MCC

https://towardsdatascience.com/the-best-classification-metric-youve-never-heard-of-the-matthews-correlation-coefficient-3bf50a2f3e9a

In [523]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

In [524]:
# Logistic Regression
lr_model = Pipeline([
                     ("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))])

# Decision Tree
dt_model = Pipeline([
                     ("model", DecisionTreeClassifier(class_weight="balanced"))])

# Random Forest
rf_model = Pipeline([ 
                     ("model", RandomForestClassifier(class_weight="balanced", n_estimators=100, n_jobs=-1))])

xgb_model = Pipeline([
                      # Add a scale_pos_weight to make it balanced
                      ("model", XGBClassifier(scale_pos_weight=(1 - y.mean()), n_jobs=-1))])

In [525]:
from sklearn.model_selection import GridSearchCV

In [527]:
gs = GridSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]}, 
                  n_jobs=-1, cv=5, scoring="f1")

gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('model',
                                        RandomForestClassifier(class_weight='balanced',
                                                               n_jobs=-1))]),
             n_jobs=-1,
             param_grid={'model__max_depth': [10, 15],
                         'model__min_samples_split': [5, 10]},
             scoring='f1')

In [528]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 10, 'model__min_samples_split': 5}
0.4575335930268777


In [529]:
rf_model.set_params(**gs.best_params_)


Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=10,
                                        min_samples_split=5, n_jobs=-1))])

In [530]:
rf_model.fit(X_train, y_train)

Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=10,
                                        min_samples_split=5, n_jobs=-1))])

In [531]:
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report, matthews_corrcoef

y_pred = rf_model.predict(X_test)

accuracy_score(y_test, y_pred)

0.6917897844832407

In [534]:
y_prob = rf_model.predict_proba(X_test)

print('AUC - ',roc_auc_score(y_test,y_prob[:,1]))

print(classification_report(y_test,y_pred))

AUC -  0.7180441183065099
              precision    recall  f1-score   support

           0       0.88      0.71      0.79     72616
           1       0.33      0.60      0.42     16797

    accuracy                           0.69     89413
   macro avg       0.60      0.65      0.61     89413
weighted avg       0.78      0.69      0.72     89413



In [535]:
matthews_corrcoef(y_test,y_pred)

0.2542633086994036

##  Approach 2 

#### Trying CountEncoder for nominal variables

#### Using ASCII to establish mapping for ord_3, ord_4 and ord_5.

#### For ord_1 and ord_2, create an intuitive mapping

In [607]:
X = data.drop(['id','target'], axis = 1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((208629, 25), (89413, 25))

In [608]:
encoder1 = ce.HashingEncoder(n_components = 100,cols = ['nom_5','nom_6','nom_7','nom_8','nom_9'])

encoder2 = ce.OneHotEncoder(cols = ['nom_0','nom_1','nom_2','nom_3','nom_4'])


encoder1.fit(X_train, y_train)
X_train = encoder1.transform(X_train)

encoder2.fit(X_train, y_train)

X_train = encoder2.transform(X_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [609]:
X_train.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_1,nom_0_2,nom_0_3,nom_1_1,nom_1_2,nom_1_3,nom_1_4,nom_1_5,nom_1_6,nom_2_1,nom_2_2,nom_2_3,nom_2_4,nom_2_5,nom_2_6,nom_3_1,nom_3_2,nom_3_3,nom_3_4,nom_3_5,nom_3_6,nom_4_1,nom_4_2,nom_4_3,nom_4_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,sine_day,cosine_day,sine_month,cosine_month
169911,0.0,0.0,0.0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.391001,0.038112,-0.058348,-0.389065,-0.166795,1.0,Novice,Warm,k,I,nn,0.207912,0.978148,0.5,0.8660254
220245,0.0,1.0,0.0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0.420079,-0.294737,-0.070882,-0.272847,-0.734103,3.0,Grandmaster,Lava Hot,c,S,sY,0.994522,0.104528,-2.449294e-16,1.0
39669,0.0,0.0,0.0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0.250099,0.030614,0.364509,-0.073063,0.313737,1.0,Expert,Freezing,a,O,al,0.866025,0.5,1.0,6.123234000000001e-17
120082,0.0,0.0,1.0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0.178876,-0.146317,0.131763,-0.173932,0.052134,1.0,Grandmaster,Warm,m,Y,Ay,0.207912,0.978148,1.224647e-16,-1.0
230696,0.0,0.0,0.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0.22959,-0.236265,-0.19,-0.400389,-0.248099,3.0,Grandmaster,Cold,o,U,aA,0.406737,0.913545,1.224647e-16,-1.0


In [610]:
X_test = encoder1.transform(X_test)
X_test = encoder2.transform(X_test)

In [611]:
X_test.head()

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0_1,nom_0_2,nom_0_3,nom_1_1,nom_1_2,nom_1_3,nom_1_4,nom_1_5,nom_1_6,nom_2_1,nom_2_2,nom_2_3,nom_2_4,nom_2_5,nom_2_6,nom_3_1,nom_3_2,nom_3_3,nom_3_4,nom_3_5,nom_3_6,nom_4_1,nom_4_2,nom_4_3,nom_4_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,sine_day,cosine_day,sine_month,cosine_month
330026,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,-0.036502,-0.067067,0.141726,-0.359142,-0.128772,1.0,Grandmaster,Warm,n,H,cR,0.207912,0.978148,1.224647e-16,-1.0
125941,0.0,0.0,0.0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.025534,0.019303,0.308886,-0.288099,-0.512421,3.0,Contributor,Boiling Hot,d,R,us,0.207912,0.978148,1.224647e-16,-1.0
330547,0.0,0.0,0.0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0.188944,-0.296889,0.205081,-0.146317,-0.594014,1.0,Grandmaster,Hot,k,M,Fl,0.587785,0.809017,0.8660254,0.5
8260,0.0,0.0,0.0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0.319252,0.253283,0.218634,-0.201567,0.371335,2.0,Novice,Cold,n,Z,Ro,0.207912,0.978148,-0.5,0.866025
260232,0.0,0.0,0.0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0.060298,0.047839,-0.00509,0.047413,-0.029395,3.0,Contributor,Freezing,b,B,Pw,0.994522,0.104528,-0.8660254,-0.5


In [612]:
# Use ASCII code of char variables to 

X_train['ord_3'] = X_train['ord_3'].apply(ord)
X_train['ord_4'] = X_train['ord_4'].apply(ord)
X_train['ord_5'] = X_train['ord_5'].apply(lambda x : sum([ ord(i) for i in x ]))


In [613]:
# Use ASCII code of char variables to 

X_test['ord_3'] = X_test['ord_3'].apply(ord)
X_test['ord_4'] = X_test['ord_4'].apply(ord)
X_test['ord_5'] = X_test['ord_5'].apply(lambda x : sum([ ord(i) for i in x ]))


In [614]:
mapping =  [
        {'col': 'ord_1', 'mapping': {'Novice': 0, 'Contributor': 1, 'Expert': 2 , 'Master':3 , 'Grandmaster':4}},
        {'col': 'ord_2', 'mapping': {'Freezing': 0, 'Cold': 1, 'Warm': 2 , 'Hot':3 , 'Boiling Hot':4,
                                    'Lava Hot': 5 }},

    ]

In [615]:
encoder4 = ce.OrdinalEncoder(cols=['ord_1','ord_2'],mapping = mapping)

encoder4.fit(X_train)

  elif pd.api.types.is_categorical(cols):


OrdinalEncoder(cols=['ord_1', 'ord_2'],
               mapping=[{'col': 'ord_1',
                         'mapping': {'Contributor': 1, 'Expert': 2,
                                     'Grandmaster': 4, 'Master': 3,
                                     'Novice': 0}},
                        {'col': 'ord_2',
                         'mapping': {'Boiling Hot': 4, 'Cold': 1, 'Freezing': 0,
                                     'Hot': 3, 'Lava Hot': 5, 'Warm': 2}}])

In [616]:
X_train = encoder4.transform(X_train)

In [617]:
X_test = encoder4.transform(X_test)

## Random Forest

In [591]:
gs = GridSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]}, 
                  n_jobs=-1, cv=5, scoring="f1")

gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('model',
                                        RandomForestClassifier(class_weight='balanced',
                                                               max_depth=10,
                                                               min_samples_split=5,
                                                               n_jobs=-1))]),
             n_jobs=-1,
             param_grid={'model__max_depth': [10, 15],
                         'model__min_samples_split': [5, 10]},
             scoring='f1')

In [592]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 10, 'model__min_samples_split': 5}
0.4416808607736403


In [593]:
rf_model.set_params(**gs.best_params_)
rf_model.fit(X_train,y_train)

Pipeline(steps=[('model',
                 RandomForestClassifier(class_weight='balanced', max_depth=10,
                                        min_samples_split=5, n_jobs=-1))])

In [601]:
y_prob = rf_model.predict_proba(X_test)

print('AUC - ',roc_auc_score(y_test,y_prob[:,1]))

print(classification_report(y_test,y_pred))

AUC -  0.74000892370566
              precision    recall  f1-score   support

           0       0.90      0.68      0.78     72616
           1       0.33      0.68      0.45     16797

    accuracy                           0.68     89413
   macro avg       0.62      0.68      0.61     89413
weighted avg       0.80      0.68      0.71     89413



In [594]:
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report, matthews_corrcoef

y_pred = rf_model.predict(X_test)

matthews_corrcoef(y_test, y_pred)

0.2790977211370524

## Logistic Regression

In [618]:
gs = GridSearchCV(lr_model, {"model__C": [1, 1.3, 1.5]}, n_jobs=-1, cv=5, scoring="f1")
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('model',
                                        LogisticRegression(C=1,
                                                           class_weight='balanced',
                                                           random_state=42,
                                                           solver='liblinear'))]),
             n_jobs=-1, param_grid={'model__C': [1, 1.3, 1.5]}, scoring='f1')

In [605]:
print(gs.best_params_)
print(gs.best_score_)

{'model__C': 1.5}
0.4462824444572805


In [598]:
lr_model.set_params(**gs.best_params_)
lr_model.fit(X_train,y_train)

Pipeline(steps=[('model',
                 LogisticRegression(C=1, class_weight='balanced',
                                    random_state=42, solver='liblinear'))])

In [599]:
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report, matthews_corrcoef

y_pred = lr_model.predict(X_test)

matthews_corrcoef(y_test, y_pred)

0.29090253324634413

## Weight of Evidence with Logistic Regression

In [625]:
X = data.drop(['id','target'], axis = 1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0)

X_train.shape, X_test.shape

((208629, 25), (89413, 25))

In [626]:
encoder1 = ce.WOEEncoder(cols = ['nom_0','nom_1','nom_2','nom_3','nom_4','nom_5','nom_6','nom_7','nom_8','nom_9'])

encoder1.fit(X_train, y_train)
X_train = encoder1.transform(X_train)


  elif pd.api.types.is_categorical(cols):


In [627]:
X_test = encoder1.transform(X_test)


In [628]:
# Use ASCII code of char variables to 

X_train['ord_3'] = X_train['ord_3'].apply(ord)
X_train['ord_4'] = X_train['ord_4'].apply(ord)
X_train['ord_5'] = X_train['ord_5'].apply(lambda x : sum([ ord(i) for i in x ]))


In [629]:
# Use ASCII code of char variables to 

X_test['ord_3'] = X_test['ord_3'].apply(ord)
X_test['ord_4'] = X_test['ord_4'].apply(ord)
X_test['ord_5'] = X_test['ord_5'].apply(lambda x : sum([ ord(i) for i in x ]))


In [630]:
mapping =  [
        {'col': 'ord_1', 'mapping': {'Novice': 0, 'Contributor': 1, 'Expert': 2 , 'Master':3 , 'Grandmaster':4}},
        {'col': 'ord_2', 'mapping': {'Freezing': 0, 'Cold': 1, 'Warm': 2 , 'Hot':3 , 'Boiling Hot':4,
                                    'Lava Hot': 5 }},

    ]

In [631]:
encoder4 = ce.OrdinalEncoder(cols=['ord_1','ord_2'],mapping = mapping)

encoder4.fit(X_train)

  elif pd.api.types.is_categorical(cols):


OrdinalEncoder(cols=['ord_1', 'ord_2'],
               mapping=[{'col': 'ord_1',
                         'mapping': {'Contributor': 1, 'Expert': 2,
                                     'Grandmaster': 4, 'Master': 3,
                                     'Novice': 0}},
                        {'col': 'ord_2',
                         'mapping': {'Boiling Hot': 4, 'Cold': 1, 'Freezing': 0,
                                     'Hot': 3, 'Lava Hot': 5, 'Warm': 2}}])

In [632]:
X_train = encoder4.transform(X_train)

In [633]:
X_test = encoder4.transform(X_test)

In [634]:
gs = GridSearchCV(lr_model, {"model__C": [1, 1.3, 1.5]}, n_jobs=-1, cv=5, scoring="f1")
gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('model',
                                        LogisticRegression(C=1,
                                                           class_weight='balanced',
                                                           random_state=42,
                                                           solver='liblinear'))]),
             n_jobs=-1, param_grid={'model__C': [1, 1.3, 1.5]}, scoring='f1')

In [635]:
lr_model.set_params(**gs.best_params_)
lr_model.fit(X_train,y_train)

Pipeline(steps=[('model',
                 LogisticRegression(C=1.3, class_weight='balanced',
                                    random_state=42, solver='liblinear'))])

In [636]:
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report, matthews_corrcoef

y_pred = lr_model.predict(X_test)

matthews_corrcoef(y_test, y_pred)

0.3218613625293704

## Bottom Line

* There is a lot to try, and what could work/not work highly depends on your dataset, what I discussed were just some guidelines, on what one could try in different situations.

* Always split the dataset first, and then propagate what you learnt from the training data to the test set.

* One hot encoding is good, if your cardinality is limited, and it also maintains intepretability, keep it simple to the point you can.

* Cross Validation is important.

* A metric such as MCC is also important, AUC can also be used, Accuracy should not be trusted most of the times(due to imbalance in real-world scenarios), MCC Considers all the entries equally well in the confusion matrix.