# Imports

In [1]:
# !pip install catboost

In [2]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.metrics import precision_recall_fscore_support, plot_confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from catboost import Pool
from catboost.utils import get_roc_curve, select_threshold
from xgboost import XGBClassifier


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

# set a seed for reproducibility
RANDOM_STATE = 99

# Feature engineering

In [3]:
df = pd.read_csv('training_data.csv')

In [4]:
df.head()

Unnamed: 0,ID,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var189,var190,var191,var192,cat1,cat2,cat3,cat4,cat5,target
0,44686,86.52893,80.79771,75.25887,74.02016,69.01476,65.61648,63.23896,59.07834,56.80397,...,85.133333,84.45,85.2,85.9,S,H,C,B,C,0
1,44687,68.56225,72.05599,69.52573,68.79211,65.48515,63.00976,61.19186,57.85757,55.94791,...,90.533333,86.55,87.24,87.3,S,I,C,B,C,0
2,44688,77.88821,76.6227,73.11046,72.20956,68.26166,65.34046,63.19467,59.25676,57.01834,...,93.933333,90.2,89.84,88.6,S,I,C,B,C,0
3,44689,81.11949,78.43038,74.59578,73.63714,69.4554,66.35951,64.07976,59.88543,57.50303,...,93.2,88.15,88.48,87.766667,S,I,C,B,C,0
4,44690,62.18698,68.60618,67.86709,67.44987,65.15601,63.13671,61.52867,58.35072,56.4246,...,92.733333,88.15,88.0,88.566667,S,I,C,B,C,0


In [5]:
df.describe(include='all')

Unnamed: 0,ID,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var189,var190,var191,var192,cat1,cat2,cat3,cat4,cat5,target
count,14193.0,14193.0,14193.0,14193.0,14193.0,14193.0,14193.0,14193.0,14193.0,14193.0,...,14193.0,14193.0,14193.0,14193.0,14193,14193,14193,14193,14193,14193.0
unique,,,,,,,,,,,...,,,,,20,9,3,3,3,
top,,,,,,,,,,,...,,,,,M,H,B,B,B,
freq,,,,,,,,,,,...,,,,,716,1577,7091,12814,11367,
mean,60691.679067,49.770732,49.816368,49.842512,49.847619,49.866617,49.877695,49.883878,49.889778,49.889927,...,64.371634,64.739135,65.117889,65.51435,,,,,,0.009441
std,9537.84035,19.287088,13.258854,11.027864,10.617691,9.063626,8.01557,7.24892,5.79651,4.95848,...,51.892179,51.490296,51.202395,51.035665,,,,,,0.09671
min,44686.0,1.96209,6.80249,10.04946,10.79022,14.12184,16.95182,19.40626,25.20809,29.42134,...,1.666667,2.05,2.16,2.066667,,,,,,0.0
25%,52284.0,35.13129,40.74508,42.3778,42.68846,43.73788,44.45517,44.99521,46.01659,46.57202,...,22.533333,22.95,22.88,23.4,,,,,,0.0
50%,60726.0,49.79851,49.92422,50.05118,50.09362,50.08018,50.10192,50.0633,49.96799,49.86434,...,50.866667,51.05,51.4,51.266667,,,,,,0.0
75%,69174.0,64.21333,59.01531,57.29067,57.03203,55.97237,55.19449,54.66191,53.79012,53.20811,...,92.066667,92.9,93.76,94.433333,,,,,,0.0


The describe method on the 5 categorical variables shows that the highest number of categories is 20 for cat1.  Encoding the 5 categorical variables will result in 33 columns (since we can drop one category for every variable) compared to the original 5.  Given that there are so many numerical variables, one hot encoding will not result in an overly sparse matrix, so this type of encoding will be sufficient for modeling purposes. While Catboost and XGBoost can be implemented without encoding the categorical variables, logistic regression cannot, so we will still proceed with encoding the data beforehand.

In [6]:
# first scale all the numerical data

scaler = StandardScaler()
X = df.iloc[:, 1:193]
X[X.columns] = scaler.fit_transform(X[X.columns])

In [7]:
# Encoding the categorical columns

# one hot encoding the 5 categorical columns
# get list of categorical columns
cat_vars = ['cat1', 'cat2', 'cat3', 'cat4', 'cat5']

# dummify each categorical variable, drop the first column,
# and add it to the numerical columns
for i in cat_vars:
  X = X.join(pd.get_dummies(df[i]).iloc[:, 1:], rsuffix=i)


# Getting the target volumn as its own series
y = df.target

# get a sense of whether or not y is imbalanced
print(sum(y) / len(y))

0.009441273867399421


We can see from the above that the training sample is highly imbalanced, with less than 1% of the samples with a target of 1.

In [8]:
# Getting testing and training set

x_train, x_test, y_train, y_test =  train_test_split(X, y,
                                                     test_size=.33,
                                                     random_state = RANDOM_STATE)

In [9]:
train_label_allocation = sum(y_train) / len(y_train)
test_label_allocation = sum(y_test) / len(y_test)

print('Percentage of ones in training set is: ', train_label_allocation)
print('Percentage of ones in testing set is: ', test_label_allocation)

Percentage of ones in training set is:  0.008833736460195604
Percentage of ones in testing set is:  0.01067463706233988


In [10]:
(train_label_allocation - test_label_allocation) / train_label_allocation

-0.2083943312594039

The percentage of the one label in the testing set is roughly 20% off from the percentage of the one label in the training set.  This is problematic so we will rerun the train_test_split function, but individually for both groups, and then join them.

In [11]:
cleaned_df = X.join(y)

target_0 = cleaned_df[cleaned_df['target']==0]
target_1 = cleaned_df[cleaned_df['target']==1]

In [12]:
# splitting into different training and test sets where the target is 0
x_train_0, x_test_0, y_train_0, y_test_0 =  train_test_split(target_0.iloc[:, :-1],
                                                             target_0.iloc[:, -1],
                                                             test_size=.33,
                                                             random_state = RANDOM_STATE)

In [13]:
# splitting into different training and test sets where the target is 1
x_train_1, x_test_1, y_train_1, y_test_1 =  train_test_split(target_1.iloc[:, :-1],
                                                             target_1.iloc[:, -1],
                                                             test_size=.33,
                                                             random_state = RANDOM_STATE)

In [14]:
# combining them 
x_train = x_train_0.append(x_train_1)
x_test = x_test_0.append(x_test_1)
y_train = y_train_0.append(y_train_1)
y_test = y_test_0.append(y_test_1)

In [15]:
train_label_allocation = sum(y_train) / len(y_train)
test_label_allocation = sum(y_test) / len(y_test)

print('Percentage of ones in training set is: ', train_label_allocation)
print('Percentage of ones in testing set is: ', test_label_allocation)

Percentage of ones in training set is:  0.009360538493899874
Percentage of ones in testing set is:  0.0096051227321238


In [16]:
(train_label_allocation - test_label_allocation) / train_label_allocation

-0.02612929142733805

The percentage of the one label in the testing set is now less than 3% off from the percentage of the one label in the training set.  Models will work much better now with this data as the distribution of the target variable is more similar.  We will now shuffle the data so all the target=1 observations are not right next to each other at the end.

In [17]:
# adding random state for reproducibility
x_train, y_train = shuffle(x_train, y_train, random_state=RANDOM_STATE)
x_test, y_test = shuffle(x_test, y_test, random_state=RANDOM_STATE)

# Building baseline binary classifiers

With our data shuffled and split into testing and training sets, we can begin building the baseline models.

In [18]:
# Defining a function to easily get scores in a df

score_cols=['Train_acc', 'Train_prec', 'Train_recall', 'Train_fscore', 'Train_auc',
            'Test_acc','Test_prec', 'Test_recall', 'Test_fscore', 'Test_auc',
            'Model', 'Model_version']
all_scores = pd.DataFrame(columns=score_cols)

def get_scores(train_pred, test_pred, y_train, y_test, mod, version):
  scores_train = precision_recall_fscore_support(y_train, train_pred,
                                                 average='binary')
  acc_train = sum(train_pred == y_train) / len(y_train)
  prec_train = scores_train[0]
  recall_train = scores_train[1]
  f_score_train = scores_train[2]
  auc_train = roc_auc_score(y_train, train_pred)

  scores_test = precision_recall_fscore_support(y_test, test_pred,
                                                average='binary')
  acc_test = sum(test_pred == y_test) / len(y_test)
  prec_test = scores_test[0]
  recall_test = scores_test[1]
  f_score_test = scores_test[2]
  auc_test = roc_auc_score(y_test, test_pred)

  scores = [acc_train, prec_train, recall_train, f_score_train, 
            auc_train, acc_test, prec_test, recall_test, f_score_test,
            auc_test, mod, version]

  df_score = pd.DataFrame(np.reshape(scores, newshape=(1,12)),
                          columns=score_cols)
  return df_score

In [19]:
# Looking at benchmark logistic regression

# using a very low value for C and increasing max_inter so the model can converge
lr_bench = LogisticRegression(random_state=RANDOM_STATE, C=.00001, max_iter=1000)
lr_bench.fit(x_train, y_train)

# getting predicitons for LR model
lr_train_pred = lr_bench.predict(x_train)
lr_test_pred = lr_bench.predict(x_test)

In [20]:
# Looking at benchmark cat boost classifier

# using a default catboost classifier
cat_bench = CatBoostClassifier(random_state=RANDOM_STATE)
cat_bench.fit(x_train, y_train)

# getting predicitons for catboost model
cat_train_pred = cat_bench.predict(x_train)
cat_test_pred = cat_bench.predict(x_test)

Learning rate set to 0.026951
0:	learn: 0.6214721	total: 203ms	remaining: 3m 23s
1:	learn: 0.5650572	total: 329ms	remaining: 2m 44s
2:	learn: 0.5102223	total: 459ms	remaining: 2m 32s
3:	learn: 0.4634753	total: 602ms	remaining: 2m 29s
4:	learn: 0.4220953	total: 759ms	remaining: 2m 31s
5:	learn: 0.3849939	total: 875ms	remaining: 2m 24s
6:	learn: 0.3512520	total: 999ms	remaining: 2m 21s
7:	learn: 0.3230709	total: 1.12s	remaining: 2m 19s
8:	learn: 0.2967875	total: 1.26s	remaining: 2m 18s
9:	learn: 0.2715826	total: 1.43s	remaining: 2m 21s
10:	learn: 0.2502700	total: 1.54s	remaining: 2m 18s
11:	learn: 0.2286762	total: 1.68s	remaining: 2m 18s
12:	learn: 0.2107195	total: 1.8s	remaining: 2m 17s
13:	learn: 0.1952871	total: 1.93s	remaining: 2m 15s
14:	learn: 0.1817283	total: 2.04s	remaining: 2m 14s
15:	learn: 0.1694078	total: 2.17s	remaining: 2m 13s
16:	learn: 0.1589298	total: 2.28s	remaining: 2m 11s
17:	learn: 0.1462302	total: 2.42s	remaining: 2m 11s
18:	learn: 0.1368736	total: 2.54s	remaining: 

In [21]:
# Looking at benchmark xgboost

# using the default xgboost classifier
xgb_bench = XGBClassifier(random_state=RANDOM_STATE, use_best_model=True)
xgb_bench.fit(x_train, y_train)

# getting predicitons for xgboost model
xgb_train_pred = xgb_bench.predict(x_train)
xgb_test_pred = xgb_bench.predict(x_test)

In [22]:
all_scores = all_scores.append(get_scores(lr_train_pred, lr_test_pred, y_train, y_test, 'lr', 'benchmark'))
all_scores = all_scores.append(get_scores(cat_train_pred, cat_test_pred, y_train, y_test, 'cat', 'benchmark'))
all_scores = all_scores.append(get_scores(xgb_train_pred, xgb_test_pred, y_train, y_test, 'xgb', 'benchmark'))

all_scores.set_index(['Model_version', 'Model'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,Unnamed: 1_level_0,Train_acc,Train_prec,Train_recall,Train_fscore,Train_auc,Test_acc,Test_prec,Test_recall,Test_fscore,Test_auc
Model_version,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
benchmark,lr,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
benchmark,cat,0.9992637778712664,1.0,0.9213483146067416,0.9590643274853802,0.9606741573033708,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
benchmark,xgb,0.9912705090450148,1.0,0.0674157303370786,0.1263157894736842,0.5337078651685393,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103


It seems like the every benchmark model performs poorly on the test set, and none of the models seem to predict 1 for any of the observations.

Catboost seems to have pretty good results for the training set, however, so we will proceed with catboost as the algorithm of choice, even though it greatly overfits the training set.

# Hyperparameter tuning on baseline classifiers

Tuning the hyper parameters for all three models was tested, but there were no improvements to any of the models, and it exponentially increased training time, so these results will not be shown.

# Modify objective and evaluation function of CatBoost

Catboost base package provides only logloss and crossentropy as optimization functions.  For binary classification, the two are identical, so it will not be changed.  However, we can use precision as our evaluation metric to and try to have catboost fit the data for the testing set.

In [None]:
cat_obj_func = CatBoostClassifier(eval_metric='Precision', 
                   custom_metric=['F1', 'Precision', 'Recall'],
                   random_state=RANDOM_STATE,
                   use_best_model=True)
cat_obj_func.fit(x_train, y_train, eval_set=(x_test, y_test))


# getting the predictions
cat_obj_func_train_pred = cat_obj_func.predict(x_train)
cat_obj_func_test_pred = cat_obj_func.predict(x_test)

In [24]:
all_scores = all_scores.append(get_scores(cat_obj_func_train_pred,
                                          cat_obj_func_test_pred,
                                          y_train, y_test,
                                          'cat', 'maximizing precision'))

all_scores.set_index(['Model_version', 'Model'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,Unnamed: 1_level_0,Train_acc,Train_prec,Train_recall,Train_fscore,Train_auc,Test_acc,Test_prec,Test_recall,Test_fscore,Test_auc
Model_version,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
benchmark,lr,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
benchmark,cat,0.9992637778712664,1.0,0.9213483146067416,0.9590643274853802,0.9606741573033708,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
benchmark,xgb,0.9912705090450148,1.0,0.0674157303370786,0.1263157894736842,0.5337078651685393,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
maximizing precision,cat,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5


This did not improve the model, so the next thing we will try is selecting a threshold based off the roc curve for the benchmark model to improve precision.

In [25]:
# getting thresholds for what probability counts as a prediction based off fpr and npr

cat_pool = Pool(x_test, y_test)
curve = get_roc_curve(cat_bench, cat_pool)
thresh_l_fpr = []
thresh_l_fnr = []
rates = [.1,.2, .3, .4, .5, .6, .7, .8, .9,]
for i in rates:
  thresh_l_fpr.append(select_threshold(cat_bench, curve=curve, FPR = i))
  thresh_l_fnr.append(select_threshold(cat_bench, curve=curve, FNR = i))

In [26]:
# getting predicitons for benchmark catboost model
cat_train_thresh = cat_bench.predict_proba(x_train)
cat_test_thresh = cat_bench.predict_proba(x_test)

In [27]:
# getting scores for different thresholds based off different fpr and npr rates

for i in range(len(thresh_l_fpr)):
  cat_train_thresh_pred = [1 if cat_train_thresh[j][1] > thresh_l_fpr[i] else 0 for j in range(len(cat_train_thresh))]
  cat_test_thresh_pred = [1 if cat_test_thresh[j][1] > thresh_l_fpr[i] else 0 for j in range(len(cat_test_thresh))]
  all_scores = all_scores.append(get_scores(cat_train_thresh_pred,
                                          cat_test_thresh_pred,
                                          y_train, y_test,
                                          'cat', 'pred_thresh_for_fpr_' + str(rates[i])))
  
for i in range(len(thresh_l_fnr)):
  cat_train_thresh_pred = [1 if cat_train_thresh[j][1] > thresh_l_fnr[i] else 0 for j in range(len(cat_train_thresh))]
  cat_test_thresh_pred = [1 if cat_test_thresh[j][1] > thresh_l_fnr[i] else 0 for j in range(len(cat_test_thresh))]
  all_scores = all_scores.append(get_scores(cat_train_thresh_pred,
                                          cat_test_thresh_pred,
                                          y_train, y_test,
                                          'cat', 'pred_thresh_for_fnr_' + str(rates[i])))

In [28]:
all_scores.set_index(['Model_version', 'Model'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Train_acc,Train_prec,Train_recall,Train_fscore,Train_auc,Test_acc,Test_prec,Test_recall,Test_fscore,Test_auc
Model_version,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
benchmark,lr,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
benchmark,cat,0.9992637778712664,1.0,0.9213483146067416,0.9590643274853802,0.9606741573033708,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
benchmark,xgb,0.9912705090450148,1.0,0.0674157303370786,0.1263157894736842,0.5337078651685393,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
maximizing precision,cat,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
pred_thresh_for_fpr_0.1,cat,0.92522086663862,0.11125,1.0,0.2002249718785151,0.9622571398237604,0.8975453575240128,0.0588235294117647,0.6444444444444445,0.1078066914498141,0.7722222222222223
pred_thresh_for_fpr_0.2,cat,0.8113167858645352,0.0472650026553372,1.0,0.0902636916835699,0.9047669603991932,0.8008537886872998,0.0413223140495867,0.8888888888888888,0.0789733464955577,0.8444444444444444
pred_thresh_for_fpr_0.3,cat,0.7164493058477072,0.0319569120287253,1.0,0.0619345859429366,0.8568850196411508,0.7026680896478121,0.0306406685236768,0.9777777777777776,0.0594193112761647,0.8388888888888888
pred_thresh_for_fpr_0.4,cat,0.612852334875894,0.0236074270557029,1.0,0.0461259393625291,0.8045970909863043,0.6036286019210245,0.0231578947368421,0.9777777777777776,0.0452442159383033,0.788888888888889
pred_thresh_for_fpr_0.5,cat,0.515776188472865,0.0189644150862987,1.0,0.0372229192806357,0.755600382206179,0.5045891141942369,0.0186125211505922,0.9777777777777776,0.0365296803652968,0.7388888888888889
pred_thresh_for_fpr_0.6,cat,0.4183845183003786,0.0158391172806549,1.0,0.0311843027330063,0.7064444208514704,0.4057630736392743,0.0159066808059384,1.0,0.0313152400835073,0.7


A threshold based off of a false negative rate of .9 results in the highest precision and f score for this test data set, though its auc isn't the best.  A threshold based off of the fnr being at .4 or .5 results in a better balance for precision, fscore, and auc.

# Sample weighting scheme

Since less than 1% of the observations having a target of 1, this is very imbalanced dataset.  As this is troublesome for machine learning models, we will proceed by balancing the class distributions with different techniques and compare their effects.  The class balancing will only occur on the training set.

## SMOTE

SMOTE (synthetic minority oversampling technique) is a class balancing technique that we can experiment with.

On a high level, SMOTE randomly picks a point from the minority class and computes KNN on it.  New synthetic points are added between this point and the neighbors, which increases instances of the minority class, hence being an oversampling technique.  

In [29]:
# Use smote to balance data

smote = SMOTE(random_state=RANDOM_STATE)
smote_x, smote_y = smote.fit_resample(x_train, y_train)
smote_x = pd.DataFrame(data=smote_x, columns=x_train.columns)
smote_y = pd.Series(data=smote_y, name = 'target')

In [None]:
# use smote-balanced data to train catboost

cat_smote = CatBoostClassifier(random_state=RANDOM_STATE)
cat_smote.fit(smote_x, smote_y, eval_set=(x_test, y_test))


# getting the predictions
cat_smote_train_pred = cat_smote.predict(smote_x)
cat_smote_test_pred = cat_smote.predict(x_test)

## TOMEK links

Another balancing method we can look at is Tomek links, which is a pair of observations that are are close together, but have different classes.  By removing the majority class within a pair, we can undersample the data.

In [31]:
# Use tomek links to balance data

tl = TomekLinks(sampling_strategy='majority')
tl_x, tl_y = tl.fit_resample(x_train, y_train)
tl_x = pd.DataFrame(data = tl_x, columns = x_train.columns)
tl_y = pd.Series(data=tl_y, name = 'target')

In [None]:
# use tomek links-balanced data to train catboost

cat_tl = CatBoostClassifier(random_state=RANDOM_STATE)
cat_tl.fit(tl_x, tl_y, eval_set=(x_test, y_test))


# getting the predictions
cat_tl_train_pred = cat_tl.predict(tl_x)
cat_tl_test_pred = cat_tl.predict(x_test)

## SmoteTomek

The last balancing method we will look at is Smote Tomek.  It essentially combines Smote and Tomek links by first using SMOTE then cleaning with Tomek Links.

In [33]:
# using SmoteTomek to balance data

smo_tl = SMOTETomek(random_state=RANDOM_STATE)
smo_tl_x, smo_tl_y = smo_tl.fit_resample(x_train, y_train)
smo_tl_x = pd.DataFrame(data = smo_tl_x, columns = x_train.columns)
smo_tl_y = pd.Series(data=smo_tl_y, name = 'target')

In [None]:
# use smotetomek-balanced data to train catboost

cat_smo_tl = CatBoostClassifier(random_state=RANDOM_STATE)
cat_smo_tl.fit(smo_tl_x, smo_tl_y, eval_set=(x_test, y_test))


# getting the predictions
cat_smo_tl_train_pred = cat_smo_tl.predict(smo_tl_x)
cat_smo_tl_test_pred = cat_smo_tl.predict(x_test)

In [35]:
all_scores = all_scores.append(get_scores(cat_smote_train_pred, cat_smote_test_pred, smote_y, y_test, 'cat', 'smote'))
all_scores = all_scores.append(get_scores(cat_tl_train_pred, cat_tl_test_pred, tl_y, y_test, 'cat', 'tomek'))
all_scores = all_scores.append(get_scores(cat_smo_tl_train_pred, cat_smo_tl_test_pred, smo_tl_y, y_test, 'cat', 'smote_tomek'))

all_scores.set_index(['Model_version', 'Model'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Train_acc,Train_prec,Train_recall,Train_fscore,Train_auc,Test_acc,Test_prec,Test_recall,Test_fscore,Test_auc
Model_version,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
benchmark,lr,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
benchmark,cat,0.9992637778712664,1.0,0.9213483146067416,0.9590643274853802,0.9606741573033708,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
benchmark,xgb,0.9912705090450148,1.0,0.0674157303370786,0.1263157894736842,0.5337078651685393,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
maximizing precision,cat,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
pred_thresh_for_fpr_0.1,cat,0.92522086663862,0.11125,1.0,0.2002249718785151,0.9622571398237604,0.8975453575240128,0.0588235294117647,0.6444444444444445,0.1078066914498141,0.7722222222222223
pred_thresh_for_fpr_0.2,cat,0.8113167858645352,0.0472650026553372,1.0,0.0902636916835699,0.9047669603991932,0.8008537886872998,0.0413223140495867,0.8888888888888888,0.0789733464955577,0.8444444444444444
pred_thresh_for_fpr_0.3,cat,0.7164493058477072,0.0319569120287253,1.0,0.0619345859429366,0.8568850196411508,0.7026680896478121,0.0306406685236768,0.9777777777777776,0.0594193112761647,0.8388888888888888
pred_thresh_for_fpr_0.4,cat,0.612852334875894,0.0236074270557029,1.0,0.0461259393625291,0.8045970909863043,0.6036286019210245,0.0231578947368421,0.9777777777777776,0.0452442159383033,0.788888888888889
pred_thresh_for_fpr_0.5,cat,0.515776188472865,0.0189644150862987,1.0,0.0372229192806357,0.755600382206179,0.5045891141942369,0.0186125211505922,0.9777777777777776,0.0365296803652968,0.7388888888888889
pred_thresh_for_fpr_0.6,cat,0.4183845183003786,0.0158391172806549,1.0,0.0311843027330063,0.7064444208514704,0.4057630736392743,0.0159066808059384,1.0,0.0313152400835073,0.7


Smote_tomek and smote seem to perform the same on the test set and training set with benchmark catboost model.  We will go with smote_tomek for the final model since it is a bit more complex and may perform better on future unseen data.

# Cat classifier with smotetomek and threshold - Final Model

We will combine smotetomek with our previous method of looking at the fnr and fpr rates to get the threshold.  The previous thresholds won't work since the data has changed.

In [36]:
cat_pool = Pool(x_test, y_test)
curve = get_roc_curve(cat_smo_tl, cat_pool)
thresh_l_fpr = []
thresh_l_fnr = []
rates = [.1,.2, .3, .4, .5, .6, .7, .8, .9,]
for i in rates:
  thresh_l_fpr.append(select_threshold(cat_smo_tl, curve=curve, FPR = i))
  thresh_l_fnr.append(select_threshold(cat_smo_tl, curve=curve, FNR = i))

In [37]:
# getting predicitons for benchmark catboost model
cat_smo_tl_train_thresh = cat_smo_tl.predict_proba(x_train)
cat_smo_tl_test_thresh = cat_smo_tl.predict_proba(x_test)

In [38]:
# getting scores for different thresholds based off different fpr and npr rates

for i in range(len(thresh_l_fpr)):
  cat_smo_tl_train_thresh_pred = [1 if cat_smo_tl_train_thresh[j][1] > thresh_l_fpr[i] else 0 for j in range(len(cat_smo_tl_train_thresh))]
  cat_smo_tl_test_thresh_pred = [1 if cat_smo_tl_test_thresh[j][1] > thresh_l_fpr[i] else 0 for j in range(len(cat_smo_tl_test_thresh))]
  all_scores = all_scores.append(get_scores(cat_smo_tl_train_thresh_pred,
                                          cat_smo_tl_test_thresh_pred,
                                          y_train, y_test,
                                          'cat', 'smo_tl_pred_thresh_for_fpr_' + str(rates[i])))
  
for i in range(len(thresh_l_fnr)):
  cat_smo_tl_train_thresh_pred = [1 if cat_smo_tl_train_thresh[j][1] > thresh_l_fnr[i] else 0 for j in range(len(cat_smo_tl_train_thresh))]
  cat_smo_tl_test_thresh_pred = [1 if cat_smo_tl_test_thresh[j][1] > thresh_l_fnr[i] else 0 for j in range(len(cat_smo_tl_test_thresh))]
  all_scores = all_scores.append(get_scores(cat_smo_tl_train_thresh_pred,
                                          cat_smo_tl_test_thresh_pred,
                                          y_train, y_test,
                                          'cat', 'smo_tl_pred_thresh_for_fnr_' + str(rates[i])))

In [39]:
all_scores.set_index(['Model_version', 'Model'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Train_acc,Train_prec,Train_recall,Train_fscore,Train_auc,Test_acc,Test_prec,Test_recall,Test_fscore,Test_auc
Model_version,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
benchmark,lr,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
benchmark,cat,0.9992637778712664,1.0,0.9213483146067416,0.9590643274853802,0.9606741573033708,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
benchmark,xgb,0.9912705090450148,1.0,0.0674157303370786,0.1263157894736842,0.5337078651685393,0.9901814300960512,0.0,0.0,0.0,0.4998922413793103
maximizing precision,cat,0.9906394615061002,0.0,0.0,0.0,0.5,0.9903948772678762,0.0,0.0,0.0,0.5
pred_thresh_for_fpr_0.1,cat,0.92522086663862,0.11125,1.0,0.2002249718785151,0.9622571398237604,0.8975453575240128,0.0588235294117647,0.6444444444444445,0.1078066914498141,0.7722222222222223
pred_thresh_for_fpr_0.2,cat,0.8113167858645352,0.0472650026553372,1.0,0.0902636916835699,0.9047669603991932,0.8008537886872998,0.0413223140495867,0.8888888888888888,0.0789733464955577,0.8444444444444444
pred_thresh_for_fpr_0.3,cat,0.7164493058477072,0.0319569120287253,1.0,0.0619345859429366,0.8568850196411508,0.7026680896478121,0.0306406685236768,0.9777777777777776,0.0594193112761647,0.8388888888888888
pred_thresh_for_fpr_0.4,cat,0.612852334875894,0.0236074270557029,1.0,0.0461259393625291,0.8045970909863043,0.6036286019210245,0.0231578947368421,0.9777777777777776,0.0452442159383033,0.788888888888889
pred_thresh_for_fpr_0.5,cat,0.515776188472865,0.0189644150862987,1.0,0.0372229192806357,0.755600382206179,0.5045891141942369,0.0186125211505922,0.9777777777777776,0.0365296803652968,0.7388888888888889
pred_thresh_for_fpr_0.6,cat,0.4183845183003786,0.0158391172806549,1.0,0.0311843027330063,0.7064444208514704,0.4057630736392743,0.0159066808059384,1.0,0.0313152400835073,0.7


Given the above table, our final model should be the catboost classifier trained on data that has been balanced with smotetomek, with a threshold based off the the fnr of .05.  Athough there are other models that perform better on the precision, such as some models with different thresholds and the default threshold on the smote and smote tomek balanced data, all these other models have a worse auc.  Our final model has the best balance between precision, f score, and auc.