# Model Evaluation

In this notebook, I will test multiple models and evaluate them to choose the best one.

In [10]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, SCORERS
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Display options for the notebook: 4 decimals for floats, no truncation of
# long sequences, and up to 50 columns when printing a wide frame.
# The fully qualified key is used because the bare 'precision' pattern can
# match more than one option on newer pandas releases and then raises OptionError.
pd.set_option('display.precision', 4)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_columns', 50)

In [11]:
# Load the pickled training features (X) and labels (y).
# NOTE: unpickling executes arbitrary code, so only load files you trust.
X, y = (
    pd.read_pickle('PKL/X_train.pkl'),
    pd.read_pickle('PKL/Y_train.pkl'),
)

## Train/Test Split
I am going to split my training data once more. Because this is a competition dataset, I don't have the labels for the provided test set, so I cannot score predictions against it. I will therefore keep the original test features as a holdout set and create another train/test split from the training data for validation.

In [12]:
# Keep 20% of the labelled rows aside for validation; the fixed seed makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 13,
)

In [13]:
# One-hot encode the training features (dummy columns via pandas).
X_train_ohe = pd.get_dummies(data = X_train)

## KNN
First, I will run KNN using GridSearchCV.

In [16]:
# Collects the evaluation results of each model, keyed by a short model name.
score_keeper = dict()

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid explored by the KNN search.
params = {
    'n_neighbors': range(1, 20, 2), # odd K values from 1 to 19
    'weights': ['uniform', 'distance'] # voting weights
}

# 5-fold grid search, ranked by weighted F1.
knc = KNeighborsClassifier()
knc_g = GridSearchCV(knc, params, cv = 5, scoring = 'f1_weighted', verbose = 1, n_jobs = -1)
knc_g.fit(X_train_ohe, y_train)
print(knc_g.best_params_, ': ', knc_g.best_score_)

# The search was fit on one-hot encoded features, so the validation features
# must be encoded the same way and aligned to the training columns: categories
# missing from the validation split become all-zero columns, and categories
# unseen during training are dropped.
X_test_ohe = pd.get_dummies(X_test).reindex(columns = X_train_ohe.columns, fill_value = 0)

y_pred = knc_g.best_estimator_.predict(X_test_ohe)
# Use the same averaging as the 'f1_weighted' CV scorer so the validation F1
# is comparable to best_score_ (the default binary average also fails on a
# multiclass target).
f1_test = round(f1_score(y_test, y_pred, average = 'weighted'), 3)
acc_test = round(accuracy_score(y_test, y_pred), 3)

print('Test F1 score: ', f1_test, '/ Test Accuracy: ', acc_test)
score_keeper['knn_gsc'] = (f1_test, acc_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
def dtree_test(X, y, max_depth = 5, n_splits = 5, scoring = None):
    """Cross-validate a class-balanced decision tree and return its mean score.

    Parameters
    ----------
    X : feature matrix passed to ``cross_val_score``.
    y : target values passed to ``cross_val_score``.
    max_depth : maximum depth of the tree (default 5, the previous fixed value).
    n_splits : number of shuffled K-fold splits (default 5, the previous fixed value).
    scoring : scorer name or callable forwarded to ``cross_val_score``. The
        default ``None`` keeps the estimator's own ``score`` method, which is
        what the function used before this parameter existed.

    Returns
    -------
    The mean of the per-fold scores.
    """
    # Fixed seeds on both the tree and the splitter keep the result reproducible.
    dtree = DecisionTreeClassifier(class_weight='balanced', max_depth = max_depth, random_state = 10)
    cv = KFold(n_splits = n_splits, shuffle = True, random_state = 20)
    fold_scores = cross_val_score(dtree, X, y, cv = cv, scoring = scoring,
                                  n_jobs = -1, verbose = 1)
    return np.mean(fold_scores)

In [None]:
# Cross-validated score of the baseline decision tree on the encoded training set.
score = dtree_test(X_train_ohe, y_train)
# Store the result under its own key. Rebinding `score_keeper` to a new dict
# here would throw away every score recorded by earlier cells.
score_keeper['dtree_basic'] = round(score, 3)

In [None]:
# Checking which scorer names are available for the `scoring=` argument.
# NOTE(review): SCORERS is already imported in the first cell, so this import
# is redundant. Newer scikit-learn releases appear to expose
# `sklearn.metrics.get_scorer_names()` for this purpose instead of SCORERS;
# confirm against the installed version.
from sklearn.metrics import SCORERS
#sorted(SCORERS)

In [None]:
# Display the scores collected so far, keyed by model name.
score_keeper