# Final Testing

In this notebook, we test the final model on a holdout set (set aside from the competition training set) before deploying it for the final submission. We then refit the model on the full dataset and generate predictions for the competition test set.

In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.metrics import f1_score, balanced_accuracy_score, plot_confusion_matrix, classification_report

In [None]:
# Load the pickled training features together with the holdout features/labels.
# X_train is only needed further down as the reference for the one-hot-encoded
# column layout; X_test / y_test are the holdout split used for evaluation.
X_train, X_test, y_test = (
    pd.read_pickle('PKL/X_train.pkl'),
    pd.read_pickle('PKL/X_test.pkl'),
    pd.read_pickle('PKL/y_test.pkl'),
)

In [None]:
# One-hot encode the categorical features of the training and holdout splits.
# NOTE(review): the original comment also mentions standardization (needed if the
# final model is KNN), but no scaling happens in this cell — confirm it is handled
# inside the pickled model.
X_train_ohe = pd.get_dummies(X_train)
X_test_ohe = pd.get_dummies(X_test)

# Compare the actual column names, not just the column counts: two frames can
# have the same width and still contain different dummy columns (e.g. a category
# level that only occurs in one of the splits).
missing_in_test = [col for col in X_train_ohe.columns if col not in X_test_ohe.columns]
extra_in_test = [col for col in X_test_ohe.columns if col not in X_train_ohe.columns]

if missing_in_test or extra_in_test:
    print(missing_in_test)
    print(extra_in_test)
else:
    print('Good to go')

# Align the holdout frame to the training layout so the model always receives the
# same features in the same order: dummy columns absent from the holdout set are
# added as all-zero columns, columns unseen in training are dropped.
# Assumes the pickled model was fitted on X_train_ohe's columns — TODO confirm.
X_test_ohe = X_test_ohe.reindex(columns=X_train_ohe.columns, fill_value=0)

In [None]:
# Load the final model from disk.
# NOTE: unpickling can execute arbitrary code — only load pickle files that come
# from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('PKL/final_model.pkl', 'rb') as model_file:
    final_model = pickle.load(model_file)

In [None]:
# Predict on the holdout set and report performance.
y_pred = final_model.predict(X_test_ohe)

f1_test = round(f1_score(y_test, y_pred, average='weighted'), 3)
# The value printed as "Test Accuracy" is the balanced accuracy.
acc_test = round(balanced_accuracy_score(y_test, y_pred), 3)
print('Test F1 score: ', f1_test, '/ Test Accuracy: ', acc_test)

# Plot the confusion matrix for the model being evaluated (final_model, the same
# estimator that produced y_pred). The colormap is passed by name so this cell
# does not depend on matplotlib.pyplot having been imported as `plt`.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay — confirm the pinned version.
plot_confusion_matrix(final_model, X_test_ohe, y_test,
                      xticks_rotation='vertical', cmap='Blues')
print(classification_report(y_test, y_pred))

In [None]:
# Refit the final model on the full dataset before predicting the submission.
# (presumably X_full / y_full are the training and holdout splits combined —
# verify against the notebook that wrote these pickles)
y_full = pd.read_pickle('PKL/y_full.pkl')
X_full = pd.read_pickle('PKL/X_full.pkl')

# Same one-hot encoding that is applied to the other feature frames.
X_full_ohe = pd.get_dummies(data=X_full)

# fit() updates final_model in place; from here on it is the refitted model.
final_model.fit(X_full_ohe, y_full)

In [None]:
# Read the raw competition test features that the submission is predicted from.
X_submission = pd.read_csv(filepath_or_buffer='DATA/TEST_VALUES.csv')

In [None]:
# Run the project's preprocessing pipeline on the raw submission features.
from preprocessing_pipeline import preprocessing

X_submission_pp = preprocessing(X_submission)

# One-hot encode the preprocessed features.
# NOTE(review): the original comment also mentions standardization (needed if the
# final model is KNN), but no scaling happens in this cell — confirm it is handled
# inside the model or the preprocessing function.
X_submission_ohe = pd.get_dummies(X_submission_pp)

# final_model was refit on X_full_ohe, so that frame (not X_train_ohe) defines
# the feature layout the model expects at prediction time.
expected_columns = X_full_ohe.columns

# Compare the actual column names, not just the column counts: two frames can
# have the same width and still contain different dummy columns.
missing_in_submission = [col for col in expected_columns if col not in X_submission_ohe.columns]
extra_in_submission = [col for col in X_submission_ohe.columns if col not in expected_columns]

if missing_in_submission or extra_in_submission:
    print(missing_in_submission)
    print(extra_in_submission)
else:
    print('Good to go')

# Align to the fitted layout: dummy columns absent from the submission set are
# added as all-zero columns, unseen columns are dropped, and the column order is
# made identical to the one used for fitting.
X_submission_ohe = X_submission_ohe.reindex(columns=expected_columns, fill_value=0)

In [None]:
# Predict labels for the competition test set with the model refit on the full data.
# Nothing is plotted or scored in this cell.
# NOTE(review): this reuses the name y_pred from the holdout evaluation cell, so
# the holdout predictions are overwritten once this cell runs.
y_pred = final_model.predict(X_submission_ohe)

In [None]:
# save the submission