### Final model evaluation
- Check the performance of the trained model on the test set which was set aside in the exploration phase
- This performance is the best estimator for performance in 'the real world'

In [1]:
import datetime
import os
import pickle

import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

import src.features.data_cleaning as data_clean
import src.features.data_exploration as data_exp
import src.features.feature_engineering as feat_eng

#### Load the model

In [2]:
models_path = os.path.join("..", "models")

for file in os.scandir(models_path):
    print(file.name)

.gitkeep
20200304_lightgbm_45796_rows
lightgbm_first_round.sav


In [3]:
filename = os.path.join(models_path, "20200304_lightgbm_45796_rows")

In [4]:
lightgbm = pickle.load(open(filename, "rb"))

#### Load the data

In [5]:
test_path = os.path.join("..", "data", "test_set")

In [6]:
test_set = pd.read_csv(
    test_path, sep=";", decimal=".", low_memory=False, compression="zip"
)

In [7]:
test_set.shape

(11450, 416)

#### Load the feature list

In [8]:
feature_path = os.path.join("..", "feature_lists", "combined_list_4_votes")

In [9]:
feature_list = list(pd.read_csv(feature_path, sep=";")["features"])

In [10]:
feature_list

['ele_bron_ecostroom_ind',
 'ele_bron_wind_eu_ind',
 'ele_vbr_prf_1b_ind',
 'gas_klant_ind',
 'gas_lpt_4jr_ind',
 'gas_type_prijs_vast_ind',
 'vzgebied_stedin_buiten_ind',
 'woon_eigendom_koop_ind',
 'ntal_kind_in_huis_geen_ind',
 'ntal_volwassen_in_huis_1_ind',
 'ntal_volwassen_in_huis_2_ind',
 'postorder_koper_ind',
 'auto_bezit_geen_ind',
 'auto_bezit_eigen_ind',
 'bezit_spelcomputer_ind',
 'web_activit_zeer_veel_ind',
 'btwz_onbekend_ind',
 'incgrp_medium_risk_ind',
 'incgrp_low_risk_ind',
 'contact_lst_jr_aant_freq',
 'cont_kcm_soort_service_freq',
 'sale_lstjr_knl_last_inb_ind',
 'web_url_app_lst_jr_ind',
 'web_url_meterstnden_lst_jr_ind',
 'web_url_verhuizen_lst_jr_ind',
 'days_since_ltst_sale_num',
 'klant_jr_sinds_frst_start_rec']

### Incorporate cleaning & changes to the final dataset
- In this case there is only 1 additional feature added, the rest is the same as the original
- most cleaning consisted of removing items

In [11]:
today = datetime.date.today()
test_set["days_since_ltst_sale_num"] = (
    today - pd.to_datetime(test_set["ltst_sale_dat"]).dt.date
).dt.days
test_set["days_since_ltst_sale_num"] = test_set["days_since_ltst_sale_num"].fillna(
    test_set["days_since_ltst_sale_num"].median()
)

In [13]:
feature_list.insert(0, "toon_churn")

In [15]:
data_exp.describe_df(test_set[feature_list], dependent_variable="toon_churn")

This dataframe has 11450 rows and 28 columns.

The dependent variable consists of 49.7% of ones.

The variables have the following data types:
int64      26
float64     2
dtype: int64

The postfixes are distributed as follows:
Counter({'ind': 23, 'freq': 2, 'churn': 1, 'num': 1, 'rec': 1})

The following variables have missing values:
Empty DataFrame
Columns: [column, perc_missings]
Index: []




### Final results
- Very slight decrease in performance .. bueno

In [19]:
y = test_set["toon_churn"]
X = test_set[feature_list].drop("toon_churn", axis=1)

In [20]:
lightgbm_prediction = lightgbm.predict(X)

In [21]:
print(confusion_matrix(lightgbm_prediction, y))
print(classification_report(lightgbm_prediction, y))

[[5351 1002]
 [ 403 4694]]
              precision    recall  f1-score   support

           0       0.93      0.84      0.88      6353
           1       0.82      0.92      0.87      5097

    accuracy                           0.88     11450
   macro avg       0.88      0.88      0.88     11450
weighted avg       0.88      0.88      0.88     11450



#### Next steps
- Create a make_dataset function which collects only the necessary columns for this model
- Create seperate functions to re-train the model or use it to predict toon churn for new customers
- Create a hyperopt function
- Add the above mentioned functions in a script
- Plug the model in on the future DS & DE platform