In [52]:
import datetime, time, math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report

In [54]:
# Show plots in jupyter notebook
%matplotlib inline

In [55]:
# Root folder that contains the "processed_data" directory.
# The BCG_DATA_ROOT environment variable, when set, overrides the original author's local path,
# so the notebook can run on another machine without editing this cell.
DATA_ROOT = os.environ.get("BCG_DATA_ROOT", "C:/Users/saivi/Documents/BCG")
DATA_DIR = os.path.join(DATA_ROOT, "processed_data")
TRAINING_DATA = os.path.join(DATA_DIR, "train_data.pkl")
HISTORY_DATA = os.path.join(DATA_DIR, "history_data.pkl")

In [56]:
# Load the customer-level frame and the price-history frame from their pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files must come from a trusted source.
train_data = pd.read_pickle(TRAINING_DATA)
history_data = pd.read_pickle(HISTORY_DATA)

In [57]:
train_data.head()

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,...,origin_usa,activity_apd,activity_ckf,activity_clu,activity_cwo,activity_fmw,activity_kkk,activity_kwu,activity_sfi,activity_wxe
0,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,0,0,0,0,0,0,0
1,24011ae4ebbe3035111d65fa7c15bc57,4.327104,4.739944,0.0,3.085953,0.0,0.444045,0.114481,0.098142,40.606701,...,0,0,0,0,0,0,0,0,0,0
2,d29c2c54acc38ff3c0614d0a653813dd,3.668479,0.0,0.0,2.28092,0.0,1.237292,0.145711,0.0,44.311378,...,0,0,0,0,0,0,0,0,0,0
3,764c75f661154dac3a6c254cd082ea7d,2.736397,0.0,0.0,1.689841,0.0,1.599009,0.165794,0.087899,44.311378,...,0,0,0,0,0,0,0,0,0,0
4,bba03439a292a1e166f80264c16191cb,3.200029,0.0,0.0,2.382089,0.0,1.318689,0.146694,0.0,44.311378,...,0,0,0,0,0,0,0,0,0,0


In [58]:
train_data.shape

(16096, 44)

In [59]:
history_data.head()

Unnamed: 0,id,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix
0,038af19179925da21a25619c5a24b745,2015-01-01,0.151367,0.0,0.0,44.266931,0.0,0.0
1,038af19179925da21a25619c5a24b745,2015-02-01,0.151367,0.0,0.0,44.266931,0.0,0.0
2,038af19179925da21a25619c5a24b745,2015-03-01,0.151367,0.0,0.0,44.266931,0.0,0.0
3,038af19179925da21a25619c5a24b745,2015-04-01,0.149626,0.0,0.0,44.266931,0.0,0.0
4,038af19179925da21a25619c5a24b745,2015-05-01,0.149626,0.0,0.0,44.266931,0.0,0.0


In [60]:
train_data.shape

(16096, 44)

In [61]:
train_data['id']

0        48ada52261e7cf58715202705a0451c9
1        24011ae4ebbe3035111d65fa7c15bc57
2        d29c2c54acc38ff3c0614d0a653813dd
3        764c75f661154dac3a6c254cd082ea7d
4        bba03439a292a1e166f80264c16191cb
                       ...               
16091    18463073fb097fc0ac5d3e040f356987
16092    d0a6f71671571ed83b2645d23af6de00
16093    10e6828ddd62cbcf687cb74928c4c2d2
16094    1cf20fd6206d7678d5bcafd28c53b4db
16095    563dde550fd624d7352f3de77c0cdfcd
Name: id, Length: 16096, dtype: object

In [62]:
# Show the customer-level row(s) for one example id
train_data.loc[train_data['id'] == '038af19179925da21a25619c5a24b745']

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,...,origin_usa,activity_apd,activity_ckf,activity_clu,activity_cwo,activity_fmw,activity_kkk,activity_kwu,activity_sfi,activity_wxe
12189,038af19179925da21a25619c5a24b745,3.553519,0.0,2.800029,2.72632,0.0,1.313656,0.14398,0.0,44.311378,...,0,0,0,0,0,0,0,0,0,1


In [63]:
# Show the price-history rows recorded for the same example id
history_data.loc[history_data['id'] == '038af19179925da21a25619c5a24b745']

Unnamed: 0,id,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix
0,038af19179925da21a25619c5a24b745,2015-01-01,0.151367,0.0,0.0,44.266931,0.0,0.0
1,038af19179925da21a25619c5a24b745,2015-02-01,0.151367,0.0,0.0,44.266931,0.0,0.0
2,038af19179925da21a25619c5a24b745,2015-03-01,0.151367,0.0,0.0,44.266931,0.0,0.0
3,038af19179925da21a25619c5a24b745,2015-04-01,0.149626,0.0,0.0,44.266931,0.0,0.0
4,038af19179925da21a25619c5a24b745,2015-05-01,0.149626,0.0,0.0,44.266931,0.0,0.0
5,038af19179925da21a25619c5a24b745,2015-06-01,0.149626,0.0,0.0,44.26693,0.0,0.0
6,038af19179925da21a25619c5a24b745,2015-07-01,0.150321,0.0,0.0,44.44471,0.0,0.0
7,038af19179925da21a25619c5a24b745,2015-08-01,0.145859,0.0,0.0,44.44471,0.0,0.0
8,038af19179925da21a25619c5a24b745,2015-09-01,0.145859,0.0,0.0,44.44471,0.0,0.0
9,038af19179925da21a25619c5a24b745,2015-10-01,0.145859,0.0,0.0,44.44471,0.0,0.0


Both DataFrames can be merged on 'id'.

In [64]:
# Attach the price-history rows to each customer record (inner join on `id`, as before).
# `validate` makes pandas raise a MergeError if `id` is not unique in train_data; without it a
# duplicated customer row would silently multiply the merged rows.
train = pd.merge(train_data, history_data, on='id', how='inner', validate='one_to_many')

In [65]:
train.head()

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,...,activity_kwu,activity_sfi,activity_wxe,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix
0,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-01-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
1,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-02-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
2,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-03-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
3,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-04-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
4,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-05-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426


In [66]:
train.shape

(193002, 51)

In [67]:
# Keep an independent snapshot of the merged frame. A plain assignment would only create a second
# name for the same object, so later in-place edits of `train` would change the "copy" as well.
copy_train = train.copy()

In [68]:
train.dtypes

id                                  object
cons_12m                           float64
cons_gas_12m                       float64
cons_last_month                    float64
forecast_cons_12m                  float64
forecast_discount_energy           float64
forecast_meter_rent_12m            float64
forecast_price_energy_p1           float64
forecast_price_energy_p2           float64
forecast_price_pow_p1              float64
has_gas                              int64
imp_cons                           float64
margin_gross_pow_ele               float64
margin_net_pow_ele                 float64
nb_prod_act                          int64
net_margin                         float64
pow_max                            float64
churn                                int64
tenure                               int32
months_activ                       float64
months_to_end                      float64
months_modif_prod                  float64
months_renewal                     float64
channel_epu

In [69]:
train

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,...,activity_kwu,activity_sfi,activity_wxe,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix
0,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-01-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
1,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-02-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
2,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-03-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
3,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-04-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
4,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-05-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192997,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-08-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000
192998,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-09-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000
192999,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-10-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000
193000,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-11-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000


In [70]:
train['id'].nunique()

16096

In [71]:
train

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,...,activity_kwu,activity_sfi,activity_wxe,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix
0,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-01-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
1,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-02-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
2,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-03-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
3,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-04-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
4,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,0,2015-05-01,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192997,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-08-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000
192998,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-09-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000
192999,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-10-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000
193000,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.000000,2.882758,0.0,0.315970,0.167086,0.088454,45.311378,...,0,0,0,2015-11-01,0.165962,0.086905,0.000000,44.266930,0.000000,0.00000


In [72]:
train['price_date'].dtype

dtype('<M8[ns]')

In [73]:
# Ensure price_date is datetime64 before it is converted to ordinals.
# The dtype check in the previous cell already reports '<M8[ns]', so this acts as a safeguard.
train['price_date'] = pd.to_datetime(train['price_date'], format='%Y-%m-%d')

In [74]:
# Replace each date with its proleptic Gregorian ordinal (an integer day number) so the column is numeric.
# NOTE(review): the column is overwritten, so re-running this cell would operate on integers, not dates.
train['price_date']=train['price_date'].map(datetime.datetime.toordinal)

In [75]:
train['price_date'].dtype

dtype('int64')

In [76]:
# List the merged frame's column names as a one-column table for easier reading
pd.DataFrame({'DataFrame Columns': train.columns})

Unnamed: 0,DataFrame Columns
0,id
1,cons_12m
2,cons_gas_12m
3,cons_last_month
4,forecast_cons_12m
5,forecast_discount_energy
6,forecast_meter_rent_12m
7,forecast_price_energy_p1
8,forecast_price_energy_p2
9,forecast_price_pow_p1


In [77]:
# Encode every customer id as an integer code (pandas categorical codes) and store it as uniqueId,
# so the string id column can be dropped later while rows stay attributable to a customer.
train['uniqueId'] = (train['id']).astype('category').cat.codes
train.head()

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,...,activity_sfi,activity_wxe,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix,uniqueId
0,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,735599,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
1,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,735630,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
2,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,735658,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
3,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,735689,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
4,48ada52261e7cf58715202705a0451c9,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,...,0,0,735719,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666


In [78]:
train.tail()

Unnamed: 0,id,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,...,activity_sfi,activity_wxe,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix,uniqueId
192997,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.0,2.882758,0.0,0.31597,0.167086,0.088454,45.311378,...,0,0,735811,0.165962,0.086905,0.0,44.26693,0.0,0.0,5472
192998,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.0,2.882758,0.0,0.31597,0.167086,0.088454,45.311378,...,0,0,735842,0.165962,0.086905,0.0,44.26693,0.0,0.0,5472
192999,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.0,2.882758,0.0,0.31597,0.167086,0.088454,45.311378,...,0,0,735872,0.165962,0.086905,0.0,44.26693,0.0,0.0,5472
193000,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.0,2.882758,0.0,0.31597,0.167086,0.088454,45.311378,...,0,0,735903,0.165962,0.086905,0.0,44.26693,0.0,0.0,5472
193001,563dde550fd624d7352f3de77c0cdfcd,3.941064,0.0,0.0,2.882758,0.0,0.31597,0.167086,0.088454,45.311378,...,0,0,735933,0.165962,0.086905,0.0,44.26693,0.0,0.0,5472


In [79]:
# Build a lookup from the original customer id string to its integer uniqueId.
# Only the two relevant columns are paired up; converting the whole merged frame to nested
# dicts just to keep one entry does the same work for every other column as well.
# Each id maps to a single code, so repeated rows of a customer simply rewrite the same value.
id_dict = dict(zip(train['id'], train['uniqueId']))
id_dict

{'48ada52261e7cf58715202705a0451c9': 4666,
 '24011ae4ebbe3035111d65fa7c15bc57': 2361,
 'd29c2c54acc38ff3c0614d0a653813dd': 13250,
 '764c75f661154dac3a6c254cd082ea7d': 7430,
 'bba03439a292a1e166f80264c16191cb': 11748,
 '568bb38a1afd7c0fc49c77b3789b59a3': 5493,
 '149d57cf92fc41cf94415803a877cb4b': 1328,
 '1aa498825382410b098937d65c4ec26d': 1704,
 '7ab4bf4878d8f7661dfc20e9b8e18011': 7719,
 '01495c955be7ec5e7f3203406785aae0': 100,
 'f53a254b1115634330c12c7fdbf7958a': 15404,
 '10c1b2f97a2d2a6f10299dc213d1a370': 1082,
 'd5e512dbc8118a830a7b8c7f61d89992': 13453,
 '6ee77a019251bcc304c88167e0dfcb4c': 7017,
 '2c05848e1a577dd041ea6804bbf5bbf5': 2866,
 '21860c2ff2d5df75503b230ce629c253': 2167,
 '227254ed8be1393d5e37e6dc9c3f359b': 2241,
 'c5037389dd8be351d3c40f2973da2391': 12348,
 'e4773b60d2e68aac94795782abfc006b': 14398,
 'bd14bbac3aca710ee5288fcd8511d4bb': 11829,
 '4e4a34a6d91ff25d0fdf68733907ad44': 5013,
 'f2b93783adecf0d0d8d60ab547bda3de': 15243,
 '47e976c5d7f1ee1fee6d60ca39fd0218': 4621,
 'db

In [80]:
# Keep an independent snapshot that still contains the `id` column. A plain assignment would only
# alias the same frame, and dropping `id` in place would then remove it from the snapshot too.
train_uniID_copy = train.copy()
# The string id is not a model feature (uniqueId identifies the customer), so remove it from `train`.
train = train.drop(columns='id')

In [81]:
train.head()

Unnamed: 0,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,has_gas,...,activity_sfi,activity_wxe,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix,uniqueId
0,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735599,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
1,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735630,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
2,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735658,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
3,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735689,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
4,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735719,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666


1. Splitting Data

In [82]:
# Fit a StandardScaler on the merged frame.
# NOTE(review): only `fit` is called; no `transform` call appears in the visible notebook, so `train`
# itself stays unscaled (the head() output below still shows the raw values). The fit also covers
# the `churn` label and `uniqueId` columns, because they have not been separated out at this point.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train)
train.head()

Unnamed: 0,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_p1,forecast_price_energy_p2,forecast_price_pow_p1,has_gas,...,activity_sfi,activity_wxe,price_date,price_p1_var,price_p2_var,price_p3_var,price_p1_fix,price_p2_fix,price_p3_fix,uniqueId
0,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735599,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
1,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735630,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
2,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735658,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
3,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735689,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666
4,5.490346,0.0,4.001128,4.423595,0.0,2.556652,0.095919,0.088347,58.995952,0,...,0,0,735719,0.106043,0.093474,0.067378,58.936774,36.344726,8.33426,4666


In [83]:
# Predictors: every column except the churn label and the uniqueId identifier
X = train.drop(columns=["uniqueId", "churn"])
# Target: the churn flag
y = train["churn"]

In [92]:
# Class counts of the target before any resampling.
# Counter is imported here because this is the first cell, in document order, that uses it.
from collections import Counter
Counter(y)

Counter({0: 173893, 1: 19109})

In [84]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [85]:
from imblearn.under_sampling import RandomUnderSampler

In [86]:
# Undersample the majority class down to the size of the minority class.
# random_state fixes which majority rows are kept, so the balanced set is the same on every run
# (the value matches the seed used for the train/test split).
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=50)


In [90]:
# Apply the undersampler to the full feature matrix and target; the Counter output in the next cell
# shows the resulting class counts.
X_under, y_under = undersample.fit_resample(X, y)

In [91]:
from collections import Counter
Counter(y_under)

Counter({0: 19109, 1: 19109})

In [93]:
# Hold out 25% of the resampled rows for testing; random_state makes the split repeatable.
# NOTE(review): the merge with the price history yields several rows per customer (193002 rows for
# 16096 ids) that repeat the same customer-level features and churn label. A row-wise random split
# can therefore put rows of one customer in both train and test, which may inflate the test scores.
# Consider a group-aware split keyed on the customer id.
X_train, X_test, y_train, y_test = train_test_split(X_under, y_under, test_size=0.25, random_state=50)


In [94]:
y_train.value_counts()

0    14361
1    14302
Name: churn, dtype: int64

In [95]:


Counter(y_test)

Counter({1: 4807, 0: 4748})

In [96]:
Counter(y_train)

Counter({0: 14361, 1: 14302})

y_train contains very few churned customers: 14332 out of a total of 144751 (these counts appear to predate the undersampling step; the balanced counts are the ones shown in the cells above).
Because y_train is not a balanced dataset,
we need to standardize the features of the training set using StandardScaler
and then apply RandomUnderSampler,
which is a "way to balance the data by randomly selecting a subset of data for the targeted classes".

2. Modeling and Evaluation

In [98]:
# Create a function to fit ML algorithms
def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit an estimator once and evaluate it with cross-validated predictions.

    Parameters
    ----------
    algo : estimator instance exposing ``fit`` and ``score``.
    X_train, y_train : training features and labels.
    cv : cross-validation setting forwarded to ``cross_val_predict``.

    Returns
    -------
    tuple
        ``(train_pred, acs, cvs, rs, ps)`` where ``train_pred`` holds the
        cross-validated predictions, ``acs`` is the fitted model's ``score`` on
        the training data (percent, 2 decimals), and ``cvs``, ``rs`` and ``ps``
        are the accuracy, recall and precision of ``train_pred`` against
        ``y_train`` (percent, 3 decimals). Note that recall comes before
        precision in the returned tuple.
    """
    # Single fit on the whole training set; its score is measured on that same data
    fitted = algo.fit(X_train, y_train)
    acs = round(fitted.score(X_train, y_train) * 100, 2)

    # Out-of-fold predictions for every training row
    train_pred = model_selection.cross_val_predict(
        algo, X_train, y_train, cv=cv, n_jobs=-1
    )

    def as_percent(score_fn):
        # Score the cross-validated predictions and express the result as a percentage
        return round(score_fn(y_train, train_pred) * 100, 3)

    cvs = as_percent(metrics.accuracy_score)
    ps = as_percent(metrics.precision_score)
    rs = as_percent(metrics.recall_score)

    return train_pred, acs, cvs, rs, ps

In [100]:
# Stochastic Gradient Descent
# cv_folds drives both the cross-validation and the printed label, so the two always agree.
# random_state makes the estimator's results repeatable across runs.
cv_folds = 5
start_time = time.time()
train_pred_SGD, acs_SGD, cvs_SGD, rs_SGD, ps_SGD = fit_ml_algo(SGDClassifier(random_state=50), X_train, y_train, cv_folds)

log_time = (time.time()-start_time)
print("Accuracy: %s" %acs_SGD)
print("Accuracy CV %s-Fold: %s" % (cv_folds, cvs_SGD))
print("Recall Score: %s" %rs_SGD)
print("Precision score: %s" %ps_SGD)
print("Running Time: %s" %datetime.timedelta(seconds=log_time))

Accuracy: 50.1
Accuracy CV 10-Fold: 50.023
Recall Score: 40.001
Precision score: 49.9
Running Time: 0:00:06.771312


In [101]:
# Random Forest Classifier
# cv_folds drives both the cross-validation and the printed label, so the two always agree.
# random_state makes the estimator's results repeatable across runs.
cv_folds = 5
start_time = time.time()
train_pred_RF, acs_RF, cvs_RF, rs_RF, ps_RF = fit_ml_algo(RandomForestClassifier(random_state=50), X_train, y_train, cv_folds)

log_time = (time.time()-start_time)
print("Accuracy: %s" %acs_RF)
print("Accuracy CV %s-Fold: %s" % (cv_folds, cvs_RF))
print("Recall Score: %s" %rs_RF)
print("Precision score: %s" %ps_RF)
print("Running Time: %s" %datetime.timedelta(seconds=log_time))

Accuracy: 100.0
Accuracy CV 10-Fold: 99.219
Recall Score: 100.0
Precision score: 98.458
Running Time: 0:00:12.272411


In [103]:
# Gradient Boost Trees
# cv_folds drives both the cross-validation and the printed label, so the two always agree.
# random_state makes the estimator's results repeatable across runs.
cv_folds = 5
start_time = time.time()
train_pred_GBC, acs_GBC, cvs_GBC, rs_GBC, ps_GBC = fit_ml_algo(GradientBoostingClassifier(random_state=50), X_train, y_train, cv_folds)

log_time = (time.time()-start_time)
print("Accuracy: %s" %acs_GBC)
print("Accuracy CV %s-Fold: %s" % (cv_folds, cvs_GBC))
print("Recall Score: %s" %rs_GBC)
print("Precision score: %s" %ps_GBC)
print("Running Time: %s" %datetime.timedelta(seconds=log_time))

Accuracy: 71.47
Accuracy CV 10-Fold: 70.376
Recall Score: 68.788
Precision score: 70.956
Running Time: 0:00:20.175446


From the models above, we can choose the Random Forest Classifier as the most suitable one.

Let's test the data

In [105]:
# Final model: a random forest fitted on the whole training split.
# random_state pins the forest's internal randomness so re-running the notebook rebuilds the same model.
clf_forest = RandomForestClassifier(random_state=50)
clf_forest.fit(X_train,y_train)

RandomForestClassifier()

In [107]:
# Accuracy on the same rows the forest was fitted on; this measures fit to the training data,
# not performance on unseen data.
pred = clf_forest.predict(X_train)
print("Accuracy Score with y_train:", round(metrics.accuracy_score(y_train, pred)*100,3))

Accuracy Score with y_train: 100.0


In [108]:
# Accuracy on the held-out test split produced by train_test_split
pred_test = clf_forest.predict(X_test)
print("Accuracy Score with y_test:", round(metrics.accuracy_score(y_test, pred_test)*100,3))

Accuracy Score with y_test: 99.676


In [111]:
# Confusion Matrix of the test predictions.
# Rows correspond to the true class and columns to the predicted class (labels 0 then 1); this
# reading is consistent with the per-class recall values reported further down.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_test)                # Confusion Matrix

array([[4717,   31],
       [   0, 4807]], dtype=int64)

In [112]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_test)                  # Accuracy

0.9967556253270539

In [113]:
from sklearn.metrics import recall_score
recall_score(y_test, pred_test, average=None)      # Recall

array([0.99347094, 1.        ])

In [114]:
from sklearn.metrics import precision_score
precision_score(y_test, pred_test, average=None)   # Precision

array([1.        , 0.99359239])