In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from imblearn.over_sampling import SMOTENC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import warnings

# Shows plots in jupyter notebook
%matplotlib inline

# Notebook-wide configuration: plot theme, pandas display width, warning policy.
sns.set(color_codes=True)
# Use the fully qualified option name. The bare "max_columns" pattern also
# matches "styler.render.max_columns" on pandas versions that define it, and
# an ambiguous pattern makes set_option raise OptionError.
pd.set_option("display.max_columns", None)
# NOTE(review): this silences every warning for the rest of the notebook,
# including any convergence or deprecation warnings raised by the model
# cells; consider narrowing the filter.
warnings.filterwarnings("ignore")


In [2]:
# Load the working dataset from 'cleaned_data.csv' (relative path).
# The file carries a saved index column, 'Unnamed: 0', which a later cell drops.
df=pd.read_csv('cleaned_data.csv')

In [3]:
# Parse the four contract date columns (stored as 'YYYY-MM-DD' text) into datetimes.
for date_column in ("date_activ", "date_end", "date_modif_prod", "date_renewal"):
    df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d')

In [4]:
# Contract length in whole months: date_end minus date_activ, truncated toward zero.
# A calendar month is not a fixed duration, so recent pandas versions refuse to
# divide by np.timedelta64(1, 'M'). Spell out the average Gregorian month
# (365.2425 / 12 = 30.436875 days) as an explicit Timedelta instead.
AVERAGE_MONTH = pd.Timedelta(days=30.436875)
df['months_active'] = ((df['date_end'] - df['date_activ']) / AVERAGE_MONTH).astype(int)

In [5]:
# Give the numbered price columns period-based names
# (price1 -> off_peak, price2 -> peak, price3 -> mid_peak).
price_column_names = {
    "average_price1": "average_off_peak",
    "average_price2": "average_peak",
    "average_price3": "average_mid_peak",
    "average_6m_price1": "average_6m_off_peak",
    "average_6m_price2": "average_6m_peak",
    "average_6m_price3": "average_6m_mid_peak",
    "average_3m_price1": "average_3m_off_peak",
    "average_3m_price2": "average_3m_peak",
    "average_3m_price3": "average_3m_mid_peak",
}
# index=str additionally converts every row label to its string form.
df = df.rename(index=str, columns=price_column_names)

In [6]:
# Remove the saved index column, the raw date columns (months_active has already
# been derived from them) and the variable-price averages and *_var columns.
date_columns = ['date_activ', 'date_end', 'date_modif_prod', 'date_renewal']
price_columns = [
    'average_off_peak', 'average_peak', 'average_mid_peak',
    'average_off_peak_var', 'average_peak_var', 'average_mid_peak_var',
    'average_6m_off_peak', 'average_6m_peak', 'average_6m_mid_peak',
    'average_6m_off_peak_var', 'average_6m_peak_var', 'average_6m_mid_peak_var',
    'average_3m_off_peak', 'average_3m_peak', 'average_3m_mid_peak',
    'average_3m_off_peak_var', 'average_3m_peak_var', 'average_3m_mid_peak_var',
]
df = df.drop(columns=['Unnamed: 0', *date_columns, *price_columns])

In [7]:
# Keep only rows outside the excluded category codes and discount values,
# and below the consumption cut-offs.
excluded_origins = [
    'ewxeelcelemmiwuafmddpobolfuxioce',
    'usapbepcfoloekilkwsdiboslwaxobdp',
]
excluded_channels = [
    'epumfxlbckeskwekxbiuasklxalciiuu',
    'fixdbufsefwooaasfcxdxadsiekoceaa',
]
excluded_discounts = [17, 19, 26]

rows_to_keep = (
    ~df['origin_up'].isin(excluded_origins)
    & ~df['channel_sales'].isin(excluded_channels)
    & ~df['forecast_discount_energy'].isin(excluded_discounts)
    & (df['cons_12m'] < 1856956)
    & (df['cons_gas_12m'] < 465856)
    & (df['cons_last_month'] < 100576)
)
df = df[rows_to_keep]

In [8]:
# Drop the 'has_both' column from the feature frame.
df = df.drop(columns=['has_both'])

In [9]:
# Display the frame as it stands after the column drops and row filters above.
df

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,forecast_price_pow_off_peak,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,has_electricity,average_off_peak_fix,average_peak_fix,average_mid_peak_fix,average_6m_off_peak_fix,average_6m_peak_fix,average_6m_mid_peak_fix,average_3m_off_peak_fix,average_3m_peak_fix,average_3m_mid_peak_fix,churn,months_active
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,0.00,0,0.0,1.78,0.114481,0.098142,40.606701,True,0.00,25.44,25.44,2,678.99,3,lxidpiddsbxsbosboudacockeimpuepw,43.648,False,40.942265,22.352010,14.901340,41.318559,20.364442,13.576296,42.497907,12.218665,8.145777,1,36
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,189.95,0,0.0,16.27,0.145711,0.000000,44.311378,False,0.00,16.38,16.38,1,18.89,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.800,True,44.311375,0.000000,0.000000,44.355820,0.000000,0.000000,44.444710,0.000000,0.000000,0,84
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,47.96,0,0.0,38.72,0.165794,0.087899,44.311378,False,0.00,28.60,28.60,1,6.60,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.856,True,44.385450,0.000000,0.000000,44.444710,0.000000,0.000000,44.444710,0.000000,0.000000,0,72
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,240.04,0,0.0,19.83,0.146694,0.000000,44.311378,False,0.00,30.22,30.22,1,25.46,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.200,True,44.400265,0.000000,0.000000,44.444710,0.000000,0.000000,44.444710,0.000000,0.000000,0,72
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,445.75,526,0.0,131.73,0.116900,0.100015,40.606701,False,52.32,44.91,44.91,1,47.98,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,19.800,True,40.688156,24.412893,16.275263,40.728885,24.437330,16.291555,40.728885,24.437330,16.291555,0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14600,18463073fb097fc0ac5d3e040f356987,foosdfpfkusacimwkcsosbicdxkicaua,32270,47940,0,4648.01,0,0.0,18.57,0.138305,0.000000,44.311378,True,0.00,27.88,27.88,2,381.77,4,lxidpiddsbxsbosboudacockeimpuepw,15.000,True,44.370635,0.000000,0.000000,44.444710,0.000000,0.000000,44.444710,0.000000,0.000000,0,47
14601,d0a6f71671571ed83b2645d23af6de00,foosdfpfkusacimwkcsosbicdxkicaua,7223,0,181,631.69,181,0.0,144.03,0.100167,0.091892,58.995952,False,15.94,0.00,0.00,1,90.34,3,lxidpiddsbxsbosboudacockeimpuepw,6.000,True,59.015674,36.393379,8.345418,59.094572,36.442033,8.356575,59.173468,36.490689,8.367731,1,48
14602,10e6828ddd62cbcf687cb74928c4c2d2,foosdfpfkusacimwkcsosbicdxkicaua,1844,0,179,190.39,179,0.0,129.60,0.116900,0.100015,40.606701,False,18.05,39.84,39.84,1,20.38,4,lxidpiddsbxsbosboudacockeimpuepw,15.935,True,40.701732,24.421038,16.280694,40.728885,24.437330,16.291555,40.728885,24.437330,16.291555,1,47
14603,1cf20fd6206d7678d5bcafd28c53b4db,foosdfpfkusacimwkcsosbicdxkicaua,131,0,0,19.34,0,0.0,7.18,0.145711,0.000000,44.311378,False,0.00,13.08,13.08,1,0.96,3,lxidpiddsbxsbosboudacockeimpuepw,11.000,True,44.311375,0.000000,0.000000,44.355820,0.000000,0.000000,44.444710,0.000000,0.000000,0,48


In [10]:
# Category levels are fixed up front so that every call to preprocess() emits
# the same indicator columns in the same order, whichever rows it is given.
CHANNEL_SALES_CATEGORIES = (
    'foosdfpfkusacimwkcsosbicdxkicaua',
    'MISSING',
    'lmkebamcaaclubfxadlmueccxoimlema',
    'usilxuppasemubllopkaafesmlibmsdf',
    'ewpakwlliwisiwduibdlfmalxowmwpci',
    'sddiedcslfslkckwlfkdpoeeailfpeds',
)
ORIGIN_UP_CATEGORIES = (
    'lxidpiddsbxsbosboudacockeimpuepw',
    'kamkkxfxxuwbdslkwifmmcsiusiuosws',
    'ldkssxwpmemidmecebumciepifcamkci',
    'MISSING',
)
# Columns that are carried through unchanged but renamed with an ' Edited'
# suffix and moved to the end of the frame.
NUMERIC_COLUMNS = (
    'cons_12m',
    'cons_gas_12m',
    'cons_last_month',
    'forecast_cons_12m',
    'forecast_cons_year',
    'forecast_discount_energy',
    'forecast_meter_rent_12m',
    'forecast_price_energy_off_peak',
    'forecast_price_energy_peak',
    'forecast_price_pow_off_peak',
    'has_gas',
    'imp_cons',
    'margin_gross_pow_ele',
    'margin_net_pow_ele',
    'nb_prod_act',
    'net_margin',
    'num_years_antig',
    'pow_max',
    'has_electricity',
    'average_off_peak_fix',
    'average_peak_fix',
    'average_mid_peak_fix',
    'average_6m_off_peak_fix',
    'average_6m_peak_fix',
    'average_6m_mid_peak_fix',
    'average_3m_off_peak_fix',
    'average_3m_peak_fix',
    'average_3m_mid_peak_fix',
    'months_active',
)


def _indicator_columns(values, prefix, categories):
    """Return one 0/1 int64 column per category, named '<prefix>: <category>'."""
    return pd.DataFrame(
        {
            f'{prefix}: {category}': (values == category).astype('int64')
            for category in categories
        },
        index=values.index,
    )


def preprocess(X_df,
               channel_categories=CHANNEL_SALES_CATEGORIES,
               origin_categories=ORIGIN_UP_CATEGORIES):
    """One-hot encode the categorical columns and suffix the numeric ones.

    The input frame is not modified. In the returned frame:

    * 'channel_sales' and 'origin_up' are replaced by 0/1 indicator columns
      named 'channel_sales: <level>' / 'origin_up: <level>';
    * every column in NUMERIC_COLUMNS is renamed to '<name> Edited' and placed
      after the indicator columns, in NUMERIC_COLUMNS order;
    * any other column (e.g. 'id', 'churn') is kept as-is at the front.

    The indicator columns are generated from the supplied category lists, not
    from the values present in ``X_df``. Two frames (for example a train and a
    test split) therefore always get identical columns in identical order.
    Deriving the levels from the data with ``unique()`` does not guarantee
    that, and a mismatch would misalign features in any positional consumer
    such as a scaler or a model.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Must contain 'channel_sales', 'origin_up' and every column listed in
        NUMERIC_COLUMNS.
    channel_categories, origin_categories : iterable of str, optional
        Levels to encode, in output order. A row whose value is not among them
        gets 0 in every indicator column of that feature.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If a required column is missing from ``X_df``.
    """
    df = X_df.copy()

    encodings = (
        ('channel_sales', channel_categories),
        ('origin_up', origin_categories),
    )
    for source, categories in encodings:
        indicators = _indicator_columns(df[source], source, categories)
        # Drop the raw column and append its indicators at the end of the frame.
        df = pd.concat([df.drop(columns=source), indicators], axis=1)

    numeric_columns = list(NUMERIC_COLUMNS)
    edited = df[numeric_columns].add_suffix(' Edited')
    return pd.concat([df.drop(columns=numeric_columns), edited], axis=1)

In [11]:
# Encode the full frame and strip the identifier and target columns.
# NOTE(review): both names are rebound further down (`X` from the raw `df`,
# `temp` from the probability array), so nothing downstream appears to use
# these values.
temp = preprocess(df)
X = temp.drop(columns=['id', 'churn'])

In [12]:
# Preview the first rows of `df`; preprocess() works on a copy, so `df` still holds the raw categorical columns.
df.head()

Unnamed: 0,id,channel_sales,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,forecast_price_pow_off_peak,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,has_electricity,average_off_peak_fix,average_peak_fix,average_mid_peak_fix,average_6m_off_peak_fix,average_6m_peak_fix,average_6m_mid_peak_fix,average_3m_off_peak_fix,average_3m_peak_fix,average_3m_mid_peak_fix,churn,months_active
0,24011ae4ebbe3035111d65fa7c15bc57,foosdfpfkusacimwkcsosbicdxkicaua,0,54946,0,0.0,0,0.0,1.78,0.114481,0.098142,40.606701,True,0.0,25.44,25.44,2,678.99,3,lxidpiddsbxsbosboudacockeimpuepw,43.648,False,40.942265,22.35201,14.90134,41.318559,20.364442,13.576296,42.497907,12.218665,8.145777,1,36
1,d29c2c54acc38ff3c0614d0a653813dd,MISSING,4660,0,0,189.95,0,0.0,16.27,0.145711,0.0,44.311378,False,0.0,16.38,16.38,1,18.89,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.8,True,44.311375,0.0,0.0,44.35582,0.0,0.0,44.44471,0.0,0.0,0,84
2,764c75f661154dac3a6c254cd082ea7d,foosdfpfkusacimwkcsosbicdxkicaua,544,0,0,47.96,0,0.0,38.72,0.165794,0.087899,44.311378,False,0.0,28.6,28.6,1,6.6,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.856,True,44.38545,0.0,0.0,44.44471,0.0,0.0,44.44471,0.0,0.0,0,72
3,bba03439a292a1e166f80264c16191cb,lmkebamcaaclubfxadlmueccxoimlema,1584,0,0,240.04,0,0.0,19.83,0.146694,0.0,44.311378,False,0.0,30.22,30.22,1,25.46,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.2,True,44.400265,0.0,0.0,44.44471,0.0,0.0,44.44471,0.0,0.0,0,72
4,149d57cf92fc41cf94415803a877cb4b,MISSING,4425,0,526,445.75,526,0.0,131.73,0.1169,0.100015,40.606701,False,52.32,44.91,44.91,1,47.98,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,19.8,True,40.688156,24.412893,16.275263,40.728885,24.43733,16.291555,40.728885,24.43733,16.291555,0,73


In [13]:
# Feature matrix: everything except the customer id and the target. Target: churn flag.
X = df.drop(columns=['id', 'churn'])
y = df['churn']

In [14]:
# Hold out 20% of the rows for testing.
# stratify=y keeps the churn rate the same in both splits (churners are a small
# minority of the rows). The fixed random_state makes the split, and every
# metric computed downstream, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

In [15]:
from imblearn.combine import SMOTETomek, SMOTEENN

In [16]:
# Positional indices of the categorical features within X, as SMOTENC expects.
categorical_columns = ['channel_sales', 'has_gas', 'has_electricity', 'origin_up']
feature_order = X.columns.tolist()
categorical_indices = [feature_order.index(name) for name in categorical_columns]


In [17]:
# Oversample the training split only; the test split keeps its original class
# balance. random_state pins the synthetic samples so that re-running the
# notebook reproduces the same training set.
smote = SMOTENC(categorical_features=categorical_indices, random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

In [18]:
# Display the training features returned by fit_resample in the previous cell.
x_train

Unnamed: 0,channel_sales,cons_12m,cons_gas_12m,cons_last_month,forecast_cons_12m,forecast_cons_year,forecast_discount_energy,forecast_meter_rent_12m,forecast_price_energy_off_peak,forecast_price_energy_peak,forecast_price_pow_off_peak,has_gas,imp_cons,margin_gross_pow_ele,margin_net_pow_ele,nb_prod_act,net_margin,num_years_antig,origin_up,pow_max,has_electricity,average_off_peak_fix,average_peak_fix,average_mid_peak_fix,average_6m_off_peak_fix,average_6m_peak_fix,average_6m_mid_peak_fix,average_3m_off_peak_fix,average_3m_peak_fix,average_3m_mid_peak_fix,months_active
0,foosdfpfkusacimwkcsosbicdxkicaua,14365,0,2501,2115.490000,2501,0.0,0.000000,0.142819,0.000000,44.311378,False,361.600000,27.880000,27.880000,1,176.780000,4,lxidpiddsbxsbosboudacockeimpuepw,15.000000,True,44.385450,0.000000,0.000000,44.444710,0.000000,0.000000,44.444710,0.000000,0.000000,48
1,MISSING,45959,321940,4150,593.300000,926,0.0,131.430000,0.114604,0.098262,40.606701,False,87.900000,19.140000,19.140000,1,68.760000,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,17.000000,True,40.620275,24.372163,16.248109,40.674581,24.404746,16.269831,40.728885,24.437330,16.291555,84
2,usilxuppasemubllopkaafesmlibmsdf,13039,33478,0,1735.040000,0,0.0,16.940000,0.143166,0.000000,44.311378,True,0.000000,7.600000,7.600000,3,137.560000,4,kamkkxfxxuwbdslkwifmmcsiusiuosws,11.400000,True,44.444710,0.000000,0.000000,44.444710,0.000000,0.000000,44.444710,0.000000,0.000000,60
3,MISSING,8976,0,0,1323.150000,0,0.0,15.230000,0.144902,0.000000,44.311378,False,0.000000,12.820000,12.820000,1,133.200000,6,kamkkxfxxuwbdslkwifmmcsiusiuosws,13.200000,True,44.311375,0.000000,0.000000,44.355820,0.000000,0.000000,44.444710,0.000000,0.000000,84
4,foosdfpfkusacimwkcsosbicdxkicaua,0,0,0,0.000000,0,0.0,19.740000,0.146688,0.000000,44.311378,False,0.000000,37.560000,37.560000,1,0.000000,5,lxidpiddsbxsbosboudacockeimpuepw,15.000000,False,44.266930,0.000000,0.000000,44.266930,0.000000,0.000000,44.266930,0.000000,0.000000,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19897,foosdfpfkusacimwkcsosbicdxkicaua,1130168,0,80948,7542.884340,165,0.0,27.836033,0.143555,0.008370,44.006644,False,16.133816,6.421450,6.421450,1,686.651905,4,lxidpiddsbxsbosboudacockeimpuepw,14.416752,True,43.972552,2.008122,1.338748,43.975902,2.010133,1.340088,43.975902,2.010133,1.340088,59
19898,foosdfpfkusacimwkcsosbicdxkicaua,6610,0,0,952.002016,0,0.0,0.919193,0.147881,0.011761,44.311378,False,0.000000,23.268127,23.268127,1,95.088643,4,lxidpiddsbxsbosboudacockeimpuepw,13.287141,True,44.353852,0.000000,0.000000,44.440774,0.000000,0.000000,44.444710,0.000000,0.000000,52
19899,MISSING,2026,0,0,224.939943,0,0.0,26.969457,0.157873,0.057514,45.651877,False,0.000000,29.698449,29.698449,1,30.443849,6,lxidpiddsbxsbosboudacockeimpuepw,13.856000,True,44.447199,0.000000,0.000000,44.569203,0.000000,0.000000,45.057219,0.000000,0.000000,84
19900,foosdfpfkusacimwkcsosbicdxkicaua,11890,0,805,737.095353,603,0.0,129.263230,0.113771,0.098655,40.892057,False,55.567415,39.118787,39.118787,1,97.680886,3,lxidpiddsbxsbosboudacockeimpuepw,16.697082,True,40.622732,24.383177,16.263402,40.679493,24.426774,16.300416,40.871996,24.580441,16.434666,51


In [19]:
# Encode both splits, then force the test frame onto the training frame's exact
# column set and order. The scaler and the models consume columns positionally,
# so any difference in indicator columns between the two calls would otherwise
# misalign features. Indicator columns absent from the test split are filled
# with 0.
x_train = preprocess(x_train)
x_test = preprocess(x_test).reindex(columns=x_train.columns, fill_value=0)

In [20]:
# Standardise features using statistics learned from the training matrix only,
# then apply the same fitted transformation to the test matrix.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [46]:
# Baseline comparison of six classifiers with default hyperparameters.
# The tree-based learners are seeded explicitly so that repeated runs print
# the same metrics; the remaining estimators are left at their defaults.
models = {
    'LR': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'DT': DecisionTreeClassifier(random_state=42),
    'RF': RandomForestClassifier(random_state=42),
    'XGB': XGBClassifier(random_state=42)
}

# Fit each model on the resampled training data and score it on the untouched
# test split. All metrics are for the positive class (churn == 1).
for name, model in models.items():
    print(f'Model Name: {name}')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'precision: {precision_score(y_test, y_pred)}')
    print(f'f1_score: {f1_score(y_test, y_pred)}')
    print('-'*30)

Model Name: LR
Accuracy: 0.8030357788218287
Recall: 0.23076923076923078
precision: 0.16879795396419436
f1_score: 0.19497784342688332
------------------------------
Model Name: KNN
Accuracy: 0.748464040477051
Recall: 0.24475524475524477
precision: 0.12727272727272726
f1_score: 0.1674641148325359
------------------------------
Model Name: SVM
Accuracy: 0.8218286953379111
Recall: 0.17482517482517482
precision: 0.16286644951140064
f1_score: 0.16863406408094433
------------------------------
Model Name: DT
Accuracy: 0.714853632092519
Recall: 0.27972027972027974
precision: 0.12066365007541478
f1_score: 0.16859852476290832
------------------------------
Model Name: RF
Accuracy: 0.8915793277918324
Recall: 0.05944055944055944
precision: 0.3541666666666667
f1_score: 0.10179640718562875
------------------------------
Model Name: XGB
Accuracy: 0.8800144560896278
Recall: 0.1258741258741259
precision: 0.3050847457627119
f1_score: 0.17821782178217824
------------------------------


In [22]:
# Fresh XGBoost classifier with default hyperparameters; rebinds `model`, which the comparison loop also used as its loop variable.
model=XGBClassifier()

In [23]:
# Fit on the resampled, preprocessed and scaled training matrix.
model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [24]:
# Plain alias of the held-out labels; y_test is not resampled anywhere in this notebook.
# NOTE(review): the `ros_` prefix presumably refers to random over-sampling, but the resampler used above is SMOTENC; consider renaming.
ros_y_test = y_test

In [25]:
# Predicted probability for each test row, taken from column 1 of predict_proba (the second class).
ros_y_probs=model.predict_proba(x_test)[:, 1]

In [26]:
from sklearn.metrics import classification_report

# Flag a row as churn when its predicted probability exceeds 0.60, then
# summarise per-class precision, recall and F1 against the held-out labels.
flagged_as_churn = ros_y_probs > 0.60
print(classification_report(ros_y_test, flagged_as_churn))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      2481
           1       0.30      0.07      0.11       286

    accuracy                           0.89      2767
   macro avg       0.60      0.52      0.52      2767
weighted avg       0.84      0.89      0.85      2767



In [27]:
# Display the array of predicted probabilities for the test rows.
ros_y_probs

array([0.0552131 , 0.04046421, 0.0265073 , ..., 0.34910297, 0.059641  ,
       0.06910326], dtype=float32)

In [33]:
# Wrap the probability array in a single-column DataFrame.
# NOTE(review): this rebinds `temp`, which an earlier cell used for the preprocessed frame.
temp=pd.DataFrame(ros_y_probs)


In [45]:
# Share of test rows whose predicted probability is above (True) or not above (False) 0.4.
# NOTE(review): the classification report cell thresholds at 0.60, not 0.4; confirm which cut-off is intended.
(temp>0.4).value_counts(normalize=True)

False    0.926635
True     0.073365
dtype: float64