In [488]:
from sklearn import preprocessing

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from IPython.display import Image  

import pydotplus
import io

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, export_graphviz

from imblearn.over_sampling import SMOTE 


import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

%config InlineBackend.figure_format = 'svg'


In [489]:
def tree_graph_to_png(tree, feature_names, png_file_to_save):
    """Render a decision tree estimator to a PNG image on disk.

    The tree is exported to Graphviz DOT text via ``export_graphviz``
    (filled nodes, labelled with ``feature_names``), parsed by pydotplus,
    and written to the path given by ``png_file_to_save``.

    Parameters
    ----------
    tree : the decision tree estimator passed to ``export_graphviz``.
    feature_names : names used to label the split features.
    png_file_to_save : destination path of the PNG file.
    """
    # out_file=None makes export_graphviz return the DOT source as a string.
    dot_source = export_graphviz(
        tree, out_file=None, feature_names=feature_names, filled=True
    )
    pydotplus.graph_from_dot_data(dot_source).write_png(png_file_to_save)

In [490]:
# Columns that are removed from the frame in the following cell.
columns_to_drop = [
    'customerID',
    'gender',
    'Partner',
    'Dependents',
    'StreamingTV',
    'StreamingMovies',
]

# Load the raw data set and preview its last rows.
df = pd.read_csv('Data Set.csv')
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [491]:
# Remove the columns listed in columns_to_drop from df (in place).
df.drop(columns=columns_to_drop, inplace=True)

The `TotalCharges` column contains some blank values (stored as a single space `' '`) rather than numbers: <br><br>
    1. Replace ' ' with NaN<br>
    2. Drop the rows that contain NaN values from the DataFrame<br>

In [492]:
# Some TotalCharges entries are blank strings rather than numbers.
# Coerce the column to numeric so that *any* non-numeric entry (a single space,
# an empty string, several spaces, ...) becomes NaN, instead of matching only
# the exact string ' ' and letting astype(float) raise on every other variant.
# The trailing astype(float) keeps the dtype float64 even if all values are
# whole numbers, which the later float64-based column detection relies on.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').astype(float)

# Drop every row that has a missing value in any column.
df.dropna(inplace=True)

# Per-column count of remaining nulls; all zeros are expected after dropna().
df.isnull().sum()

SeniorCitizen       0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [493]:

# Turn the 1/0 SeniorCitizen flag into "Yes"/"No" labels on an object column,
# so it has the same form as the other Yes/No columns that the later
# binary-categorical detection (object dtype, two distinct values) looks for.
df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)

for flag, label in ((1, "Yes"), (0, "No")):
    df.loc[df['SeniorCitizen'] == flag, 'SeniorCitizen'] = label


In [494]:
# Cast tenure to float so that the float64-based numerical column detection
# in the next cell picks it up alongside MonthlyCharges and TotalCharges.
df['tenure'] = df['tenure'].astype(float)

In [495]:



# Partition the columns by how they have to be encoded.

# Object columns with exactly two distinct values (the Yes/No flags).
binary_categorical_columns = [
    c for c in df.columns
    if df[c].dtype.name == 'object' and df[c].nunique(dropna=False) == 2
]

# Object columns with 3 to 5 distinct values; these are one-hot encoded below.
non_binary_categorical_columns = [
    c for c in df.columns
    if df[c].dtype.name == 'object' and 2 < df[c].nunique(dropna=False) < 6
]

# Continuous features are recognised purely by their float64 dtype.
numerical_columns = [c for c in df.columns if df[c].dtype.name == 'float64']

# df is rebuilt below from these three lists only, so a column that matches
# none of them would be silently dropped. The original set difference was a
# bare mid-cell expression whose result was never shown; fail loudly instead.
unclassified_columns = set(df.columns) - set(
    binary_categorical_columns + non_binary_categorical_columns + numerical_columns
)
assert not unclassified_columns, (
    f"Columns not covered by any encoding group: {sorted(unclassified_columns)}"
)

# Encode the Yes/No flags as 1/0 integers.
for c in binary_categorical_columns:
    df.loc[df[c] == 'Yes', c] = 1
    df.loc[df[c] == 'No', c] = 0
    df[c] = df[c].astype(int)

# One-hot encode the multi-valued categorical columns.
non_binary_categorical_columns_dummies = pd.get_dummies(df[non_binary_categorical_columns])

# Reassemble the frame: binary flags, numeric features, then the dummies.
df = pd.concat(
    [df[binary_categorical_columns], df[numerical_columns], non_binary_categorical_columns_dummies],
    axis=1
)

In [496]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 32 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   SeniorCitizen                            7032 non-null   int64  
 1   PhoneService                             7032 non-null   int64  
 2   PaperlessBilling                         7032 non-null   int64  
 3   Churn                                    7032 non-null   int64  
 4   tenure                                   7032 non-null   float64
 5   MonthlyCharges                           7032 non-null   float64
 6   TotalCharges                             7032 non-null   float64
 7   MultipleLines_No                         7032 non-null   uint8  
 8   MultipleLines_No phone service           7032 non-null   uint8  
 9   MultipleLines_Yes                        7032 non-null   uint8  
 10  InternetService_DSL                      7032 no

# Min-max scaling

In [497]:
# Collect the columns whose maximum exceeds 1, i.e. those that are not
# already confined to the 0/1 range of the encoded flags and dummies.
to_scale = []
for col in df.columns:
    if df[col].max() > 1:
        to_scale.append(col)

print(to_scale)

['tenure', 'MonthlyCharges', 'TotalCharges']


In [498]:

# Fit a min-max scaler on the selected columns and transform them in one step.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[to_scale])
print(scaled)

print('+ + + + + +')

# Round-trip through the inverse transform to recover the original values.
unscaled = scaler.inverse_transform(scaled)
print(unscaled)

# Wrap both arrays as DataFrames; no index is passed, so each one gets a
# default 0..n-1 index rather than df's current index.
df_cols_scaled = pd.DataFrame(scaled, columns=to_scale)
df_cols_unscaled = pd.DataFrame(unscaled, columns=to_scale)



[[0.         0.11542289 0.0012751 ]
 [0.46478873 0.38507463 0.21586661]
 [0.01408451 0.35422886 0.01031041]
 ...
 [0.14084507 0.11293532 0.03780868]
 [0.04225352 0.55870647 0.03321025]
 [0.91549296 0.86965174 0.78764136]]
+ + + + + +
[[1.0000e+00 2.9850e+01 2.9850e+01]
 [3.4000e+01 5.6950e+01 1.8895e+03]
 [2.0000e+00 5.3850e+01 1.0815e+02]
 ...
 [1.1000e+01 2.9600e+01 3.4645e+02]
 [4.0000e+00 7.4400e+01 3.0660e+02]
 [6.6000e+01 1.0565e+02 6.8445e+03]]


In [499]:
# Replace the gapped index left behind by dropna() with a fresh 0..n-1 index,
# discarding the old index values (drop=True).
df.reset_index(drop=True, inplace=True)

# The two loops below would write the scaled (or unscaled) columns back into df.
# Both are disabled, so df keeps the values it already has.
# for c in df_cols_scaled:
#     df[c] = df_cols_scaled[c]

# for c in df_cols_unscaled:
#     df[c] = df_cols_unscaled[c]




In [500]:
# Inspect the frame after the index reset: index type, columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 32 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   SeniorCitizen                            7032 non-null   int64  
 1   PhoneService                             7032 non-null   int64  
 2   PaperlessBilling                         7032 non-null   int64  
 3   Churn                                    7032 non-null   int64  
 4   tenure                                   7032 non-null   float64
 5   MonthlyCharges                           7032 non-null   float64
 6   TotalCharges                             7032 non-null   float64
 7   MultipleLines_No                         7032 non-null   uint8  
 8   MultipleLines_No phone service           7032 non-null   uint8  
 9   MultipleLines_Yes                        7032 non-null   uint8  
 10  InternetService_DSL                      7032 no

# Cleanup finished, let's grow some trees!


In [501]:
# df.tail(30)
# Schema check right before the target column is separated from the features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 32 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   SeniorCitizen                            7032 non-null   int64  
 1   PhoneService                             7032 non-null   int64  
 2   PaperlessBilling                         7032 non-null   int64  
 3   Churn                                    7032 non-null   int64  
 4   tenure                                   7032 non-null   float64
 5   MonthlyCharges                           7032 non-null   float64
 6   TotalCharges                             7032 non-null   float64
 7   MultipleLines_No                         7032 non-null   uint8  
 8   MultipleLines_No phone service           7032 non-null   uint8  
 9   MultipleLines_Yes                        7032 non-null   uint8  
 10  InternetService_DSL                      7032 no

In [502]:
# Separate the target: pop() removes 'Churn' from df in place and returns it,
# leaving df with the feature columns only.
y = df.pop('Churn')


In [503]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible. The features are passed as a plain array (df.values).
(X_train, X_holdout, y_train, y_holdout) = train_test_split(
    df.values,
    y,
    random_state=17,
    test_size=0.33,
)

In [504]:
# Resample the training split with SMOTE (seeded for reproducibility).
# Only X_train / y_train are passed in, so the holdout set is left untouched.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)

In [505]:
# tree = DecisionTreeClassifier(random_state=17, max_depth=12, max_features=24, max_leaf_nodes=199)
# tree.fit(X_train, y_train)


# tree_predictions = tree.predict(X_holdout)

# print(classification_report(y_holdout, tree_predictions))
# print('Accuracy Score: ', accuracy_score(y_holdout, tree_predictions))


In [506]:
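# NOTE(review): `tree_grid` is used by a later cell (`tree_grid.predict(X_holdout)`),
# but in this notebook it is only ever assigned inside this commented-out cell.
# Uncomment and run this cell first; on a fresh kernel the later cell raises NameError.
# tree = DecisionTreeClassifier(random_state=17)

# tree_params = {"max_depth": range(1, 14), "max_features": range(3, 25), "max_leaf_nodes": range(2, 200)}

# tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1, verbose=True)

# tree_grid.fit(X_res, y_res)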


In [507]:
# Grid-search a decision tree on the SMOTE-resampled training data.
# This cell must be live: the later cells call tree_grid_unscaled.predict(...)
# and read tree_grid_unscaled.best_estimator_, so leaving it commented out
# makes a fresh "Restart & Run All" fail with NameError.
# Cost: 13 * 22 * 198 = 56,628 candidates x 5 folds = 283,140 fits.
tree_unscaled = DecisionTreeClassifier(random_state=17)

tree_params = {"max_depth": range(1, 14), "max_features": range(3, 25), "max_leaf_nodes": range(2, 200)}

# cv=5 folds, all available cores (n_jobs=-1), progress messages enabled.
tree_grid_unscaled = GridSearchCV(tree_unscaled, tree_params, cv=5, n_jobs=-1, verbose=True)

tree_grid_unscaled.fit(X_res, y_res)


Fitting 5 folds for each of 56628 candidates, totalling 283140 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=17), n_jobs=-1,
             param_grid={'max_depth': range(1, 14),
                         'max_features': range(3, 25),
                         'max_leaf_nodes': range(2, 200)},
             verbose=True)

In [514]:
# NOTE(review): in the visible notebook `tree_grid` is only assigned inside a
# commented-out cell; confirm it is defined before running this cell.
# Evaluate the grid-searched tree on the holdout set.
tree_predictions = tree_grid.predict(X_holdout)

# Best hyper-parameters followed by the per-class report, one per line.
print(
    tree_grid.best_params_,
    classification_report(y_holdout, tree_predictions),
    sep='\n',
)
print('Accuracy Score: ', accuracy_score(y_holdout, tree_predictions))




{'max_depth': 12, 'max_features': 24, 'max_leaf_nodes': 199}
              precision    recall  f1-score   support

           0       0.85      0.80      0.82      1713
           1       0.52      0.61      0.56       608

    accuracy                           0.75      2321
   macro avg       0.68      0.70      0.69      2321
weighted avg       0.76      0.75      0.75      2321

Accuracy Score:  0.7479534683326152


In [523]:
# Evaluate the grid-searched tree on the holdout set.
tree_predictions_unscaled = tree_grid_unscaled.predict(X_holdout)

print(tree_grid_unscaled.best_params_)
print(classification_report(y_holdout, tree_predictions_unscaled))
print('Accuracy Score: ', accuracy_score(y_holdout, tree_predictions_unscaled))
# Print both scores explicitly: a notebook cell only displays its *last* bare
# expression, so the recall value was previously computed and thrown away.
print('Recall Score: ', recall_score(y_holdout, tree_predictions_unscaled))
print('F1 Score: ', f1_score(y_holdout, tree_predictions_unscaled))





{'max_depth': 9, 'max_features': 18, 'max_leaf_nodes': 50}
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      1713
           1       0.58      0.60      0.59       608

    accuracy                           0.78      2321
   macro avg       0.72      0.72      0.72      2321
weighted avg       0.79      0.78      0.78      2321

Accuracy Score:  0.7828522188711762


0.5922330097087379

In [515]:
# Save a picture of the best tree found by the grid search; df now holds the
# feature columns only, so its column labels are used as the feature names.
tree_graph_to_png(
    tree_grid_unscaled.best_estimator_,
    df.columns,
    "model2.0_unscaled.png",
)


In [518]:
# Single-row frame: one column per feature, holding the best tree's importances.
df_importances = pd.DataFrame(
    data=tree_grid_unscaled.best_estimator_.feature_importances_.reshape(1, -1),
    columns=df.columns.values,
)

# With only one row, the column-wise max is simply that row shown per feature.
df_importances.max()


SeniorCitizen                              0.000000
PhoneService                               0.000000
PaperlessBilling                           0.061262
tenure                                     0.056973
MonthlyCharges                             0.049085
TotalCharges                               0.011356
MultipleLines_No                           0.000000
MultipleLines_No phone service             0.004413
MultipleLines_Yes                          0.000000
InternetService_DSL                        0.003168
InternetService_Fiber optic                0.000000
InternetService_No                         0.000000
OnlineSecurity_No                          0.007122
OnlineSecurity_No internet service         0.000000
OnlineSecurity_Yes                         0.045369
OnlineBackup_No                            0.005059
OnlineBackup_No internet service           0.000000
OnlineBackup_Yes                           0.000000
DeviceProtection_No                        0.002074
DeviceProtec