In [349]:
from sklearn import preprocessing

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from IPython.display import Image  

import pydotplus
import io

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz




import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

%config InlineBackend.figure_format = 'svg'


In [159]:
def tree_graph_to_png(tree, feature_names, png_file_to_save):
    """Render a fitted decision tree to a PNG image.

    The tree is exported to Graphviz DOT text with ``export_graphviz`` and the
    DOT text is then written out as a PNG through ``pydotplus``.

    Parameters
    ----------
    tree : decision tree estimator
        Forwarded unchanged to ``export_graphviz``.
    feature_names : iterable of str or None
        Labels for the features used in the split nodes. Any iterable is
        accepted (list, NumPy array, ``pandas.Index`` ...); ``None`` is
        forwarded unchanged.
    png_file_to_save : str
        Path of the PNG file to write.
    """
    # Normalise to a plain list: callers in this notebook pass ``df.columns``
    # (a pandas Index), which strictly validating scikit-learn releases may
    # reject even though it holds the right labels.
    if feature_names is not None:
        feature_names = list(feature_names)

    tree_str = export_graphviz(
        tree, feature_names=feature_names, filled=True, out_file=None
    )
    graph = pydotplus.graph_from_dot_data(tree_str)
    graph.write_png(png_file_to_save)

In [128]:
# Read the raw CSV into the working frame and preview its last rows.
df = pd.read_csv(filepath_or_buffer="Data Set.csv")
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [129]:
# TotalCharges is read as text; entries that cannot be parsed as a number
# (e.g. blank / whitespace-only cells) become NaN instead of raising.
# pd.to_numeric is used rather than a string replace so that the cell can be
# re-run safely (it also accepts an already-numeric column) and does not depend
# on the pandas-version-specific `regex` default of `Series.str.replace`.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').astype(float)



In [130]:
# Store tenure as a float column.
df = df.astype({'tenure': float})

In [131]:
# Switch the column to object dtype first so it can hold text labels.
df['SeniorCitizen'] = df['SeniorCitizen'].astype(object)

# Rewrite truthy entries as 'Yes' and falsy entries as 'No', in that order.
for flag, label in ((True, 'Yes'), (False, 'No')):
    matches_flag = df['SeniorCitizen'] == flag
    df.loc[matches_flag, 'SeniorCitizen'] = label

In [132]:
# Encode the target labels as integers: 'Yes' -> 1, 'No' -> 0.
churn_codes = {'Yes': 1, 'No': 0}
for answer in churn_codes:
    df.loc[df['Churn'] == answer, 'Churn'] = churn_codes[answer]

# The column still has object dtype after the in-place recode; make it integer.
df['Churn'] = df['Churn'].astype(int)
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,No,Yes,Yes,24.0,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,0
7039,2234-XADUH,Female,No,Yes,Yes,72.0,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,0
7040,4801-JZAZL,Female,No,Yes,Yes,11.0,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,0
7041,8361-LTMKD,Male,Yes,Yes,No,4.0,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,1
7042,3186-AJIEK,Male,No,No,No,66.0,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,0


In [133]:
# Summarise column dtypes and non-null counts after the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   object 
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   float64
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [134]:
# Keep references to the identifier, the two columns excluded from the feature
# set, and the target before they are removed from the frame.
customer_IDs, genders, partners, y = (
    df[column] for column in ('customerID', 'gender', 'Partner', 'Churn')
)

# Remove those four columns from the feature frame in place.
df.drop(columns=['Churn', 'gender', 'Partner', 'customerID'], inplace=True)

df.head()

Unnamed: 0,SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,No,No,1.0,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,No,No,34.0,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,No,No,2.0,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,No,No,45.0,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,No,No,2.0,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [135]:
# Partition the column names in one pass: object-dtype columns are treated as
# categorical, every other dtype as numerical. Column order is preserved.
categorical_columns, numerical_columns = [], []
for column_name, column_dtype in df.dtypes.items():
    if column_dtype.name == "object":
        categorical_columns.append(column_name)
    else:
        numerical_columns.append(column_name)

In [136]:
# Impute missing values: most frequent value for categorical columns, median
# for numerical columns.
#
# The filled column is assigned back explicitly. Calling
# `df[c].fillna(..., inplace=True)` mutates an intermediate Series obtained by
# chained indexing; recent pandas warns about it and, with Copy-on-Write
# enabled, `df` itself is left unchanged.
for c in categorical_columns:
    modes = df[c].mode()
    # A column with no observed values has no mode; leave it untouched
    # instead of failing on the empty result.
    if not modes.empty:
        df[c] = df[c].fillna(modes.iloc[0])

for c in numerical_columns:
    df[c] = df[c].fillna(df[c].median())

In [137]:

# Summarise the target series (dtype and non-null count).
y.info()


<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: Churn
Non-Null Count  Dtype
--------------  -----
7043 non-null   int64
dtypes: int64(1)
memory usage: 55.1 KB


In [138]:
# Build the model matrix: numerical columns as they are, followed by one
# indicator column per category level of each categorical column.
#
# dtype=int is requested explicitly so the indicators are 0/1 integers on every
# pandas version. Newer pandas defaults to boolean indicators, and mixing
# boolean and float columns makes `df.values` an object array.
df = pd.concat(
    [
        df[numerical_columns],
        pd.get_dummies(df[categorical_columns], dtype=int),
    ],
    axis=1,
)

In [139]:
# Preview the last rows of the encoded feature frame.
df.tail()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,SeniorCitizen_No,SeniorCitizen_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
7038,24.0,84.8,1990.5,1,0,0,1,0,1,0,...,1,0,1,0,0,1,0,0,0,1
7039,72.0,103.2,7362.9,1,0,0,1,0,1,0,...,1,0,1,0,0,1,0,1,0,0
7040,11.0,29.6,346.45,1,0,0,1,1,0,0,...,0,1,0,0,0,1,0,0,1,0
7041,4.0,74.4,306.6,0,1,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
7042,66.0,105.65,6844.5,1,0,1,0,0,1,1,...,1,0,0,1,0,1,1,0,0,0


In [201]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# partition repeatable.
# NOTE(review): the split is not stratified on `y` — confirm this is intended.
split_options = {"test_size": 0.3, "random_state": 17}
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df.values, y, **split_options
)

In [344]:
# Unfitted base estimator that is handed to GridSearchCV in the next cell; the
# fixed random_state keeps the tree's random choices repeatable between runs.
tree = DecisionTreeClassifier(random_state=17)

# Direct fit left disabled by the author; fitting is done through the grid
# search object instead.
# tree.fit(X_train, y_train)


# Author's note from an earlier manual trial:
#   max_depth=6, max_features=18, max_leaf_nodes=300  ->  0.7832
# (presumably an accuracy score — TODO confirm the metric and the data it was measured on)

In [319]:
# Hyper-parameter grid: every combination of the three ranges is evaluated
# with 5-fold cross-validation on the training split, using all CPU cores.
tree_params = dict(
    max_depth=range(1, 14),
    max_features=range(3, 25),
    max_leaf_nodes=range(2, 200),
)

tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)

tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 56628 candidates, totalling 283140 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=17), n_jobs=-1,
             param_grid={'max_depth': range(1, 14),
                         'max_features': range(3, 25),
                         'max_leaf_nodes': range(2, 200)},
             verbose=True)

In [324]:
# Best cross-validated score, then the parameter combination that achieved it
# (one value per line).
print(tree_grid.best_score_, tree_grid.best_params_, sep="\n")

0.7997971602434077
{'max_depth': 11, 'max_features': 15, 'max_leaf_nodes': 43}


In [345]:
# Predict labels for the holdout rows with the fitted grid-search object.
tree_predictions = tree_grid.predict(X_holdout)

In [348]:
# Print the headline holdout metrics, one per line.
printed_metrics = (
    ('Accuracy Score: ', accuracy_score),
    ('Precision Score: ', precision_score),
)
for caption, metric in printed_metrics:
    print(caption, metric(y_holdout, tree_predictions))

# Confusion matrix output is currently disabled.
# print('Confusion Matrix: ',confusion_matrix(y_holdout, tree_predictions))

# Trailing expression: the F1 score is shown as the cell's output value.
f1_score(y_holdout, tree_predictions)

Accuracy Score:  0.7827733080927591
Precision Score:  0.6397379912663755


0.5607655502392345

In [350]:
# Per-class precision / recall / F1 table for the holdout predictions.
holdout_report = classification_report(y_true=y_holdout, y_pred=tree_predictions)
print(holdout_report)

              precision    recall  f1-score   support

           0       0.82      0.89      0.86      1526
           1       0.64      0.50      0.56       587

    accuracy                           0.78      2113
   macro avg       0.73      0.70      0.71      2113
weighted avg       0.77      0.78      0.77      2113



In [326]:
# Draw the best tree found by the grid search, labelling split nodes with the
# encoded feature-frame column names, and save it as a PNG.
best_tree = tree_grid.best_estimator_
tree_graph_to_png(best_tree, df.columns, "topic3_decision_tree4.png")
