In [19]:
import pandas as pd
import numpy as np 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest,mutual_info_classif

In [20]:
df = pd.read_csv("customer_churn.csv")

In [21]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [22]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [23]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

#### As we can see the dtype of totalcharges is an object hence we convert it into floating point numbers

In [24]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].str.strip())
df["TotalCharges"] = df["TotalCharges"].fillna(0)

In [25]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [26]:
fig = px.histogram(df, x="Churn", color = "Contract", barmode = "group", title = "<b>Customer contract distribution<b>")
fig.update_layout(width=700, height=500, bargap=0.2)
fig.show()

*As per the above graph people who opt for month-to-month contract are most likley to churn*

In [27]:
# Graph for churn 
fig1 = px.pie(df,names=df['Churn'],hole= 0.4)

fig1.update_traces(hoverinfo='label+percent',
    textinfo='percent',
    marker = dict(line = dict(color ='black', width = 2)),
    pull = [0.1,0]
    )

fig1.update_layout(
    title_text = '<b>Churn Ratio<b>',
    paper_bgcolor='LightSteelBlue',
    height=500, width=600
    )

fig1.show()


In [28]:
# graph showing churn w.r.t to M or F
fig2 = px.sunburst(df,path = ['Churn','gender',],color_discrete_sequence=px.colors.qualitative.G10)

fig2.update_layout(
    title = '<b>Churn Distribution w.r.t Gender: Male(M), Female(F)<b>',
    paper_bgcolor='LightSteelBlue',
    height=500, width=600
)
fig2.show()

In [29]:
fig3 = px.histogram(df, x="Churn", color = "InternetService", barmode="group",title = '<b>Churn w.r.t Internet Service Type<b>')
fig3.update_layout(width=700, height=500, bargap=0.1)
fig3.show()

In [30]:
fig4 = px.histogram(df, x="Churn", color="PaperlessBilling", title="<b>Chrun distribution w.r.t. Paperless Billing</b>")
fig4.update_layout(width=700, height=500, bargap=0.1)
fig4.show()

In [31]:
fig5 = px.histogram(df, x="Churn", color="OnlineSecurity", barmode="group", title="<b>Churn w.r.t Online Security</b>")
fig5.update_layout(width=700, height=500, bargap=0.1)
fig5.show()

In [32]:

fig6 = px.box(df, x='Churn', y = 'tenure')

# Update yaxis properties
fig6.update_yaxes(title_text='Tenure (Months)', row=1, col=1)
# Update xaxis properties
fig6.update_xaxes(title_text='Churn', row=1, col=1)

# Update size and title
fig6.update_layout(autosize=True, width=750, height=600,
    title_font=dict(size=25, family='Courier'),
    title='<b>Tenure vs Churn</b>',
)

fig6.show()

In [33]:
labels = df['PaymentMethod'].unique()
values = df['PaymentMethod'].value_counts()

fig7 = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3, pull = [0.2,0,0,0])])
fig7.update_layout(title_text="Payment Method Distribution",width=700, height=500)
fig7.show()

fig8 = px.histogram(df, x="Churn", color="PaymentMethod", title="Customer Payment Method distribution w.r.t. Churn")
fig8.update_layout(width=700, height=500, bargap=0.1)
fig8.show()

In [34]:
from pycaret.classification import *

In [35]:
s = setup(df, target ='Churn')

Unnamed: 0,Description,Value
0,session_id,7456
1,Target,Churn
2,Target Type,Binary
3,Label Encoded,"No: 0, Yes: 1"
4,Original Data,"(7043, 21)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,17
8,Ordinal Features,False
9,High Cardinality Features,False


In [36]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.8039,0.8464,0.5303,0.6711,0.5916,0.4649,0.4711,5.049
gbc,Gradient Boosting Classifier,0.8037,0.8475,0.5091,0.6805,0.5816,0.4568,0.4656,16.448
lr,Logistic Regression,0.7953,0.8406,0.5356,0.6462,0.5842,0.4501,0.4546,16.028
rf,Random Forest Classifier,0.7941,0.832,0.4652,0.6674,0.5473,0.4196,0.4316,3.688
lightgbm,Light Gradient Boosting Machine,0.7913,0.8289,0.5265,0.6348,0.5751,0.4383,0.4421,0.742
et,Extra Trees Classifier,0.7838,0.8201,0.4545,0.6357,0.5294,0.3941,0.4038,5.601
ridge,Ridge Classifier,0.7785,0.0,0.4068,0.6413,0.4941,0.362,0.3791,16.881
dt,Decision Tree Classifier,0.7761,0.6957,0.5227,0.5956,0.5557,0.4071,0.4093,0.811
knn,K Neighbors Classifier,0.7619,0.7466,0.453,0.5705,0.5045,0.3506,0.3549,3.012
svm,SVM - Linear Kernel,0.7363,0.0,0.472,0.5725,0.4771,0.3184,0.3459,1.978


In [37]:
print(best)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=7456)


In [38]:
finalize_model(best)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=7456)

In [39]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [70]:
predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.8036,0.8505,0.5519,0.6419,0.5935,0.465,0.4673


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,customerID_0003-MKNFE,customerID_0004-TLHLJ,customerID_0011-IGKFF,customerID_0013-EXCHZ,customerID_0013-MHZWF,customerID_0013-SMEOE,customerID_0014-BMAQU,...,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn,Label,Score
0,1.0,76.199997,76.199997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Yes,Yes,0.5061
1,1.0,20.799999,20.799999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,No,No,0.5040
2,59.0,99.449997,5623.700195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,No,No,0.5030
3,50.0,93.500000,4747.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,No,No,0.5070
4,23.0,20.150000,405.600006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,No,No,0.5205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2108,54.0,110.449997,6077.750000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,No,No,0.5101
2109,63.0,86.699997,5309.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,No,No,0.5276
2110,4.0,38.549999,156.100006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,No,No,0.5020
2111,20.0,19.400000,415.399994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,No,No,0.5157


In [40]:
profile = ProfileReport(df, title="Pandas Profiling Report")

In [41]:
profile.to_file("your_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Dropping the following columns

* customerID = its a unique identifier we can use index for the same 
* Partner is highly correlated with Dependents and vice versa
* tenure is highly correlated with Contract
* PhoneService is highly correlated with MultipleLines and 1 other fields	
* StreamingMovies is highly correlated with MultipleLines and 8 other fields	
* InternetService is highly correlated with MultipleLines and 8 other fields	
* MultipleLines is highly correlated with PhoneService and 8 other fields	
* OnlineSecurity is highly correlated with MultipleLines and 8 other fields	
* StreamingTV is highly correlated with MultipleLines and 8 other fields	
* OnlineBackup is highly correlated with MultipleLines and 8 other fields	
* DeviceProtection is highly correlated with MultipleLines and 8 other fields	
* TechSupport is highly correlated with MultipleLines and 8 other fields	
* Partner is highly correlated with Dependents	
* Dependents is highly correlated with Partner	
* tenure is highly correlated with Contract	
* Contract is highly correlated with tenure and 7 other fields	
* MonthlyCharges is highly correlated with PhoneService and 8 other fields

In [42]:
columns_with_high_corr = ['customerID','Dependents','Contract','PhoneService','StreamingMovies','InternetService','OnlineSecurity','OnlineBackup','StreamingTV','DeviceProtection','TechSupport']

df.drop(labels = columns_with_high_corr, axis = 1, inplace = True)

In [43]:
# profile = ProfileReport(df, title="Pandas Profiling Report")
# profile.to_file("report_After_cleaning.html")

In [44]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
tenure              0
MultipleLines       0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [45]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,tenure,MultipleLines,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,1,No phone service,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,34,No,No,Mailed check,56.95,1889.50,No
2,Male,0,No,2,No,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,45,No phone service,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,2,No,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,24,Yes,Yes,Mailed check,84.80,1990.50,No
7039,Female,0,Yes,72,Yes,Yes,Credit card (automatic),103.20,7362.90,No
7040,Female,0,Yes,11,No phone service,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,4,Yes,Yes,Mailed check,74.40,306.60,Yes


In [46]:
label_encoder = LabelEncoder()

In [47]:
for i in df.columns:
    if df[i].dtype == 'object':
        df[i]= label_encoder.fit_transform(df[i])

In [48]:
df.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
tenure              0
MultipleLines       0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [49]:
X = df.drop(labels = ['Churn'], axis = 1)
y = df['Churn']

In [50]:
Six_best_fetures = SelectKBest(mutual_info_classif, k = 6)
Six_best_fetures.fit(X,y)


SelectKBest(k=6,
            score_func=<function mutual_info_classif at 0x0000027971712160>)

In [51]:
required_fetures = X.columns[Six_best_fetures.get_support()]


In [52]:
columns_to_drop = list(x for x in X.columns if x not in required_fetures)

In [53]:
df.drop(labels = columns_to_drop, axis = 1, inplace = True)

In [54]:
df.head(5)

Unnamed: 0,Partner,tenure,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,1,1,2,29.85,29.85,0
1,0,34,0,3,56.95,1889.5,0
2,0,2,1,3,53.85,108.15,1
3,0,45,0,0,42.3,1840.75,0
4,0,2,1,2,70.7,151.65,1


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Using Decision tree classifier

In [56]:
from sklearn.tree import DecisionTreeClassifier

In [57]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7175301632363378

In [58]:
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

In [59]:
GDC = GridSearchCV(DecisionTreeClassifier(), params, cv = 10, scoring='accuracy')
GDC.fit(X_train, y_train)
print(GDC.best_params_)
print(GDC.best_score_)

{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 100}
0.790554967688296


In [60]:
clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 10, min_samples_leaf = 100)

In [61]:
# clf.get_params()

In [62]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [63]:
y_pred = clf.predict(X_test)

In [64]:
accuracy_score(y_test, y_pred)

0.7835344215755855

using randomforest

In [65]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
accuracy_score(y_test , y_pred_rfc)

0.7799858055358411

In [66]:
# rfc.get_params()

In [67]:
forest_params = [{'max_depth': list(range(10, 15)), 'max_features': ['sqrt','log2']}]
GSC = GridSearchCV(RandomForestClassifier(), forest_params, cv = 10, scoring='accuracy')
GSC.fit(X_train, y_train)
print(GSC.best_params_)
print(GSC.best_score_)

{'max_depth': 11, 'max_features': 'log2'}
0.7985447136036683


In [68]:
rfc = RandomForestClassifier(max_depth=11,max_features = 'sqrt')
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

In [69]:
accuracy_score(y_test , y_pred_rfc)

0.7806955287437899