In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import plotly
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

In [None]:
import os

# Show what is available under the input directory before loading anything.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Read the Telco churn CSV into `telco` and print its column/dtype summary.
telco_csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
telco = pd.read_csv(telco_csv_path)
telco.info()

In [None]:
# TotalCharges is loaded with dtype object, so convert it to numeric to
# facilitate the later analysis. errors='coerce' turns any entry that cannot
# be parsed as a number into NaN instead of raising.
telco['TotalCharges'] = pd.to_numeric(telco['TotalCharges'], errors='coerce')
# Fill NaNs with the median of each numeric column. numeric_only=True is
# required: the frame still holds string columns, and pandas >= 2.0 raises a
# TypeError for them instead of silently skipping them as older versions did.
telco = telco.fillna(telco.median(numeric_only=True))

In [None]:
# Per-column count of missing values (isna is the same check as isnull).
telco.isna().sum()

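After the median imputation above, no missing values remain in the dataframe. Note that this is a result of the cleaning step: any `TotalCharges` entries that could not be parsed as numbers were coerced to NaN and then filled with the column median.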

In [None]:
# Raise the display limit to 50 columns, then preview the first rows.
pd.options.display.max_columns = 50
telco.head()

In [None]:
# Bar chart of the number of customers for each Churn value.
ax = sns.countplot(data=telco, x='Churn')

The number of churned customers is more than a third of the number of retained customers, which isn't a low rate. We will explore how the features correlate with churn before we start modeling.

## EDA

Before delving into any specific features, let's get the general picture here: Get correlation of Churn with all the variables.

I decided to encode the categorical features manually instead of using a dummies function, since I want the correlation matrix to look compact and clear.

In [None]:
# Print the distinct values of a few multi-level categorical columns so the
# manual encoding below can cover every level.
for column in ['Contract', 'MultipleLines', 'InternetService',
               'PaymentMethod', 'OnlineSecurity']:
    print(telco[column].unique())

In [None]:
# Categorical columns that are encoded as integers for the correlation matrix.
cat_columns = ['gender','Partner', 'Dependents', 'PhoneService',
               'MultipleLines','InternetService','OnlineSecurity',
               'OnlineBackup','DeviceProtection',
               'TechSupport','StreamingTV','StreamingMovies',
               'Contract','PaperlessBilling','PaymentMethod','Churn']

# One shared lookup from category label to integer code.
# NOTE(review): 'No phone service' / 'No internet service' are coded 3, above
# 'Yes' (1). Pearson correlation treats these codes as ordered magnitudes, so
# the coefficients of the affected columns depend on this arbitrary choice.
cat_to_num = {'No':0,'Yes':1, 
              'No phone service':3,
              'No internet service': 3,
              'Female':0, 'Male':1, 
              'Month-to-month':0, 'One year':1, 'Two year': 2,
              'DSL':1, 'Fiber optic':2,
              'Electronic check':0, 'Mailed check':1, 
              'Bank transfer (automatic)':2,'Credit card (automatic)':3}

# Encode on a copy so the original string labels stay available for plotting.
# Series.map yields NaN for any label that is missing from cat_to_num.
telco_copy = telco.copy()
for col in cat_columns:
    telco_copy[col] = telco_copy[col].map(cat_to_num)

# Restrict to numeric columns before correlating: string columns such as
# customerID make DataFrame.corr() raise on pandas >= 2.0, whereas older
# versions dropped them silently. select_dtypes behaves the same on both.
corr_matrix = telco_copy.select_dtypes(include='number').corr()
corr_matrix["Churn"].sort_values(ascending=False)

**Among customers background (demographics) features**, SeniorCitizen is positively related to churn, while having Partner/Dependents is negatively related to churn. Gender is slightly negatively related to churn but it may not be constructive.

**Among commercial features**, MonthlyCharges and PaperlessBilling are positively related to churn as expected, but interestingly, TotalCharges is negatively related to churn. Longer contract periods and longer tenure are negatively related to churn.

**Among service features**, most of them such as DeviceProtection,OnlineBackup, TechSupport and OnlineSecurity are negatively related to churn.

Now let's look into these three aspects of features one by one.

**a. Customer background**

In [None]:
# Readable Yes/No labels for the 0/1 SeniorCitizen flag, used by the plots.
bull_to_str = {0: 'No', 1: 'Yes'}
telco['SeniorCitizenStr'] = telco['SeniorCitizen'].map(bull_to_str)

# One pie chart per demographic column, laid out row by row on a 2x2 grid.
background_columns = ['gender', 'SeniorCitizenStr', 'Partner', 'Dependents']
grid_rows, grid_cols = 2, 2

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    specs=[[{'type': 'domain'} for _ in range(grid_cols)]
           for _ in range(grid_rows)],
)

for position, column in enumerate(background_columns):
    row_idx, col_idx = divmod(position, grid_cols)
    fig.add_trace(
        go.Pie(labels=telco[column], name=column, title=column),
        row_idx + 1,
        col_idx + 1,
    )

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    colorway=['#73C6B6', '#D98880', '#85C1E9', '#C39BD3'],
)

fig.show()

**Gender distribution**: almost half male half female.  

**Senior citizen**: Overwhelming majority (83.8%) of customers are not senior citizen.  

**Partner and dependents status**: 51.7% don't have a partner, while 70% don't have dependents.

In [None]:
# Customer counts per gender, split by senior-citizen status.
ax = sns.countplot(data=telco, x="gender", hue="SeniorCitizenStr")

In [None]:
# Customer counts per Partner value, split by whether they have dependents.
ax = sns.countplot(data=telco, x="Partner", hue="Dependents")

**Now viewing these features comparatively:**  

Distribution of senior citizen by gender is quite equal.  
Most of the customers without a partner don't have dependents either.

**b. Commercial features**

In [None]:
# One pie chart per commercial column, side by side in a single row.
commercial_columns = ['Contract', 'PaperlessBilling', 'PaymentMethod']

fig = make_subplots(
    rows=1,
    cols=len(commercial_columns),
    specs=[[{'type': 'domain'} for _ in commercial_columns]],
)

for col_idx, column in enumerate(commercial_columns, start=1):
    fig.add_trace(go.Pie(labels=telco[column], name=column, title=column),
                  1, col_idx)

fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    colorway=['#73C6B6', '#D98880', '#85C1E9',
              '#C39BD3', '#F7DC6F', '#F0B27A',
              '#E59866', '#BFC9CA', '#BCAAA4'],
)

fig.show()

**Contract**: most of the customers (55%) take month-to-month contract. While the proportions of one year and two year contracts are close.  

**PaperlessBilling**: most of the customers (59%) adopt paperless billing.  

**PaymentMethod**: the 4 types of payment methods are almost equally distributed, with electronic check a bit more common than the others (33.6%).

In [None]:
# Box plot of tenure for each contract type.
fig = px.box(data_frame=telco, x='Contract', y='tenure')
fig.show()

**Tenure distribution by contract type**: the tenure of customers increases as the contract period extends. Most of the monthly contract customers last for less than 12 months, while most of the two-year contract customers last for more than 48 months. We can see that customers on longer contracts stick with the company for a longer time.

In [None]:
# disabling hovers or avoiding rescaling can speed up the running time.
# Fixed colour per contract type so the three histograms are comparable.
contract_colors = {'Month-to-month':'#73C6B6','One year':'#D98880', 'Two year':'#85C1E9'}

# (column, x-axis range, y-axis range) for each histogram, in display order.
contract_hist_specs = [
    ("tenure", [0, 75], [0, 700]),
    ("MonthlyCharges", [0, 130], [0, 700]),
    ("TotalCharges", [0, 9000], [0, 1200]),
]

fig1, fig2, fig3 = [
    px.histogram(telco,
                 x=column,
                 color="Contract",
                 range_x=x_range,
                 range_y=y_range,
                 color_discrete_map=dict(contract_colors))
    for column, x_range, y_range in contract_hist_specs
]

for figure in (fig1, fig2, fig3):
    figure.show()

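**Tenure distribution by contract type**: now as a histogram, we can see that month-to-month contracts dominate the short tenures while long-term contracts dominate the long tenures. That is, customers signing long-term contracts stay with the company longer. (Churn itself is not shown in this plot; it is examined further below.)  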

**Monthly Charges by contract types**: Monthly Charges are mostly under 20, and its distribution isn't obviously biased toward any contract type, but we can still perceive that two year contracts usually have lower monthly charge (<26), while monthly contracts account for most of the high monthly charge (>40).  

**Total Charges by contract types**: Total Charges are mostly under 2000, explained by monthly contracts, which are mostly under 2000. Total Charges distribution of one and two year contracts is more even.

**c. Service features**

In [None]:
# One pie chart per service column, filled row by row on a 3x3 grid.
service_columns = ['PhoneService', 'MultipleLines', 'InternetService',
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                   'TechSupport', 'StreamingTV', 'StreamingMovies']
grid_size = 3

fig = make_subplots(
    rows=grid_size,
    cols=grid_size,
    specs=[[{'type': 'domain'} for _ in range(grid_size)]
           for _ in range(grid_size)],
)

for position, column in enumerate(service_columns):
    row_idx, col_idx = divmod(position, grid_size)
    fig.add_trace(go.Pie(labels=telco[column], name=column, title=column),
                  row_idx + 1, col_idx + 1)

fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    colorway=['#73C6B6', '#D98880', '#85C1E9',
              '#C39BD3', '#F7DC6F', '#F0B27A'],
)

fig.show()

**d. Correlations of churn rate with main numerical features**

Having finished the exploratory data analysis of the features, let's look at how the churn rate relates to the main numerical features picked out of the correlation matrix.

In [None]:
# Stacked histograms normalised per bin (barnorm='fraction'), so each bar
# shows the share of churned vs. retained customers.
churn_colors = {'No':'#73C6B6','Yes':'#D98880'}

# (column, x-axis range) for each numeric feature, in display order.
numeric_hist_specs = [
    ("tenure", [0, 73]),
    ("MonthlyCharges", [20, 120]),
    ("TotalCharges", [0, 8800]),
]

fig1, fig2, fig3 = [
    px.histogram(telco,
                 x=column,
                 range_x=x_range,
                 range_y=[0, 1],
                 color="Churn",
                 barnorm='fraction',
                 color_discrete_map=dict(churn_colors))
    for column, x_range in numeric_hist_specs
]

for figure in (fig1, fig2, fig3):
    figure.show()

**Tenure**: the churn rate decreases as the tenure goes up.  

**Monthly Charges**: there isn't a clear trend, but the churn rate is very low when the monthly charge is very low (<28).  

**Total Charges**: the churn rate goes down as the total charges go up. Presumably this is because the two-year contracts account for the higher total charges.

**e. Correlations of churn rate with main categorical features**

Then let's see the correlations of churn rate with main categorical features picked out of the correlation matrix.

In [None]:
# Churned vs. retained share (per-bar fraction) for two demographic features.
churn_colors = {'No':'#73C6B6','Yes':'#D98880'}

fig1, fig2 = [
    px.histogram(telco,
                 x=column,
                 range_y=[0, 1],
                 color="Churn",
                 barnorm='fraction',
                 color_discrete_map=dict(churn_colors))
    for column in ("Dependents", "SeniorCitizenStr")
]

for figure in (fig1, fig2):
    figure.show()

**Dependents**: only 15% of customers with dependents churn, while 31% of customers without dependents churn.  

**Senior Citizen**: only 23% of young customers churn, while 41% of senior customers churn.

In [None]:

# Churned vs. retained share (per-bar fraction) for each categorical feature;
# the figures are built and shown in the order listed here.
churn_colors = {'No':'#73C6B6','Yes':'#D98880'}
categorical_features = ["Contract", "InternetService", "PaperlessBilling",
                        "DeviceProtection", "OnlineBackup", "PaymentMethod",
                        "TechSupport", "OnlineSecurity"]

fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8 = [
    px.histogram(telco,
                 x=column,
                 range_y=[0, 1],
                 color="Churn",
                 barnorm='fraction',
                 color_discrete_map=dict(churn_colors))
    for column in categorical_features
]

for figure in (fig1, fig2, fig3, fig4, fig5, fig6, fig7, fig8):
    figure.show()

**Contract**: two-year contracts have the lowest churn rate (2%).  
**Internet Service**: "no internet service" has the lowest churn rate (7%).  
**Paperless Billing**: "no paperless billing" has the lower churn rate (16%).  
**Device Protection**: "no internet service" has the lowest churn rate (7%).  
**Online Backup**: "no internet service" has the lowest churn rate (7%).  
**Payment Method**: "Electronic check" has the highest churn rate (45%), while the others have very close churn rates (15%-20%).  
**Tech Support**: "no internet service" has the lowest churn rate (7%).  
**Online Security**: "no internet service" has the lowest churn rate (7%).  

## Churn prediction

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold,cross_val_score

In [None]:
# Categorical feature columns that will be one-hot encoded for modelling.
telco_cat = telco[['gender','Partner', 'Dependents', 'PhoneService',
               'MultipleLines','InternetService','OnlineSecurity',
               'OnlineBackup','DeviceProtection',
               'TechSupport','StreamingTV','StreamingMovies',
               'Contract','PaperlessBilling','PaymentMethod']]
# OneHotEncoder's `sparse` keyword was deprecated in scikit-learn 1.2 (renamed
# to `sparse_output`) and removed in 1.4, so `OneHotEncoder(sparse=False)`
# fails on current releases. Encoding with the defaults and densifying the
# sparse result with .toarray() yields the same dense array on every version.
cat_encoder = OneHotEncoder()
telco_cat_1hot = cat_encoder.fit_transform(telco_cat).toarray()
telco_cat_1hot

In [None]:
# Numeric feature columns and the standardising pipeline applied to them.
telco_num = telco.loc[:, ['tenure', 'MonthlyCharges', 'TotalCharges']]

scaling_steps = [('std_scaler', StandardScaler())]
num_pipeline = Pipeline(steps=scaling_steps)
# Preview of the scaled numeric block, computed on the full frame.
telco_num_tr = num_pipeline.fit_transform(telco_num)

In [None]:
# Column names routed to each branch of the preprocessing transformer.
num_attribs = telco_num.columns.tolist()
cat_attribs = telco_cat.columns.tolist()

# Scale the numeric columns and one-hot encode the categorical ones.
# NOTE(review): columns in neither list (e.g. customerID, SeniorCitizen) are
# not passed to any branch; with ColumnTransformer's default remainder they
# are dropped, so SeniorCitizen is not a model feature -- confirm intended.
column_branches = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_branches)

In [None]:
# Features are every column except the target.
X = telco.drop('Churn', axis=1)
# Encode the target as 0/1 rather than 'No'/'Yes': recent XGBoost releases
# reject string class labels in XGBClassifier.fit, while the scikit-learn
# estimators accept either form. The class order (No < Yes, 0 < 1) is the
# same, so accuracy and confusion-matrix layout are unaffected.
y = telco['Churn'].map({'No': 0, 'Yes': 1})

# Hold out 20% of the rows as a test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

# Fit the preprocessing on the training rows only and transform them.
X_train_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# Candidate classifiers, all with default hyper-parameters and a fixed seed
# where the estimator accepts one.
base_models = [('DT_model', DecisionTreeClassifier(random_state=42)),
            ('RF_model', RandomForestClassifier(random_state=42,n_jobs=-1)),
            ('KN_model', KNeighborsClassifier(n_jobs=-1)),
            ('SVM_model',SVC(random_state=42)),
            ('LR_model',LogisticRegression(random_state=42,n_jobs=-1)),
            ('XGB_model', XGBClassifier(random_state=42, n_jobs=-1)),
            ('AdaBoost_model', AdaBoostClassifier(random_state=42)),
            ('GB_model', GradientBoostingClassifier(random_state=42))]
# split data into 'kfolds' parts for cross validation,
# use shuffle to ensure random distribution of data:
kfolds = 4
split = KFold(n_splits=kfolds,shuffle=True,random_state=42)

# Preprocessing, fitting, making predictions and scoring for every model.
# The preprocessing is part of the cross-validated pipeline, so the scaler and
# encoder are re-fitted on the training part of each fold and never see the
# validation rows. cross_val_score clones the pipeline for every fold, so
# `full_pipeline` itself is left as it was, and no separate .fit() call is
# needed beforehand (the earlier extra fit only added run time).
for name,model in base_models:
    model_steps=Pipeline(steps=[('prep',full_pipeline),('model',model)])
    cv_results=cross_val_score(model_steps,X_train,y_train,cv=split,scoring='accuracy',
                              n_jobs=-1)
    # output:
    min_score=round(min(cv_results),4)
    max_score=round(max(cv_results),4)
    mean_score=round(np.mean(cv_results),4)
    std_dev=round(np.std(cv_results),4)
    print(f'{name} cross validation accuracy score:{mean_score} +- {std_dev} (std) min:{min_score},max:{max_score}')

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix  

In [None]:
# As the cross validation accuracy scores of the SVM, XGB, GB and
# LogisticRegression models on the train set are very close,
# I decided to evaluate all of them on the test set.
# Use transform(), not fit_transform(): the scaler and encoder must keep the
# parameters learned from the training rows. Re-fitting on the test set would
# scale the test features with their own statistics, which leaks test
# information and is inconsistent with how the models were trained.
X_test_prepared = full_pipeline.transform(X_test)

final_models = [('SVM_model',SVC(random_state=42)),
               ('XGB_model', XGBClassifier(random_state=42, n_jobs=-1)),
               ('GB_model', GradientBoostingClassifier(random_state=42)),
               ('LR_model', LogisticRegression(random_state=42,n_jobs=-1))]

# Fit every model on the prepared training data and score it on the test set:
for name,model in final_models:
    model.fit(X_train_prepared, y_train)
    preds = model.predict(X_test_prepared)
    accuracy = metrics.accuracy_score(y_test, preds)
    # output:
    print(f'{name} accuracy score:{accuracy}')
    print('confusion_matrix:', confusion_matrix(y_test, preds))


The Logistic Regression model turned out to be the best model among all, reaching 81.7% accuracy on the test data and a higher true positive rate. I tried to fine-tune it, but the improvement was minimal.