In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory.
csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview five randomly chosen rows (no seed is set, so the rows differ between runs).
data.sample(5)

In [None]:
# Remove the customerID column before analysis.
# Re-assigning (instead of inplace=True) together with errors='ignore' makes the
# cell idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns=['customerID'], errors='ignore')

# Number of missing values per column.
data.isnull().sum()

In [None]:
# Blank (" ") TotalCharges entries are replaced by 0 and the column is stored as float32.
data['TotalCharges'] = data['TotalCharges'].replace(" ", 0).astype('float32')

# Class balance of the target. The series is passed as `x=`: the first positional
# argument of countplot is `data` in current seaborn, so a bare positional series
# is no longer interpreted as the variable to count.
sns.countplot(x=data['Churn'])
plt.show()

data['Churn'].value_counts()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of distinct values per column.
data.nunique()

We have a mix of datatypes. tenure, MonthlyCharges and TotalCharges are numerical. The others are binary/categorical. The binary features are Yes/No and will need to
be changed to 1/0, and the categorical features (where nunique > 2) will need encoding. We'll do this before modelling, after first exploring how each feature relates to churn.

In [None]:
# Density of tenure for churned vs retained customers.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title('Tenure vs Churn')
# `fill=True` replaces the deprecated `shade=True`, and the series is passed as
# `x=` because the first positional argument of kdeplot is `data` in current seaborn.
sns.kdeplot(x=data.loc[data['Churn'] == 'Yes', 'tenure'], label='Churn : Yes', fill=True, ax=ax)
sns.kdeplot(x=data.loc[data['Churn'] == 'No', 'tenure'], label='Churn : No', fill=True, ax=ax)
# The `label=` values are only visible once a legend is drawn.
ax.legend()
plt.show()

Customers who have a shorter tenure are more likely to churn

In [None]:
# Density of TotalCharges for churned vs retained customers.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title('Total Charges vs Churn')
# `fill=True` replaces the deprecated `shade=True`, and the series is passed as
# `x=` because the first positional argument of kdeplot is `data` in current seaborn.
sns.kdeplot(x=data.loc[data['Churn'] == 'Yes', 'TotalCharges'], label='Churn : Yes', fill=True, ax=ax)
sns.kdeplot(x=data.loc[data['Churn'] == 'No', 'TotalCharges'], label='Churn : No', fill=True, ax=ax)
# The `label=` values are only visible once a legend is drawn.
ax.legend()
plt.show()

In [None]:
# Density of MonthlyCharges for churned vs retained customers.
fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title('Monthly Charges vs Churn')
# `fill=True` replaces the deprecated `shade=True`, and the series is passed as
# `x=` because the first positional argument of kdeplot is `data` in current seaborn.
sns.kdeplot(x=data.loc[data['Churn'] == 'Yes', 'MonthlyCharges'], label='Churn : Yes', fill=True, ax=ax)
sns.kdeplot(x=data.loc[data['Churn'] == 'No', 'MonthlyCharges'], label='Churn : No', fill=True, ax=ax)
# The `label=` values are only visible once a legend is drawn.
ax.legend()
plt.show()

Higher churn rate amongst customers that have higher monthly charges

In [None]:
# Churn counts for male (left) and female (right) customers.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].title.set_text('Gender : Male')
ax[1].title.set_text('Gender : Female')
# The series is passed as `x=` (a bare positional series is bound to `data` in
# current seaborn), and a fixed `order` keeps the No/Yes bars in the same
# position on both panels so they can be compared directly.
churn_order = ['No', 'Yes']
sns.countplot(x=data.loc[data['gender'] == 'Male', 'Churn'], order=churn_order, ax=ax[0])
sns.countplot(x=data.loc[data['gender'] == 'Female', 'Churn'], order=churn_order, ax=ax[1])
plt.show()

There seems to be no clear difference in churn-rate between genders, we could consider dropping this feature for modelling

In [None]:
# Churn counts for senior citizens (left) and non-senior citizens (right).
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].title.set_text('SeniorCitizen : Yes')
ax[1].title.set_text('SeniorCitizen : No')
# The series is passed as `x=` (a bare positional series is bound to `data` in
# current seaborn), and a fixed `order` keeps the No/Yes bars in the same
# position on both panels so they can be compared directly.
churn_order = ['No', 'Yes']
sns.countplot(x=data.loc[data['SeniorCitizen'] == 1, 'Churn'], order=churn_order, ax=ax[0])
sns.countplot(x=data.loc[data['SeniorCitizen'] == 0, 'Churn'], order=churn_order, ax=ax[1])
plt.show()

There seems to be a higher proportion of senior citizens churning when compared to non-senior citizens

In [None]:
# Churn counts for customers with (left) and without (right) a partner.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].title.set_text('Partner : Yes')
ax[1].title.set_text('Partner : No')
# The series is passed as `x=` (a bare positional series is bound to `data` in
# current seaborn), and a fixed `order` keeps the No/Yes bars in the same
# position on both panels so they can be compared directly.
churn_order = ['No', 'Yes']
sns.countplot(x=data.loc[data['Partner'] == 'Yes', 'Churn'], order=churn_order, ax=ax[0])
sns.countplot(x=data.loc[data['Partner'] == 'No', 'Churn'], order=churn_order, ax=ax[1])
plt.show()

Customers with partners tended not to churn as much as those without

In [None]:
# Churn counts for customers with (left) and without (right) dependents.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].title.set_text('Dependants : Yes')
ax[1].title.set_text('Dependants : No')
# The series is passed as `x=` (a bare positional series is bound to `data` in
# current seaborn), and a fixed `order` keeps the No/Yes bars in the same
# position on both panels so they can be compared directly.
churn_order = ['No', 'Yes']
sns.countplot(x=data.loc[data['Dependents'] == 'Yes', 'Churn'], order=churn_order, ax=ax[0])
sns.countplot(x=data.loc[data['Dependents'] == 'No', 'Churn'], order=churn_order, ax=ax[1])
plt.show()

Customers without dependants were more likely to churn

In [None]:
# Churn counts for customers with (left) and without (right) phone service.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].title.set_text('PhoneService : Yes')
ax[1].title.set_text('PhoneService : No')
# The series is passed as `x=` (a bare positional series is bound to `data` in
# current seaborn), and a fixed `order` keeps the No/Yes bars in the same
# position on both panels so they can be compared directly.
churn_order = ['No', 'Yes']
sns.countplot(x=data.loc[data['PhoneService'] == 'Yes', 'Churn'], order=churn_order, ax=ax[0])
sns.countplot(x=data.loc[data['PhoneService'] == 'No', 'Churn'], order=churn_order, ax=ax[1])
plt.show()

There seems to be no difference in churn rate between customers with and without phone service

In [None]:
# MultipleLines counts among churned (left) and retained (right) customers.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].title.set_text('Churn : Yes')
ax[1].title.set_text('Churn : No')
# The series is passed as `x=` (a bare positional series is bound to `data` in
# current seaborn). The category order is taken from the full column so both
# panels list the categories in the same position.
lines_order = sorted(data['MultipleLines'].unique())
sns.countplot(x=data.loc[data['Churn'] == 'Yes', 'MultipleLines'], order=lines_order, ax=ax[0])
sns.countplot(x=data.loc[data['Churn'] == 'No', 'MultipleLines'], order=lines_order, ax=ax[1])
plt.show()

In [None]:
# Share of churned / retained customers *within* each MultipleLines group.
# Dividing the counts by the total number of rows (as before) gives each cell's
# share of the whole dataset, which cannot be read as a per-group churn rate.
data.groupby('MultipleLines')['Churn'].value_counts(normalize=True)

There is a higher proportion of those with MultipleLines who are churning, could MultipleLines share correlation with MonthlyCharges?

In [None]:
# Distribution of MonthlyCharges for each MultipleLines category.
sns.boxplot(data=data, x='MultipleLines', y='MonthlyCharges')

Customers with MultipleLines have more MonthlyCharges than those with no phone service or just a single line. Let's take note of this to see if we can drop this feature without impacting model performance later on

In [None]:
# InternetService counts among churned (left) and retained (right) customers.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
ax[0].title.set_text('Churn : Yes')
ax[1].title.set_text('Churn : No')
# The series is passed as `x=` (a bare positional series is bound to `data` in
# current seaborn). The category order is taken from the full column so both
# panels list the categories in the same position.
service_order = sorted(data['InternetService'].unique())
sns.countplot(x=data.loc[data['Churn'] == 'Yes', 'InternetService'], order=service_order, ax=ax[0])
sns.countplot(x=data.loc[data['Churn'] == 'No', 'InternetService'], order=service_order, ax=ax[1])
plt.show()

Customers with no internet service have a very low rate of churn, whereas customers with fiber optic have the highest. Similar to the MultipleLines feature, could this again be due to correlation with higher MonthlyCharges? i.e. do customers with fiber optic broadband pay more?

In [None]:
# Distribution of MonthlyCharges for each InternetService category.
sns.boxplot(data=data, x='InternetService', y='MonthlyCharges')

As expected, customers with Fiber optic broadband pay more monthly. This could be another feature we can look at dropping to see if there are any negative impacts on model performance

In [None]:
# Add-on internet service columns, reshaped to long form for plotting.
cols = ['OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

has_internet = data['InternetService'] != 'No'
churned = data['Churn'] == 'Yes'

# df1: internet customers who churned; df2: all internet customers.
# Both end up with the columns 'variable' (service name) and 'Has Service'.
df1 = data.loc[churned & has_internet, cols].melt(value_name='Has Service')
df2 = data.loc[has_internet, cols].melt(value_name='Has Service')

In [None]:
# Top panel: churned internet customers per add-on service.
# Bottom panel: all internet customers per add-on service.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
shared_kwargs = dict(x='variable', hue='Has Service', hue_order=['No', 'Yes'])
ax1 = sns.countplot(data=df1, ax=ax[0], **shared_kwargs)
ax2 = sns.countplot(data=df2, ax=ax[1], **shared_kwargs)
ax1.set(xlabel='Additional Internet Services', ylabel='Churns')
ax2.set(xlabel='Additional Internet Services', ylabel='Number of Customers')

OnlineSecurity, OnlineBackup, DeviceProtection and TechSupport all show tendencies where customers without these services are more likely to churn. There appear to be no differences for those with StreamingTV and StreamingMovies

In [None]:
# List the available column names before choosing the model features.
data.columns

In [None]:
# Columns kept for modelling, with the target 'Churn' last.
model_columns = [
    'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract',
    'MonthlyCharges', 'PaperlessBilling', 'PaymentMethod', 'Churn',
]
data1 = data[model_columns]

In [None]:
# One-hot encode the selected columns with pandas' default get_dummies settings,
# then show the generated column names.
datacategorical = pd.get_dummies(data1)
datacategorical.columns

In [None]:
# Encoded columns retained for the model; 'Churn_Yes' is the target.
encoded_columns = [
    'SeniorCitizen', 'tenure', 'MonthlyCharges',
    'Partner_Yes', 'Dependents_Yes',
    'MultipleLines_Yes',
    'InternetService_DSL', 'InternetService_Fiber optic', 'OnlineSecurity_Yes',
    'OnlineBackup_Yes', 'DeviceProtection_Yes', 'TechSupport_Yes', 'StreamingTV_Yes',
    'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
    'Contract_Two year', 'PaperlessBilling_Yes',
    'PaymentMethod_Bank transfer (automatic)',
    'PaymentMethod_Credit card (automatic)',
    'PaymentMethod_Electronic check',
    'PaymentMethod_Mailed check',
    'Churn_Yes',
]
datacategorical = datacategorical[encoded_columns]

In [None]:
# Rename the encoded target column to 'Churn'. Re-assigning the result instead
# of mutating in place keeps the data lineage explicit.
datacategorical = datacategorical.rename(columns={'Churn_Yes': 'Churn'})

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Separate target and features without mutating `datacategorical`: the previous
# `pop('Churn')` removed the column in place, so re-running the cell raised KeyError.
# The target is cast to int so the labels are plainly 0/1.
y = datacategorical['Churn'].astype(int)
x = datacategorical.drop(columns='Churn')

# Scaling all the variables to a range of 0 to 1
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# the test rows influence the scaling - consider fitting on the training rows only.
features = x.columns.values
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(x)
# Keep the original row index so the scaled frame stays aligned with `y`.
x = pd.DataFrame(scaler.transform(x), columns=features, index=x.index)

In [None]:
# Hold out 20% of the rows for testing. A fixed seed makes the reported metrics
# reproducible, and stratifying on `y` keeps the churn ratio the same in both splits.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The target 'Churn' is skewed (Churn = 1 occurs less than half as often as Churn = 0),
# so errors on the positive class are weighted twice as heavily.
churn_class_weight = {0: 1, 1: 2}
model = LogisticRegression(class_weight=churn_class_weight, max_iter=400, random_state=1)
model = model.fit(xtrain, ytrain)

In [None]:
# Predicted churn labels for the held-out test rows.
predictions = model.predict(xtest)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Overall accuracy on the test rows.
test_accuracy = accuracy_score(ytest, predictions)
print('Accuracy Score: {}'.format(test_accuracy))

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
cm = confusion_matrix(ytest, predictions)
cm_ax = sns.heatmap(cm, annot=True, fmt='d')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix of Model Predictions')
plt.show()

**A question we would have on optimising this model would be in terms of risk analysis on falsely predicting customers to churn (increasing costs if we give those customers offers/sales promotions even though they weren't going to churn) vs the cost of falsely predicting customers not churning when in reality they do (cost of revenue loss). This would be a business decision looking at both of these costs. The model could then be adjusted accordingly**

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Precision, recall and F-measure for the positive (churn) class.
precision = precision_score(ytest, predictions)
recall = recall_score(ytest, predictions)
# f1_score is the harmonic mean of precision and recall; unlike the hand-written
# 2PR/(P+R) it does not produce nan when both precision and recall are 0.
fmeasure = f1_score(ytest, predictions)
print('Precision : {}'.format(precision))
print('Recall : {}'.format(recall))
print('FMeasure : {}'.format(fmeasure))


In [None]:
from sklearn.model_selection import learning_curve

# Evaluate accuracy at ten evenly spaced training-set fractions (10% .. 100%).
train_fractions = np.linspace(0.1, 1.0, 10)
trainsize, trainscore, testscore = learning_curve(
    model, x, y, train_sizes=train_fractions, scoring='accuracy'
)

# Collapse axis 1 of the score arrays to one mean score per training size.
trainscoremean = np.mean(trainscore, axis=1)
testscoremean = np.mean(testscore, axis=1)

In [None]:
# Mean train and test accuracy against the number of training samples.
for curve_label, mean_scores in (('Train', trainscoremean), ('Test', testscoremean)):
    sns.lineplot(x=trainsize, y=mean_scores, label=curve_label)
plt.title('Learning Curve')
plt.xlabel('Training Size')
plt.ylabel('Accuracy Score')
plt.legend(loc='upper right')
plt.show()

**Looking at the above learning curve, getting more data is unlikely to yield improvements in the accuracy. If we want to improve the accuracy further, a better approach would be to increase the number of features or the model capacity**

In [None]:
# Rank features by the absolute size of their logistic-regression coefficient.
plt.figure(figsize=(14, 4))
featureimportance = pd.Series(model.coef_[0], index=x.columns.values)
abs_ranked = featureimportance.abs().sort_values(ascending=True)
abs_ranked.plot(kind='barh')
print(abs_ranked.index)
plt.title('Feature Importances (Absolute)')
plt.show()

In [None]:
# Pairwise correlation between the scaled features, drawn as an annotated heatmap.
feature_corr = x.corr()
plt.figure(figsize=(18, 14))
sns.heatmap(feature_corr, annot=True, cmap='vlag')
plt.show()

Looking at the feature importances determined by the Logistic Regression model as well as the collinearity heatmap above we can look at removing some features from the model. The collinearity of monthly charges and fibre optic suggests we can drop one, for intuition's sake and for model interpretation let's drop the fibre optic feature

In [None]:
# Reduced feature set used for the second model.
limited_columns = [
    'tenure', 'MonthlyCharges',
    'InternetService_DSL', 'Contract_Month-to-month', 'Contract_Two year',
    'PaperlessBilling_Yes',
    'OnlineSecurity_Yes', 'SeniorCitizen', 'PaymentMethod_Electronic check',
]
xlimited = x[limited_columns]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Re-split using the reduced feature set. A fixed seed makes the reported metrics
# reproducible, and stratifying on `y` keeps the churn ratio the same in both splits.
xtrain, xtest, ytrain, ytest = train_test_split(xlimited, y, test_size=0.2, random_state=1, stratify=y)

# Same model configuration as for the full feature set.
model = LogisticRegression(random_state=1, max_iter=400, class_weight={0: 1, 1: 2}).fit(xtrain, ytrain)
predictions = model.predict(xtest)

print('Accuracy Score: {}'.format(accuracy_score(ytest, predictions)))

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
sns.heatmap(confusion_matrix(ytest, predictions), annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix of Model Predictions')
plt.show()

Of the 705 customers in the test set, 48 left when the model predicted they would stay (false negatives) and 114 were predicted to leave when they didn't (false positives). These counts come from one particular run and depend on the train/test split. A business decision could be based on the cost of running sales promotions for the 114 who didn't leave versus the cost of losing the revenue from the 48 customers who did leave. This assumes the sales promotion would have a 100% success rate in retaining those 48 customers

In [None]:
# Imported locally so this cell does not depend on an earlier cell's imports.
from sklearn.metrics import f1_score, precision_score, recall_score

# Precision, recall and F-measure for the positive (churn) class of the reduced model.
precision = precision_score(ytest, predictions)
recall = recall_score(ytest, predictions)
# f1_score is the harmonic mean of precision and recall; unlike the hand-written
# 2PR/(P+R) it does not produce nan when both precision and recall are 0.
fmeasure = f1_score(ytest, predictions)
print('Precision : {}'.format(precision))
print('Recall : {}'.format(recall))
print('FMeasure : {}'.format(fmeasure))

In [None]:
# Learning curve for the reduced feature set at five training-set fractions (10% .. 100%).
train_fractions = np.linspace(0.1, 1.0, 5)
trainsize, trainscore, testscore = learning_curve(
    model, xlimited, y, train_sizes=train_fractions, scoring='accuracy'
)

# Collapse axis 1 of the score arrays to one mean score per training size.
trainscoremean = np.mean(trainscore, axis=1)
testscoremean = np.mean(testscore, axis=1)

for curve_label, mean_scores in (('Train', trainscoremean), ('Test', testscoremean)):
    sns.lineplot(x=trainsize, y=mean_scores, label=curve_label)
plt.title('Learning Curve')
plt.xlabel('Training Size')
plt.ylabel('Accuracy Score')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Signed logistic-regression coefficients of the reduced model, sorted from largest to smallest.
plt.figure(figsize=(6, 4))
featureimportance = pd.Series(model.coef_[0], index=xlimited.columns.values)
signed_ranked = featureimportance.sort_values(ascending=False)
signed_ranked.plot(kind='barh')
plt.title('Feature Importances')
plt.show()