### **Objective: To predict the behaviour of the customers to retain them**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.set_option('display.max_rows',3000)
pd.set_option('display.max_columns',3000)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. READING AND UNDERSTANDING THE DATA

In [None]:
# Data Reading
churn_data = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [None]:
churn_data.head()

In [None]:
# Data Inspection
churn_data.info()

#### The above data shows no missing values in the dataset.

In [None]:
# Shape of the dataset
churn_data.shape

#### Therefore there are 7043 rows and 21 columns in the data set.

In [None]:
# Statistical aspects of the numerical columns of the dataset
churn_data.describe()

# 2. DATA PREPARATION

In [None]:
# Observing the data in the dataset to search for binary variables
churn_data.head()

#### The binary variables are as follows:
- Partner,Dependents,PhoneService,OnlineSecurity, OnlineBackup, DeviceProtection,TechSupport, StreamingTV,StreamingMovies,PaperlessBilling,Churn	 

In [None]:
# Converting binary variables (Yes/No) to 0 or 1

In [None]:
varlist = ['Partner','Dependents','PhoneService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']

In [None]:
# function to map binary columns

def binary_mapping(x):
    """Encode a Yes/No column as 1/0.

    Values other than the exact strings 'Yes' and 'No' have no entry in the
    lookup table, so ``Series.map`` turns them into NaN.
    """
    yes_no_codes = {'Yes': 1, 'No': 0}
    return x.map(yes_no_codes)

In [None]:
churn_data[varlist]=churn_data[varlist].apply(binary_mapping)

In [None]:
churn_data.head()

In [None]:
# Creating dummy variables for some of the categorical variables with multiple levels.
# drop_first=True omits the first level of each variable.
# dtype=int is passed explicitly: pandas >= 2.0 produces boolean dummies by default,
# and a frame mixing bool and float columns is cast to object dtype when handed to
# statsmodels (GLM / variance_inflation_factor), which then fails.
dummy1 = pd.get_dummies(
    churn_data[['Contract', 'PaymentMethod', 'gender', 'InternetService']],
    drop_first=True,
    dtype=int,
)

In [None]:
churn_data = pd.concat([churn_data,dummy1],axis=1)

In [None]:
churn_data.head()

In [None]:
# Creating dummy variables for MultipleLines and dropping the long-named
# 'No phone service' level instead of the first level.
# dtype=int keeps the dummies numeric on pandas >= 2.0, where the default is bool.
ml = pd.get_dummies(churn_data['MultipleLines'], prefix='MultipleLines', dtype=int)
# Dropping MultipleLines_No phone service column.
# The target is named with columns=...: the positional axis argument
# (drop(labels, 1)) is no longer accepted by pandas >= 2.0.
ml1 = ml.drop(columns=['MultipleLines_No phone service'])
# Adding the results to the master dataframe
churn_data = pd.concat([churn_data, ml1], axis=1)


In [None]:
churn_data.head()

In [None]:
# Dropping the original categorical columns, which are redundant once dummy-encoded.
# columns=... replaces the positional axis argument (drop(labels, 1)), which
# pandas >= 2.0 no longer accepts.
churn_data = churn_data.drop(
    columns=['Contract', 'PaymentMethod', 'gender', 'InternetService', 'MultipleLines']
)

In [None]:
churn_data.head()

In [None]:
# Observing the data types of the variables
churn_data.info()

In [None]:
# Converting TotalCharges into float as it is a numerical column
churn_data['TotalCharges'] = pd.to_numeric(churn_data.TotalCharges, errors='coerce')


In [None]:
# Observing the change
churn_data.info()

In [None]:
# Checking for outliers in numerical columns
num_churn_data=churn_data[['tenure','MonthlyCharges','TotalCharges']]

In [None]:
# Checking whether the numbers are gradually increasing
num_churn_data.describe(percentiles=[0.25,0.50,0.75,0.90,0.95,0.99])

#### The numbers seem to increase gradually across the percentiles. Therefore there are no outliers.

# MISSING VALUE TREATMENT

In [None]:
# Checking for missing values
churn_data.isnull().sum()

In [None]:
# Checking the percentage of missing values
round(100*(churn_data.isnull().sum()/len(churn_data.index)),2)

### Therefore the columns having missing values are as follows:
- OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
### So the missing values in the above columns need to be treated
- The TotalCharges column contains very few missing values, so those rows can be dropped

In [None]:
# Checking the rows against the missing values of Online Security
churn_data[np.isnan(churn_data['OnlineSecurity'])]

In [None]:
# Checking the rows against the missing values of OnlineBackup
churn_data[np.isnan(churn_data['OnlineBackup'])]

In [None]:
# Checking the rows against the missing values of DeviceProtection
churn_data[np.isnan(churn_data['DeviceProtection'])]

In [None]:
# Checking the rows against the missing values of TechSupport
churn_data[np.isnan(churn_data['TechSupport'])]

In [None]:
# Checking the rows against the missing values of StreamingTV
churn_data[np.isnan(churn_data['StreamingTV'])]

In [None]:
# Checking the rows against the missing values of StreamingMovies
churn_data[np.isnan(churn_data['StreamingMovies'])]

#### From the above data we can assume that all these missing values are 0: all of these variables relate to internet service, and these customers have not availed internet service, as the InternetService_No variable is 1 for every one of them.

In [None]:
# To check which one has maximum count 0 or 1 
churn_data['OnlineSecurity'].value_counts()

In [None]:
# ,'OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies'
# To check which one has maximum count 0 or 1 
churn_data['OnlineBackup'].value_counts()

In [None]:
churn_data['DeviceProtection'].value_counts()

In [None]:
churn_data['TechSupport'].value_counts()

In [None]:
churn_data['StreamingTV'].value_counts()

In [None]:
churn_data['StreamingTV'].mode()

In [None]:
churn_data['StreamingMovies'].value_counts()

In [None]:
churn_data['StreamingMovies'].mode()

#### Since in all the above variables 0 appears most of the time we will replace the missing values of all the above columns with 0.

In [None]:
# The six internet add-on columns get the same treatment: missing entries become 0
internet_addon_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
churn_data[internet_addon_cols] = churn_data[internet_addon_cols].fillna(0)

In [None]:
# Checking the sum of missing values in the columns now
churn_data.isnull().sum()

In [None]:
# Checking the percentage of missing values in each of the columns now
round(100*(churn_data.isnull().sum()/churn_data.shape[0]),2)

#### From the above data we can observe that the column TotalCharges has a negligible number and percentage of missing values.

In [None]:
# Keeping only the rows whose TotalCharges value is present
has_total_charges = churn_data['TotalCharges'].notna()
clean_data = churn_data[has_total_charges]

In [None]:
# Checking the number of missing values in all the columns
clean_data.isnull().sum()

#### The data is clean now. Thus we can proceed with the next process.

# 3. TRAIN-TEST SPLIT

In [None]:
# importing train test split library
from sklearn.model_selection import train_test_split

In [None]:
# Putting feature variable to X
X = clean_data.drop(['Churn','customerID'], axis=1)

X.head()

In [None]:
# Putting response variable in y
y = clean_data['Churn']
y.head()

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

# 4. FEATURE SCALING

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardising the continuous columns; the scaler is fitted on the training split
scaler = StandardScaler()
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

scaled_values = scaler.fit_transform(X_train[numeric_cols])
X_train[numeric_cols] = scaled_values

X_train.head()

In [None]:
### Checking the Churn Rate: share of rows with Churn == 1, expressed as a percentage
churn_flags = clean_data['Churn']
churn = (churn_flags.sum() / len(churn_flags.index)) * 100
churn

## This shows that the churn rate is about 27% in the existing data.

# 5. CHECKING CORRELATIONS

In [None]:
# Importing matplotlib and seaborn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Let's see the correlation matrix.
# clean_data still carries the string customerID column. pandas >= 2.0 raises on
# non-numeric columns in corr() instead of silently skipping them, so the frame is
# restricted to numeric/boolean columns first.
plt.figure(figsize=(20, 10))        # Size of the figure
numeric_view = clean_data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_view.corr(), annot=True)
plt.show()

In [None]:
# Dropping the highly correlated dummy variable from both splits.
# columns=... replaces the positional axis argument (drop(labels, 1)), which
# pandas >= 2.0 no longer accepts.
X_test = X_test.drop(columns=['MultipleLines_No'])
X_train = X_train.drop(columns=['MultipleLines_No'])

In [None]:
# Checking the correlation matrix
plt.figure(figsize = (20,10))
sns.heatmap(X_train.corr(),annot = True)
plt.show()

# 6. MODEL BUILDING

In [None]:
# library required for building the model
import statsmodels.api as sm

In [None]:
# LOGISTIC REGRESSION MODEL
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

# 7. FEATURE SELECTION USING RFE

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
from sklearn.feature_selection import RFE
# Running RFE with 15 variables as output.
# n_features_to_select is passed by keyword: current scikit-learn releases accept
# only the estimator positionally, so RFE(logreg, 15) raises a TypeError.
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
rfe.support_

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]

In [None]:
col

In [None]:
X_train.columns[~rfe.support_]

#### Assessing the model using StatsModel

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
# Actual labels and predicted churn probabilities side by side;
# CustID records the row label of each training observation
train_scores = {'Churn': y_train.values, 'Churn_Prob': y_train_pred}
y_train_pred_final = pd.DataFrame(train_scores).assign(CustID=y_train.index)
y_train_pred_final.head()

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

In [None]:
from sklearn import metrics

In [None]:
# Confusion matrix 
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
print(confusion)

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))

## Checking VIFs

In [None]:
# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Table of the currently selected features and their variance inflation factors,
# rounded to two decimals and listed from highest to lowest VIF
selected_features = X_train[col]
vif = pd.DataFrame({'Features': selected_features.columns})
vif['VIF'] = [
    variance_inflation_factor(selected_features.values, position)
    for position in range(selected_features.shape[1])
]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

#### Here Monthly Charges has the highest VIF. So we will drop that column.

In [None]:
# col is a pandas Index, whose drop() has no axis parameter: the stray positional 1
# was being bound to `errors`. Dropping the label alone is the intended call.
col = col.drop('MonthlyCharges')


In [None]:
col

In [None]:
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm3.fit()
res.summary()

In [None]:
# Table of the currently selected features and their variance inflation factors,
# rounded to two decimals and listed from highest to lowest VIF
selected_features = X_train[col]
vif = pd.DataFrame({'Features': selected_features.columns})
vif['VIF'] = [
    variance_inflation_factor(selected_features.values, position)
    for position in range(selected_features.shape[1])
]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

#### Next we will remove Total Charges with high VIF.

In [None]:
# col is a pandas Index, whose drop() has no axis parameter: the stray positional 1
# was being bound to `errors`. Dropping the label alone is the intended call.
col = col.drop('TotalCharges')

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm4 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm4.fit()
res.summary()

In [None]:
# Table of the currently selected features and their variance inflation factors,
# rounded to two decimals and listed from highest to lowest VIF
selected_features = X_train[col]
vif = pd.DataFrame({'Features': selected_features.columns})
vif['VIF'] = [
    variance_inflation_factor(selected_features.values, position)
    for position in range(selected_features.shape[1])
]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

#### Now all the variables are highly significant.

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['Churn_Prob'] = y_train_pred

In [None]:
# Creating new column 'predicted' with 1 if Churn_Prob > 0.5 else 0
y_train_pred_final['predicted'] = y_train_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted))

# 8. CONFUSION MATRIX

In [None]:
# Let's take a look at the confusion matrix again 
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
confusion

In [None]:
# Actual/Predicted     not_churn    churn
        # not_churn        3269      366
        # churn            595       692 

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.predicted)

In [None]:
# Metrics beyond accuracy
# Unpack the 2x2 confusion matrix row by row:
#   first row  -> true negatives, false positives
#   second row -> false negatives, true positives
(TN, FP), (FN, TP) = confusion

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let's see the precision (positive predictive value) of our logistic regression model
TP / float(TP+FP)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
# Calculate false positive rate - predicting churn when the customer has not churned
print(FP/ float(TN+FP))

# PLOTTING ROC CURVE

In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve for a set of predicted scores.

    Parameters
    ----------
    actual : true labels, passed to ``metrics.roc_curve`` and
        ``metrics.roc_auc_score``.
    probs : predicted scores, passed to the same two functions.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, probs, drop_intermediate=False
    )
    area = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    plt.plot(false_pos_rate, true_pos_rate,
             label='ROC curve (area = %0.2f)' % area)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final.Churn, y_train_pred_final.Churn_Prob, drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final.Churn, y_train_pred_final.Churn_Prob)

# 9. FINDING THE OPTIMAL CUT OFF POINT

In [None]:
# One 0/1 prediction column per probability cutoff (0.0, 0.1, ..., 0.9);
# each column is keyed by its cutoff value
numbers = [step / 10 for step in range(10)]
for cutoff in numbers:
    above_cutoff = y_train_pred_final.Churn_Prob > cutoff
    y_train_pred_final[cutoff] = above_cutoff.astype('int64')
y_train_pred_final.head()

In [None]:
# Now let's calculate accuracy, sensitivity and specificity for various probability cutoffs.
cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])
from sklearn.metrics import confusion_matrix

# Confusion-matrix layout: [0,0] = TN, [0,1] = FP, [1,0] = FN, [1,1] = TP

num = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
for i in num:
    # Column `i` of y_train_pred_final holds the 0/1 predictions at cutoff i
    cm1 = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final[i])
    (tn, fp), (fn, tp) = cm1

    accuracy = (tn + tp) / cm1.sum()
    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

#### From the curve above, 0.3 is the optimum point to take it as a cutoff probability.

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final.Churn_Prob.map( lambda x: 1 if x > 0.3 else 0)

y_train_pred_final.head()

# 10. EVALUATION OF THIS MODEL WHICH IS CREATED ACCORDING TO THE OPTIMAL CUT OFF

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the 2x2 confusion matrix row by row:
#   first row  -> true negatives, false positives
#   second row -> false negatives, true positives
(TN, FP), (FN, TP) = confusion2

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
# Calculate false positive rate - predicting churn when the customer has not churned
print(FP/ float(TN+FP))

In [None]:
# Positive predictive value 
print (TP / float(TP+FP))

In [None]:
# Negative predictive value
print (TN / float(TN+ FN))

# 11. PRECISION AND RECALL

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.predicted )
confusion

##### Precision
TP / (TP + FP)

In [None]:
confusion[1,1]/(confusion[0,1]+confusion[1,1])

##### Recall
TP / (TP + FN)

In [None]:
confusion[1,1]/(confusion[1,0]+confusion[1,1])

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
precision_score(y_train_pred_final.Churn, y_train_pred_final.predicted)

# PRECISION AND RECALL TRADEOFF

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
y_train_pred_final.Churn, y_train_pred_final.predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Churn, y_train_pred_final.Churn_Prob)

In [None]:
# Precision and Recall method of finding the optimal value
plt.plot(thresholds, p[:-1], "g-")
plt.plot(thresholds, r[:-1], "r-")
plt.show()

#### Therefore the optimal value is 0.42

# 12. MAKING PREDICTIONS ON THE TEST SET

In [None]:
X_test[['tenure','MonthlyCharges','TotalCharges']] = scaler.transform(X_test[['tenure','MonthlyCharges','TotalCharges']])

In [None]:
X_test = X_test[col]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

In [None]:
y_test_pred = res.predict(X_test_sm)

In [None]:
y_test_pred[:10]

In [None]:
# Converting y_pred to a dataframe which is an array
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Let's see the head
y_pred_1.head()

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
# Putting CustID to index
y_test_df['CustID'] = y_test_df.index

In [None]:
# Removing index for both dataframes to append them side by side 
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Churn_Prob'})

In [None]:
# Let's see the head of y_pred_final
y_pred_final.head()

In [None]:
y_pred_final['final_predicted'] = y_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.42 else 0)

In [None]:
y_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_pred_final.Churn, y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.Churn, y_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the 2x2 confusion matrix row by row:
#   first row  -> true negatives, false positives
#   second row -> false negatives, true positives
(TN, FP), (FN, TP) = confusion2

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

In [None]:
# Precision
TP / float(TP+FP)

### The sensitivity, that is, the proportion of actual churners correctly predicted as churners, is higher on the test set than on the training set.

# This shows that this is a good model.

# BUSINESS ANALYSIS

## The following is the predicted churn behaviour of the customers, listed against their respective customer IDs:

In [None]:
# predicted churn result set
y_pred_final