**Load Libraries**

In [None]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import seaborn as sns
from random import randrange, uniform
from fancyimpute import KNN


In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score
try:
    # scikit-learn >= 0.18 ships train_test_split in model_selection
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for very old scikit-learn releases
    from sklearn.cross_validation import train_test_split

In [None]:
from sklearn.metrics import confusion_matrix 
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from sklearn.naive_bayes import GaussianNB

**Change Working Directory**

In [None]:

# Working directory that holds the input CSV. Set the CHURN_WORKDIR environment
# variable to run the notebook on another machine; the original author's path
# is kept as the fallback so existing behaviour is unchanged.
os.chdir(os.environ.get("CHURN_WORKDIR", r"C:\Users\RAUNAK\Desktop\edwisor\workspace"))

**Load Data**

In [None]:

# Read the churn data set (a CSV in the current working directory) into a DataFrame.
churn_telecom = pd.read_csv("churn_telecom.csv")


**Exploratory Data Analysis**

In [None]:
# First look at the data: column names, non-null counts and dtypes.
churn_telecom.info()

In [None]:
# state, area.code and phone.number are removed because churn is predicted
# from the usage and plan columns only.
identifier_columns = ['state', 'area.code', 'phone.number']
churn_telecom = churn_telecom.drop(columns=identifier_columns)

In [None]:
# Assign integer levels to the categories of every object-typed column.
# Each such column is replaced by its pandas category codes, and the codes are
# stored with dtype 'object' so that later dtype checks still treat the column
# as categorical. `lis` collects the names of the converted columns.
#
# The column is assigned by name rather than through `.iloc[:, i] = ...`:
# an in-place `.iloc` write keeps the existing object dtype, so the `.cat`
# accessor is not available afterwards and the original loop raised.
lis = []
for column_name in churn_telecom.columns:
    if churn_telecom[column_name].dtype == 'object':
        category_codes = pd.Categorical(churn_telecom[column_name]).codes
        churn_telecom[column_name] = pd.Series(category_codes, index=churn_telecom.index).astype('object')
        lis.append(column_name)

**Missing Values Analysis**

In [None]:
# Number of missing values per column (all zeros means nothing to impute)
churn_telecom.isna().sum()

**Outlier Analysis**

In [None]:
# Box plot of total.day.minutes to visualise outliers in that column

plt.boxplot(churn_telecom['total.day.minutes'])

In [None]:
# Cap outliers in every non-object column at the 1.5 * IQR fences:
# values above q75 + 1.5*IQR are set to that upper fence and values below
# q25 - 1.5*IQR are set to the lower fence.
#
# Series.clip replaces the original `.iloc[<index labels>, i] = fence` writes,
# which used index labels as row positions (only correct for a default
# 0..n-1 index) and wrote float fences into integer columns in place.
for column_name in churn_telecom.columns:
    if churn_telecom[column_name].dtype != 'object':
        q75, q25 = np.percentile(churn_telecom[column_name], [75, 25])
        iqr = q75 - q25
        churn_telecom[column_name] = churn_telecom[column_name].clip(lower=q25 - 1.5 * iqr,
                                                                     upper=q75 + 1.5 * iqr)
        
        
        

**Feature Scaling**

In [None]:
# Normalisation (min-max scaling): rescale each listed column to
# (x - min) / (max - min).
# The minimum and the range are computed once per column with the vectorised
# Series methods instead of the Python builtins min()/max(), which iterate the
# Series element by element and were evaluated repeatedly.
cnames_norm = ["number.vmail.messages",
               "total.intl.calls",
               "number.customer.service.calls"]
for column_name in cnames_norm:
    column_min = churn_telecom[column_name].min()
    column_range = churn_telecom[column_name].max() - column_min
    churn_telecom[column_name] = (churn_telecom[column_name] - column_min) / column_range

In [None]:
# Standardisation (z-score): subtract the column mean and divide by the
# column standard deviation for each listed column.
cnames_stand = [
    "account.length",
    "total.day.minutes",
    "total.day.calls",
    "total.eve.minutes",
    "total.eve.calls",
    "total.night.minutes",
    "total.night.calls",
    "total.intl.minutes",
]

for feature in cnames_stand:
    print(feature)
    centred = churn_telecom[feature] - churn_telecom[feature].mean()
    churn_telecom[feature] = centred / churn_telecom[feature].std()

**Feature Selection**

In [None]:
##Correlation analysis
#Correlation plot

# Continuous predictors included in the correlation matrix
cnames_numeric =["account.length",
                 "total.day.minutes",
                 "total.day.calls",
                 "total.eve.minutes",
                 "total.eve.calls",
                 "total.night.minutes",
                 "total.night.calls",
                 "total.intl.minutes",
                 "total.intl.calls",
                 "number.vmail.messages",
                 "number.customer.service.calls"]

df_corr = churn_telecom.loc[:,cnames_numeric]

#Set the width and height of the plot
f, ax = plt.subplots(figsize=(7, 5))

#Generate correlation matrix
corr = df_corr.corr()

#Plot using seaborn library.
# The mask is all False, so no cell is hidden. The builtin `bool` is used as
# the dtype because the `np.bool` alias has been removed from NumPy.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
ax.set_title("Correlation between numeric features")

In [None]:
# Chi-square test of independence between Churn and each categorical predictor.
# For every predictor its name is printed, followed by the test's p-value.
cnames_object = ["international.plan", "voice.mail.plan"]

for predictor in cnames_object:
    print(predictor)
    contingency_table = pd.crosstab(churn_telecom['Churn'], churn_telecom[predictor])
    _, p_value, _, _ = chi2_contingency(contingency_table)
    print(p_value)

In [None]:
# Dimension reduction: remove the four *.charge columns
charge_columns = [
    'total.day.charge',
    'total.eve.charge',
    'total.night.charge',
    'total.intl.charge',
]
churn_telecom = churn_telecom.drop(columns=charge_columns)


**Model Development**

In [None]:
 #Stratified sampling

# Select the categorical variable whose class proportions the sample must keep.
# (The assignment previously started with a stray leading space, which is an
# IndentationError at cell level.)
y = churn_telecom['international.plan']

# Select a subset using stratified sampling: test_size = 0.8 puts 80% of the
# rows in `Sample` and the remaining 20% in `Rest`, both stratified on `y`.
Rest, Sample = train_test_split(churn_telecom, test_size = 0.8, stratify = y)

In [None]:
# Relabel the target: 0 becomes 'No' and 1 becomes 'Yes'.
# NOTE(review): a later cell casts the target to int — confirm that string
# labels are really wanted at this point.
churn_label_names = {0: 'No', 1: 'Yes'}
churn_telecom['Churn'] = churn_telecom['Churn'].replace(churn_label_names)

In [None]:
# Show the first five rows of the prepared frame.
churn_telecom.head(5)

In [None]:
#Divide data into train and test
# The target sits at column position 13 (presumably 'Churn' — confirm against
# the column order). Features are the columns *before* it: the original slice
# 0:14 also contained column 13, i.e. it leaked the target into X.
target_position = 13
X = churn_telecom.values[:, 0:target_position]
Y = churn_telecom.values[:, target_position]

# The target may already have been relabelled to 'No'/'Yes'; map those labels
# back to 0/1 so the integer cast succeeds. Values that are already numeric
# pass through unchanged.
Y = np.array([{'No': 0, 'Yes': 1}.get(label, label) for label in Y]).astype('int')

# Random 80/20 split of the rows into training and test sets
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.2)

In [None]:
# Decision tree classifier using the entropy split criterion,
# fitted on the training split.
decision_tree = tree.DecisionTreeClassifier(criterion='entropy')
C50_model = decision_tree.fit(X_train, y_train)





In [None]:
# Predict the class of every test row with the fitted decision tree
C50_Predictions = C50_model.predict(X_test)

In [None]:
# Build the confusion matrix: rows are the actual classes, columns the
# predicted classes. Reindexing on both labels (0 and 1) keeps the table 2x2
# even when one class never occurs, so the positional lookups below are safe.
CM = pd.crosstab(y_test, C50_Predictions).reindex(index=[0, 1], columns=[0, 1], fill_value=0)

# Save TN, FN, TP, FP
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Accuracy and false negative rate, both in percent. They are printed
# explicitly: a notebook cell only displays its last bare expression, so the
# accuracy was previously computed and then discarded.
accuracy = ((TP+TN)*100)/(TP+TN+FP+FN)
false_negative_rate = (FN*100)/(FN+TP)
print("Decision tree accuracy (%):", accuracy)
print("Decision tree false negative rate (%):", false_negative_rate)



In [None]:
# Random forest with 20 trees, fitted on the training split
random_forest = RandomForestClassifier(n_estimators=20)
RF_model = random_forest.fit(X_train, y_train)

In [None]:
# Predict the class of every test row with the fitted random forest
RF_Predictions = RF_model.predict(X_test)

In [None]:
# Build the confusion matrix: rows are the actual classes, columns the
# predicted classes. Reindexing on both labels (0 and 1) keeps the table 2x2
# even when one class never occurs, so the positional lookups below are safe.
CM = pd.crosstab(y_test, RF_Predictions).reindex(index=[0, 1], columns=[0, 1], fill_value=0)

# Save TN, FN, TP, FP
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Accuracy and false negative rate, both in percent. They are printed
# explicitly: a notebook cell only displays its last bare expression, so the
# accuracy was previously computed and then discarded.
accuracy = ((TP+TN)*100)/(TP+TN+FP+FN)
false_negative_rate = (FN*100)/(FN+TP)
print("Random forest accuracy (%):", accuracy)
print("Random forest false negative rate (%):", false_negative_rate)



In [None]:
# Prepare the target for logistic regression.
# sm.Logit needs a numeric 0/1 response, so the 'No'/'Yes' labels are mapped
# back to 0/1 (the original replacement ran in the opposite direction and left
# string labels in place). Values that are already numeric pass through
# unchanged before the integer cast.
churn_telecom['Churn'] = (
    churn_telecom['Churn']
    .map(lambda label: {'No': 0, 'Yes': 1}.get(label, label))
    .astype('int')
)

In [None]:
# Start the logistic-regression frame with the target column only.
churn_telecom_logit = pd.DataFrame(churn_telecom['Churn'])

In [None]:
# Append the numeric predictors listed in cnames_numeric (joined on the row index).
churn_telecom_logit = churn_telecom_logit.join(churn_telecom[cnames_numeric])

In [None]:
##Create dummies for categorical variables
# One indicator column per category level is appended to the logit frame,
# named "<variable>_<level>".
# dtype=int is requested explicitly: recent pandas versions return boolean
# indicators by default, and mixing bool with float columns makes the design
# matrix object-typed, which statsmodels rejects.
cat_names = ["international.plan","voice.mail.plan"]

for cat_name in cat_names:
    dummy_columns = pd.get_dummies(churn_telecom[cat_name], prefix=cat_name, dtype=int)
    churn_telecom_logit = churn_telecom_logit.join(dummy_columns)

In [None]:
# Random split: each row is sent to the training set with probability 0.8
# and to the test set otherwise.
Sample_Index = np.random.rand(len(churn_telecom_logit)) < 0.8

# .copy() gives train and test their own data, so adding columns to them does
# not trigger pandas' SettingWithCopyWarning on a slice of churn_telecom_logit.
train = churn_telecom_logit[Sample_Index].copy()
test = churn_telecom_logit[~Sample_Index].copy()

In [None]:
# Select the independent variables: column positions 1 to 12 of `train`.
# Position 0 is skipped; it holds the Churn target the frame was started from.
# NOTE(review): any columns from position 13 onward are left out of the model —
# confirm this cut-off is intentional.
train_cols = train.columns[1:13]

In [None]:
# Build the logistic regression (statsmodels): Churn is regressed on the
# selected predictor columns of the training set.
logit_spec = sm.Logit(train['Churn'], train[train_cols])
logit = logit_spec.fit()

# Coefficient table and fit statistics
logit.summary()

In [None]:
# Score the test set: store the predicted probability, then derive the
# predicted class — 0 where the probability is below 0.5, otherwise 1.
predicted_probability = logit.predict(test[train_cols])
test['Actual_prob'] = predicted_probability

below_threshold = test['Actual_prob'] < 0.5
test['ActualVal'] = (~below_threshold).astype('int64')

In [None]:
# Build the confusion matrix: rows are the actual Churn classes, columns the
# predicted classes. The columns are reindexed on both predicted labels
# (0 and 1) so the table keeps two columns even if one class is never predicted.
CM = pd.crosstab(test['Churn'], test['ActualVal']).reindex(columns=[0, 1], fill_value=0)

# Save TN, FN, TP, FP
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Accuracy and false negative rate, both in percent. They are printed
# explicitly: a notebook cell only displays its last bare expression, so the
# accuracy was previously computed and then discarded.
accuracy = ((TP+TN)*100)/(TP+TN+FP+FN)
false_negative_rate = (FN*100)/(FN+TP)
print("Logistic regression accuracy (%):", accuracy)
print("Logistic regression false negative rate (%):", false_negative_rate)

In [None]:
# Naive Bayes: Gaussian naive Bayes classifier fitted on the training split
naive_bayes = GaussianNB()
NB_model = naive_bayes.fit(X_train, y_train)

In [None]:
# Predict the class of every test row with the fitted naive Bayes model
NB_Predictions = NB_model.predict(X_test)

In [None]:
# Build the confusion matrix: rows are the actual classes, columns the
# predicted classes. Reindexing on both labels (0 and 1) keeps the table 2x2
# even when one class never occurs, so the positional lookups below are safe.
CM = pd.crosstab(y_test, NB_Predictions).reindex(index=[0, 1], columns=[0, 1], fill_value=0)

# Save TN, FN, TP, FP
TN = CM.iloc[0,0]
FN = CM.iloc[1,0]
TP = CM.iloc[1,1]
FP = CM.iloc[0,1]

# Accuracy and false negative rate, both in percent, derived from the counts.
# The earlier accuracy_score call referenced `y_pred`, which is never defined
# in this notebook and raised a NameError; the naive Bayes predictions are
# already captured in CM.
accuracy = ((TP+TN)*100)/(TP+TN+FP+FN)
false_negative_rate = (FN*100)/(FN+TP)
print("Naive Bayes accuracy (%):", accuracy)
print("Naive Bayes false negative rate (%):", false_negative_rate)
