## Churn Prediction Analysis Part 2

In [1]:
# Import all the libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt#visualization
from PIL import  Image
%matplotlib inline
import seaborn as sns#visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objs as go#visualization
import plotly.offline as py#visualization

In [19]:
# Read the cleaned churn data; the relative path is resolved against the working directory
clean_csv_path = "clean_df.csv"
telcom = pd.read_csv(clean_csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame
telcom.head()

In [21]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV - confirm).
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
telcom = telcom.drop(columns=['Unnamed: 0'], errors="ignore")

In [None]:
# Check the frame again after dropping 'Unnamed: 0'
telcom.head()

In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


# Column roles used by the rest of the notebook
Id_col     = ['customerID']   # identifier column
target_col = ["Churn"]        # label column

# Distinct-value count per column, computed once and reused below
n_unique = telcom.nunique()

# Categorical columns: fewer than 6 distinct values, target excluded
cat_cols = [c for c in n_unique[n_unique < 6].index.tolist() if c not in target_col]

# Numerical columns: everything that is neither categorical, target nor id
non_numeric = cat_cols + target_col + Id_col
num_cols = [c for c in telcom.columns if c not in non_numeric]

# Binary columns: exactly 2 distinct values (the target is not filtered out here)
bin_cols = n_unique[n_unique == 2].index.tolist()

# Multi-valued categoricals: categorical but not binary
multi_cols = [c for c in cat_cols if c not in bin_cols]



In [None]:
# Categorical columns that are not in bin_cols
multi_cols

In [25]:
# Label-encode the two-valued columns: each is replaced by LabelEncoder's integer codes
le = LabelEncoder()
for col in bin_cols:
    telcom[col] = le.fit_transform(telcom[col])

# One-hot encode the multi-valued columns.
# dtype=int keeps the indicator columns numeric on every pandas version:
# pandas >= 2.0 would otherwise emit bool columns, which describe() leaves
# out of its default numeric summary.
telcom = pd.get_dummies(data=telcom, columns=multi_cols, dtype=int)


In [None]:
# Inspect the frame after label / one-hot encoding
telcom

In [27]:
#Scaling Numerical columns
# Standardise the numeric columns and keep telcom's row index on the result,
# so an index-based merge back into telcom lines up row-for-row even when
# telcom's index is not a default 0..n-1 range.
std = StandardScaler()
scaled = pd.DataFrame(std.fit_transform(telcom[num_cols]),
                      columns=num_cols,
                      index=telcom.index)

In [None]:
# Inspect the standardised numeric columns
scaled

In [None]:
# Replace the raw numeric columns with their scaled versions, matched on the row index
without_raw_numeric = telcom.drop(columns=num_cols)
telcom = without_raw_numeric.merge(scaled, how="left", left_index=True, right_index=True)

### What is the purpose of the above code?

In [None]:
# Descriptive statistics, transposed so each column of telcom becomes a row
summary = telcom.describe().T
summary

In [None]:
# Variable Summary

In [None]:
# Render the descriptive statistics (id column excluded) as a styled plotly table
feature_cols = [c for c in telcom.columns if c not in Id_col]
summary = telcom[feature_cols].describe().transpose().reset_index()
summary = summary.rename(columns={"index": "feature"})
summary = np.around(summary, 3)

# One Series per table column, in the order the column widths below are listed
stat_names = ['feature', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
val_lst = [summary[name] for name in stat_names]

header_style = dict(values=summary.columns.tolist(),
                    line=dict(color=['#506784']),
                    fill=dict(color=['#119DFF']))
cell_style = dict(values=val_lst,
                  line=dict(color=['#506784']),
                  fill=dict(color=["lightgrey", '#F5F8FF']))

trace = go.Table(header=header_style,
                 cells=cell_style,
                 columnwidth=[200, 60, 100, 100, 60, 60, 80, 80, 80])
layout = go.Layout(dict(title="Variable Summary"))
figure = go.Figure(data=[trace], layout=layout)
py.iplot(figure)

In [None]:
#correlation
# Leave the identifier column out before correlating: it is not a feature, and
# if it is non-numeric (presumably a string id - confirm) pandas >= 2.0 raises
# in corr() instead of silently skipping it as older versions did.
correlation = telcom.drop(columns=Id_col).corr()
#tick labels
matrix_cols = correlation.columns.tolist()

In [None]:
# Copy the correlation values into a plain NumPy array for plotting
corr_array = np.array(correlation.values)

In [None]:
# Display the correlation matrix as a table
correlation

In [None]:
#Plotting
# Heatmap of the pairwise correlations, labelled with the column names on both axes.
# The colorbar title uses the nested title=dict(text=..., side=...) form because
# the flat `titleside` attribute is no longer accepted by current plotly releases.
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = dict(text = "Pearson Correlation coefficient",
                                                  side = "right"
                                                 )
                                    ) ,
                  )

# Fixed-size figure with wide left/bottom margins so the tick labels have room
layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

#### Q. What do you observe?

## Model Building (We will build Decision Tree and Logistic Regression models)

In [None]:
# Separate the features from the class label and split the dataset into training and testing sets

In [None]:
# Import all the modules

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score

In [None]:
# Hold out 25% of the rows as the test set; the fixed random_state makes the split repeatable
train, test = train_test_split(telcom, random_state=111, test_size=0.25)

### Q. What is the purpose of random_state parameter?

In [None]:
# Separate the dependent variable (target) from the independent variables (features)
non_features = Id_col + target_col
cols = [c for c in telcom.columns if c not in non_features]

# Same feature columns for both splits; the targets stay single-column DataFrames
X_train, Y_train = train[cols], train[target_col]
X_test, Y_test = test[cols], test[target_col]

## Logistic Regression

In [None]:
# Fit a logistic regression on the training split (seeded with random_state=0)
classifier = LogisticRegression(random_state=0).fit(X_train, Y_train)
classifier

In [None]:
# Predicting test set
# y_pred holds the logistic regression's predicted class for each row of X_test
y_pred = classifier.predict(X_test)

In [None]:
# Fitted coefficients of the logistic regression
classifier.coef_

In [None]:
#Evaluating the Results
# cm is reused by the confusion-matrix plot in a later cell
cm = confusion_matrix(Y_test, y_pred)
# A cell only echoes its last expression, so every metric is printed explicitly
print("Accuracy : %.4f" % accuracy_score(Y_test, y_pred))
print("Precision: %.4f" % precision_score(Y_test, y_pred))
print("Recall   : %.4f" % recall_score(Y_test, y_pred))
print("F1 score : %.4f" % f1_score(Y_test, y_pred))

### Q. What do the scores mean? Is this a good model fit based on the scores? Make sure you print all the scores.

In [None]:
# Draw the confusion matrix as an annotated heatmap, then report test accuracy
class_labels = (0, 1)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
fig_cm, ax_cm = plt.subplots(figsize=(10, 7))
sns.heatmap(df_cm, fmt='g', annot=True, ax=ax_cm)
print("Test Data Accuracy: %.4f" % accuracy_score(Y_test, y_pred))

## Decision Tree

In [None]:
# Fit a decision tree on the training split (seeded with random_state=2)
model_tree = DecisionTreeClassifier(random_state=2).fit(X_train, Y_train)
model_tree

In [None]:
# Predicting test set
# NOTE: this rebinds y_pred, replacing the logistic regression predictions with the tree's
y_pred = model_tree.predict(X_test)

In [None]:
#Evaluating the Results
# cm is reused by the confusion-matrix plot in a later cell
cm = confusion_matrix(Y_test, y_pred)
# A cell only echoes its last expression, so every metric is printed explicitly
print("Accuracy : %.4f" % accuracy_score(Y_test, y_pred))
print("Precision: %.4f" % precision_score(Y_test, y_pred))
print("Recall   : %.4f" % recall_score(Y_test, y_pred))
print("F1 score : %.4f" % f1_score(Y_test, y_pred))

In [None]:
### Q. What do the scores mean? Is this a good model fit based on the scores? Make sure you print all the scores.

In [None]:
# Draw the confusion matrix as an annotated heatmap, then report test accuracy
class_labels = (0, 1)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
fig_cm, ax_cm = plt.subplots(figsize=(10, 7))
sns.heatmap(df_cm, fmt='g', annot=True, ax=ax_cm)
print("Test Data Accuracy: %.4f" % accuracy_score(Y_test, y_pred))

### Q. Which model performs better? (Hint: compare the metrics)

## K-fold Cross-Validation

### Q. What is K-fold cross validation?

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of `classifier` on the training split; one score per fold
accuracies = cross_val_score(classifier, X_train, Y_train, cv=10)

In [None]:
# Check for accuracies
# Array of the scores returned by cross_val_score (cv=10, so one entry per fold)
accuracies

### Q. What do the accuracies tell us?

In [None]:
# Analyzing Coefficients
# Two-column table pairing each feature name with its fitted coefficient
feature_names = pd.DataFrame(X_train.columns, columns=["features"])
coef_values = pd.DataFrame(classifier.coef_.T, columns=["Coef"])
pd.concat([feature_names, coef_values], axis=1)

## Feature Selection/Feature Engineering

In [None]:

from sklearn.feature_selection import RFE 
# Recursive feature elimination with a logistic regression, keeping 10 features.
# n_features_to_select is passed by keyword: current scikit-learn accepts only
# the estimator positionally and rejects RFE(classifier, 10).
# NOTE: this rebinds `classifier` to a new, unfitted-by-itself estimator.
classifier = LogisticRegression()
rfe = RFE(classifier, n_features_to_select=10)
rfe = rfe.fit(X_train, Y_train)

In [None]:
# Boolean mask from RFE; it is used below to index X_train.columns
print(rfe.support_)

In [None]:
# Names of the columns selected by the RFE mask
X_train.columns[rfe.support_]

In [None]:
# RFE ranking for every column of X_train (selected features are ranked 1)
rfe.ranking_

In [None]:
# Refit the logistic regression using only the columns RFE kept
selected_features = X_train.columns[rfe.support_]
classifier = LogisticRegression(random_state=2)
classifier.fit(X_train[selected_features], Y_train)


In [None]:
#Evaluating the Results
# Predict with the refitted model first. Without this step y_pred still holds
# the decision tree's predictions, so the scores would describe the wrong model.
y_pred = classifier.predict(X_test[X_test.columns[rfe.support_]])
# cm is reused by the confusion-matrix plot in a later cell
cm = confusion_matrix(Y_test, y_pred)
# A cell only echoes its last expression, so every metric is printed explicitly
print("Accuracy : %.4f" % accuracy_score(Y_test, y_pred))
print("Precision: %.4f" % precision_score(Y_test, y_pred))
print("Recall   : %.4f" % recall_score(Y_test, y_pred))
print("F1 score : %.4f" % f1_score(Y_test, y_pred))

In [None]:
# Draw the confusion matrix as an annotated heatmap, then report test accuracy
class_labels = (0, 1)
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
fig_cm, ax_cm = plt.subplots(figsize=(10, 7))
sns.heatmap(df_cm, fmt='g', annot=True, ax=ax_cm)
print("Test Data Accuracy: %.4f" % accuracy_score(Y_test, y_pred))

### Q. Has the model improved after feature selection?

In [None]:
# Subset the coefficients for RFE
# Two-column table pairing each RFE-selected feature with its fitted coefficient
selected_names = pd.DataFrame(X_train.columns[rfe.support_], columns=["features"])
selected_coefs = pd.DataFrame(classifier.coef_.T, columns=["Coef"])
pd.concat([selected_names, selected_coefs], axis=1)

In [None]:
# Build the results table straight from the test split. y_pred is a positional
# array in X_test's row order, and X_test / Y_test are both slices of `test`,
# so taking the id and actual label from `test` keeps every prediction on its
# own row. (Concatenating Y_test with the full customerID column and dropping
# NaNs relies on the row order of the combined index, which is not guaranteed
# to match Y_test's order on every pandas version.)
final_results = test[Id_col + target_col].copy()
final_results['predicted_churn'] = y_pred
final_results = final_results[['customerID', 'Churn', 'predicted_churn']].reset_index(drop = True)

### Q. Print the final Results

### Q. Provide recommendations based on the feature selection. What should the company target to reduce churn?

### Practice Q. Can you add an SVM model for this dataset, perform the same steps, and check the test data accuracy?