#                                       Telco Customer Churn
#                       Focused customer retention programs

## OBJECTIVE: Predict churn to retain customers. 
1. Calculation of Churn Probability and ranking of CustomerIds based on the Prob(Churn)
2. Ranking of Features

# Data Pre-Processing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Get the data

In [None]:
# Load the Telco customer-churn CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv("../input/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Content Analysis

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()                                                  ## data type of each column, missing values, shape of table..

#### Convert TotalCharges column to numeric

In [None]:
# Parse TotalCharges as numbers; errors='coerce' turns unparsable entries into NaN
# so they show up as missing values later on.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Univariate Analysis

### Univariate Analysis for Non-Numeric/Categorical type Variables

In [None]:
# Summarise the object (string) columns: count, unique levels, top level and its frequency.
# The builtin `object` is used because the `np.object` alias is deprecated and
# no longer available in newer NumPy releases.
data.describe(include=[object])

#### What are the levels, and how are they distributed, within each categorical column?

In [None]:
# Start from every column name; non-categorical columns are removed in the next cells.
col_names = data.columns.tolist()

In [None]:
col_names.remove('customerID')

In [None]:
# Drop the continuous numeric columns so that only categorical-style columns remain.
for numeric_col in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    col_names.remove(numeric_col)

In [None]:
col_names

In [None]:
# Print the frequency of every level for each remaining column,
# with a dashed rule before each table.
for column in col_names:
    level_counts = data[column].value_counts()
    print('-----------------------------------', level_counts, sep='\n')

In [None]:
# Draw one histogram per column. Each figure is titled with its column name,
# since otherwise the plots are indistinguishable from one another.
for m in col_names:
    data[m].hist()
    plt.title(m)
    plt.show()

### Univariate Analysis of the Numeric type Variables

In [None]:
data.describe(include=[np.number])

# Missing Value Treatment

#### Where are the missing values?

In [None]:
data.info()                                     ## Check the Missing Value

In [None]:
data.isnull().sum()                               ## Check the number missing value

#### Replace/Impute the Missing Values

In [None]:
## Compute the 10th, 50th (median) and 90th percentiles of TotalCharges.
## The result is a Series indexed by the requested quantile levels.
q = data['TotalCharges'].quantile([0.1, 0.5, 0.9])

In [None]:
type(q)                                                                                 ## one Dimensional labelled Array

In [None]:
q

In [None]:
# Pick the median out of the quantile Series by its label (the 0.5 quantile).
TC_median = q.loc[0.5]

In [None]:
TC_median

In [None]:
#data.loc[null_value].index             ## Indexes of the Missing Values

In [None]:
# Keep the full list of column names and display it.
column_names = data.columns.tolist()
column_names

In [None]:
column_names[18:20]

In [None]:
# Scatter of MonthlyCharges against TotalCharges (low alpha to reveal density).
# Axis labels are written explicitly instead of being looked up by column position,
# so they cannot drift out of sync with the plotted data if the column order changes.
plt.scatter(data['MonthlyCharges'], data['TotalCharges'], alpha=0.1)
plt.xlabel('MonthlyCharges')
plt.ylabel('TotalCharges')

In [None]:
# Scatter of tenure against TotalCharges (very low alpha to reveal density).
# Axis labels are written explicitly instead of being looked up by column position,
# so they cannot drift out of sync with the plotted data if the column order changes.
plt.scatter(data['tenure'], data['TotalCharges'], alpha=0.01)
plt.xlabel('tenure')
plt.ylabel('TotalCharges')

#### Replace the missing values with the median

In [None]:
# Impute missing TotalCharges entries with the median computed earlier.
data['TotalCharges'] = data['TotalCharges'].fillna(TC_median)

In [None]:
data.info()

# OUTLIER Treatment

In [None]:
data.boxplot(column=['MonthlyCharges','tenure'])

In [None]:
data.boxplot(column='TotalCharges')

In [None]:
sns.kdeplot(data.MonthlyCharges)

## Correlation Analysis

In [None]:
print(data[['MonthlyCharges','TotalCharges','tenure']].corr())

In [None]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# avoids the error that DataFrame.corr() raises on string columns in pandas >= 2.0
# and gives the same table older pandas produced by silently skipping them.
print(data.select_dtypes(include=[np.number]).corr())

## Create Dummy Variables

In [None]:
# Modelling frame without the identifier and TotalCharges columns.
# drop() returns a new DataFrame, so `data` itself is left untouched.
data_copy = data.drop(columns=['customerID', 'TotalCharges'])

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first level of
# each encoded column, so a column with k levels yields k-1 indicator columns.
data_dummy=pd.get_dummies(data_copy,drop_first=True)

In [None]:
len(data_dummy.columns)

In [None]:
data_dummy.head()

# Building a Predictive Model

#### PREDICTORS

In [None]:
# Predictors: every column except the encoded target. Selecting by name instead of
# by a hard-coded position keeps this correct if the number of dummy columns changes,
# and fails loudly (KeyError) if the target column is missing.
X = data_dummy.drop(columns='Churn_Yes')

#### TARGET VARIABLE

In [None]:
# Target: the churn indicator produced by get_dummies. It is selected by name
# rather than by a hard-coded column position, which breaks silently whenever
# the number of dummy columns changes.
y = data_dummy['Churn_Yes']

### Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc=StandardScaler()

In [None]:
# Fit the scaler on the training split only, then scale that split.
# NOTE: by default fit_transform returns a NumPy array, so X_train no longer carries
# column names; later cells take feature names from X.columns instead.
X_train=sc.fit_transform(X_train)

In [None]:
# Scale the test split with the statistics learned from the training split
# (transform only, no re-fitting).
X_test=sc.transform(X_test)

### Create the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
logreg = LogisticRegression()

In [None]:
logreg.fit(X_train, y_train)

### Check for Accuracy

In [None]:
# Hard class predictions for the test split (reused by later metric cells),
# followed by the mean accuracy on that split.
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
# Hard class predictions for the training split (reused by the training-set report),
# followed by the mean accuracy on that split.
y_pred1 = logreg.predict(X_train)
train_accuracy = logreg.score(X_train, y_train)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(train_accuracy))

### K Fold Cross Validation

In [None]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of a fresh logistic regression on the training split.
# shuffle=True is needed for random_state to have any effect; current scikit-learn
# raises ValueError when random_state is set on a non-shuffling KFold.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

In [None]:
results.mean()

In [None]:
results.std()

### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Area under the ROC curve must be computed from continuous scores. Feeding hard
# 0/1 predictions collapses the curve to a single operating point and gives a
# value that does not match the curve plotted from predict_proba.
logit_roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

In [None]:
logit_roc_auc

In [None]:
# ROC curve built from the predicted probability of the second class in
# logreg.classes_ (column 1 of predict_proba), i.e. the churn class here.
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
# The legend entry embeds the AUC value stored in logit_roc_auc by the previous cell.
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Dashed red diagonal: reference line of a classifier with no discriminative power.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): no extension is given, so matplotlib chooses the file format.
plt.savefig('Log_ROC')
plt.show()

### Precision and Recall

In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [None]:
print('recall score = ',recall_score(y_test,y_pred))
print('precision score = ',precision_score(y_test,y_pred))

##### Classification Report on Test Set

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

##### Classification Report on Training Set

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_train,y_pred1))

#### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Keep the result under its own name. Assigning it to `confusion_matrix` would
# shadow the imported function and break any later call to it.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

### HyperParameter Tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Create the logistic regression instance used by the grid search.
# The grid covers both 'l1' and 'l2' penalties, so a solver that supports both is
# required; the current default solver ('lbfgs') rejects 'l1'.
logistic = LogisticRegression(solver='liblinear')

In [None]:
# Regularization penalty space: both penalty types are tried for every C value.
penalty = ['l1', 'l2']

# Regularization hyperparameter space: 10 values spaced evenly on a log scale
# from 10**0 to 10**4 (C is the inverse of regularization strength).
C = np.logspace(0, 4, 10)

In [None]:
# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [None]:
# Create grid search using 5-fold cross validation
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

In [None]:
# Fit grid search
best_model = clf.fit(X_train, y_train)

In [None]:
# Report the penalty and C value of the estimator the grid search selected.
tuned_params = best_model.best_estimator_.get_params()
print('Best Penalty:', tuned_params['penalty'])
print('Best C:', tuned_params['C'])

In [None]:
# Test-split predictions and accuracy of the tuned model.
y_pred_GCV = best_model.predict(X_test)
gcv_test_accuracy = best_model.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(gcv_test_accuracy))

In [None]:
# Training-split predictions and accuracy of the tuned model. They are stored under
# their own name so the test-split predictions in y_pred_GCV are not overwritten.
y_pred_GCV_train = best_model.predict(X_train)
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(best_model.score(X_train, y_train)))

## Feature Selection based on Random Forest and Recursive Feature Elimination

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest classifier: 200 entropy-split trees, depth capped at 11,
# trained on all available cores with a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=11,
    criterion='entropy',
    n_jobs=-1,
    random_state=0,
)

# Fit on the training split (fit returns the estimator itself).
rfc_model = rfc.fit(X_train, y_train)

# Hard class predictions for the test split.
y_pred_rfc = rfc_model.predict(X_test)

In [None]:
print('Accuracy of random forest classifier on test set: {:.2f}'.format(rfc_model.score(X_test, y_test)))

In [None]:
print(classification_report(y_test,y_pred_rfc))

In [None]:
# Create a series with feature importance 

rfc_model.feature_importances_

In [None]:
rfc_imp=list(rfc_model.feature_importances_)

In [None]:
rfc_colname=list(X.columns)

In [None]:
rfc_dict={'Column_Names_rfc':rfc_colname,'feature_imp_rfc':rfc_imp}

In [None]:
rfc_feature_imp=pd.DataFrame(rfc_dict)

In [None]:
rfc_feature_rank=rfc_feature_imp.sort_values(by='feature_imp_rfc',ascending = False)

In [None]:
rfc_feature_rank

## RFE Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE

In [None]:
model_rfe=LogisticRegression()

In [None]:
# Eliminate features one at a time down to a single survivor, so ranking_ gives a
# complete ordering of all predictors. n_features_to_select is passed by keyword
# because scikit-learn >= 1.0 no longer accepts it positionally.
rfe = RFE(model_rfe, n_features_to_select=1)

In [None]:
rfe_fit=rfe.fit(X_train,y_train)

In [None]:
rfe_fit.n_features_

In [None]:
rfe_fit.ranking_

In [None]:
rank=list(rfe_fit.ranking_)

In [None]:
X.columns

In [None]:
col_nm=list(X.columns)

In [None]:
dict_rank={'Column_Name': col_nm,'Ranking':rank}

In [None]:
df_rank=pd.DataFrame(dict_rank)

#### Ranking of Predictor Variables Based on their importance in predicting the Churn

In [None]:
df_rank.sort_values('Ranking')

## Churn Probability

In [None]:
y_pred_list=list(y_pred)

In [None]:
y_prob=logreg.predict_proba(X_test)

In [None]:
y_prob_list=list(y_prob)

In [None]:
# Rank test-set customers by predicted churn probability.
# X_test rows are in the same order as y_test, so y_test.index restores the original
# row labels, which are then used to look up each customerID in `data`. Without this
# the table is indexed 0..n-1 and cannot be traced back to a customer.
churn_ranking = pd.DataFrame(y_prob, columns=['No_Churn', 'Churn'], index=y_test.index)
churn_ranking['customerID'] = data.loc[y_test.index, 'customerID']
churn_ranking[['customerID', 'No_Churn', 'Churn']].sort_values(by='Churn', ascending=False).head(20)