In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict

In [3]:
# Load the raw churn table from the CSV file in the working directory
csv_path = 'churn.csv'
churn = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the loaded table
churn.head(n=5)

Unnamed: 0.1,Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,0,KS,128,415,382-4657,no,yes,25,265.1,110,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,1,OH,107,415,371-7191,no,yes,26,161.6,123,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,2,NJ,137,415,358-1921,no,no,0,243.4,114,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,3,OH,84,408,375-9999,yes,no,0,299.4,71,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,4,OK,75,415,330-6626,yes,no,0,166.7,113,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


### Check for Missing Values ('?') in the Table and Achieve 0.893+ AUC Using 10-Fold Cross-Validation

In [5]:
# Convert the columns that contain missing values to a numeric dtype.
# Entries that cannot be parsed as numbers become NaN (errors='coerce'),
# which makes the affected columns float64.
# pd.to_numeric is called once per column (vectorized) instead of once per
# cell through Series.apply.
columns_with_missing = [
    'Day Charge',
    'Eve Mins',
    'Eve Calls',
    'Night Charge',
    'Intl Calls',
    'Intl Charge',
]
for column in columns_with_missing:
    churn[column] = pd.to_numeric(churn[column], errors='coerce')

In [6]:
# Replace missing values with the mean of the same column, computed over the
# rows that share the same 'Churn?' label.
# NOTE(review): the fill value is derived from the target label and from the
# whole table, before any cross-validation split, so label information can
# leak into the features - confirm this is intended for the exercise.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on churn[column]: that form operates on an intermediate
# object, triggers a chained-assignment warning in recent pandas, and does not
# update the frame when Copy-on-Write is enabled.
columns_to_impute = [
    'Day Charge',
    'Eve Mins',
    'Eve Calls',
    'Night Charge',
    'Intl Calls',
    'Intl Charge',
]
for column in columns_to_impute:
    label_mean = churn.groupby('Churn?')[column].transform('mean')
    churn[column] = churn[column].fillna(label_mean)

In [7]:
# Convert the output label to binary form: 'False.' -> 0, anything else -> 1.
# The conversion is skipped once the column is already numeric, so re-running
# this cell does not turn every label into 1 (an encoded 0 is != 'False.').
if not pd.api.types.is_numeric_dtype(churn['Churn?']):
    churn['Churn?'] = (churn['Churn?'] != 'False.').astype('int64')

In [8]:
# Input features: the columns holding continuous values
feature_names = [
    'Day Mins', 'Day Calls', 'Day Charge',
    'Eve Mins', 'Eve Calls', 'Eve Charge',
    'Night Mins', 'Night Calls', 'Night Charge',
    'Intl Mins', 'Intl Calls', 'Intl Charge',
    'CustServ Calls',
]
X = churn[feature_names]

In [9]:
# Output label: the 'Churn?' column of the table
target_name = 'Churn?'
y = churn[target_name]

In [10]:
# Initialize the 10-fold cross-validation splitter.
# shuffle=True permutes the rows before splitting; a fixed random_state keeps
# the fold assignment identical across runs, so the reported score is
# repeatable after a kernel restart.
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [11]:
# Initialize the Random Forest parameters and construct the classifier.
# The estimator is only built here; it is not fitted in this cell.
# A fixed random_state makes the bootstrap samples and feature subsets drawn
# by the forest repeatable from run to run.
num_trees = 100
max_features = 5
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=42,
)

In [12]:
# Run K-fold cross-validation with the random forest and collect the
# predicted class probabilities for every row
results_predict = cross_val_predict(
    estimator=model,
    X=X,
    y=y,
    cv=kfold,
    method='predict_proba',
)

In [13]:
# Report the area under the ROC curve, scored on the second probability
# column (index 1) of the cross-validated predictions
positive_class_scores = results_predict[:, 1]
AUC = roc_auc_score(y, positive_class_scores, average='macro')
auc_message = 'ROC_AUC is {0:.3f}'.format(AUC)
print(auc_message)

ROC_AUC is 0.901


In [14]:
# Display Accuracy of Model (disabled: the code below is commented out and does not run)
# results = cross_val_score(model,X,y,cv=kfold)
# print("Accuracy: ", results.mean()*100, "+/-", results.std()*100)