# Churn Prediction using Random Forest

In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# load data
print("Loading dataset ... ")
df = pd.read_csv("churn.csv", header=0, delimiter=",")


# convert strings to boolean for the labels
y = df["Churn?"]
y = np.where(df["Churn?"] == "True.",1,0)


# drop irrelevant columns
X = df.drop(["Unnamed: 0","State","Area Code","Phone","Churn?"],axis=1)
features = X.columns
print("This data set contains %d data and %d features" % X.shape)
print()


# impute missing values
print("Imputing missing values for the following features ...")
X = X.replace('?', np.NaN)
for i in range(1, len(features)):
    if X[features[i]].isnull().values.any():
        print(features[i])
print()
X["Int'l Plan"] = X["Int'l Plan"].fillna(X["Int'l Plan"].mode()[0])
X["VMail Plan"] = X["VMail Plan"].fillna(X["VMail Plan"].mode()[0])
X["Day Charge"] = X["Day Charge"].fillna(X["Day Charge"].median())
X["Day Charge"] = X["Day Charge"].fillna(X["Day Charge"].median())
X["Eve Mins"] = X["Eve Mins"].fillna(X["Eve Mins"].median())
X["Eve Calls"] = X["Eve Calls"].fillna(X["Eve Calls"].median())
X["Night Charge"] = X["Night Charge"].fillna(X["Night Charge"].median())
X["Intl Calls"] = X["Intl Calls"].fillna(X["Intl Calls"].median())
X["Intl Charge"] = X["Intl Charge"].fillna(X["Intl Charge"].median())


# convert strings to boolean
X["Int'l Plan"][X["Int'l Plan"] == "no"] = 0
X["Int'l Plan"][X["Int'l Plan"] == "yes"] = 1
X["VMail Plan"][X["VMail Plan"] == "no"] = 0
X["VMail Plan"][X["VMail Plan"] == "yes"] = 1


# print csv to store the cleaned dataset for future use
clean_df = X.copy()
clean_df["churn"] = y
clean_df.to_csv("churn_clean.csv", index=False, encoding='utf-8')


# standandize features
scaler = StandardScaler()
X = scaler.fit_transform(X)


# Tuning hyper-parameter for random forest classifier using grid search with roc_auc_score on 10-fold cross validation
print("Training random forest classifier and finding the best parameters ...")
parameters = {"n_estimators": [10, 100, 500, 1000],
#               "criterion": ["gini", "entropy"],
              "max_depth": [10, None],
#               "min_samples_split": sp_randint(1, 11),
#               "min_samples_leaf": sp_randint(1, 11),
#               "max_features": ["sqrt", "log2", None]
#               "min_impurity_split": [1e-07],
#               "bootstrap": [True, False],            
              }
grid_search = GridSearchCV(RandomForestClassifier(), parameters, cv=10, scoring="roc_auc", n_jobs=-1, verbose=1)
grid_search.fit(X,y)
print()
print("Best parameters set :")
print(grid_search.best_params_)
print()
print("Best score: %0.3f" % grid_search.best_score_)
print()
print("Grid scores :")
means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))


Loading dataset ... 
This data set contains 3333 data and 17 features

Imputing missing values for the following features ...
Int'l Plan
VMail Plan
Day Charge
Eve Mins
Eve Calls
Night Charge
Intl Calls
Intl Charge



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training random forest classifier and finding the best parameters ...
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  55 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.7min finished



Best parameters set :
{'max_depth': 10, 'n_estimators': 500}

Best score: 0.902

Grid scores :
0.899 (+/-0.073) for {'max_depth': 10, 'n_estimators': 10}
0.902 (+/-0.087) for {'max_depth': 10, 'n_estimators': 100}
0.902 (+/-0.086) for {'max_depth': 10, 'n_estimators': 500}
0.901 (+/-0.092) for {'max_depth': 10, 'n_estimators': 1000}
0.891 (+/-0.067) for {'max_depth': None, 'n_estimators': 10}
0.900 (+/-0.087) for {'max_depth': None, 'n_estimators': 100}
0.897 (+/-0.084) for {'max_depth': None, 'n_estimators': 500}
0.896 (+/-0.089) for {'max_depth': None, 'n_estimators': 1000}
