In [11]:
import keras
import numpy as np
import pandas as pd
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.preprocessing import StandardScaler

In [12]:
# Read the churn CSV from the working directory and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
dataset.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [13]:
# Print the row index, the column labels and the numeric summary statistics,
# in that order, as plain text.
for overview in (dataset.index, dataset.columns, dataset.describe()):
    print(overview)

RangeIndex(start=0, stop=10000, step=1)
Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
         RowNumber    CustomerId   CreditScore           Age        Tenure  \
count  10000.00000  1.000000e+04  10000.000000  10000.000000  10000.000000   
mean    5000.50000  1.569094e+07    650.528800     38.921800      5.012800   
std     2886.89568  7.193619e+04     96.653299     10.487806      2.892174   
min        1.00000  1.556570e+07    350.000000     18.000000      0.000000   
25%     2500.75000  1.562853e+07    584.000000     32.000000      3.000000   
50%     5000.50000  1.569074e+07    652.000000     37.000000      5.000000   
75%     7500.25000  1.575323e+07    718.000000     44.000000      7.000000   
max    10000.00000  1.581569e+07    850.000000     92.000000     10.000000   

             Balance  NumOfProdu

In [14]:
# Feature matrix: every column from 'CreditScore' through 'EstimatedSalary'
# (label slices with .loc are inclusive on both ends). This drops the three
# leading identifier columns RowNumber, CustomerId and Surname.
X = dataset.loc[:, 'CreditScore':'EstimatedSalary'].values
# Target vector: the 'Exited' column.
Y = dataset['Exited'].values

In [15]:
# Integer-encode the two string-valued feature columns in place:
# column 1 is Geography and column 2 is Gender.
le_1 = LabelEncoder()
X[:,1] = le_1.fit_transform(X[:,1])
le_2 = LabelEncoder()
X[:,2] = le_2.fit_transform(X[:,2])

# One-hot encode Geography (column 1). The `categorical_features` argument of
# OneHotEncoder was removed in scikit-learn 0.22, so the column selection is
# expressed with a ColumnTransformer instead. The encoded columns come first
# and the untouched columns follow in their original order, which is the same
# layout the old `categorical_features=[1]` call produced.
# NOTE: ColumnTransformer fits a clone of `ohe`; the fitted encoder is
# available as ct.named_transformers_['geography'].
ohe = OneHotEncoder(categories='auto')
ct = ColumnTransformer(
    transformers=[('geography', ohe, [1])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array
)
# Cast to float so the result has the same dtype the old `.toarray()` gave.
X = np.asarray(ct.fit_transform(X), dtype=float)
# Drop the first dummy column to avoid the dummy-variable trap.
X = X[:,1:]

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [16]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# Learn the scaling statistics on the training rows only, then apply the same
# transformation to both partitions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

KERAS Initialization

In [7]:



# Build a small fully connected network: 11 inputs -> 6 -> 6 -> 1 (sigmoid).
# The layers use the Keras 2 argument names (units / kernel_initializer /
# rate) rather than the Keras 1 names (output_dim / init / p), which newer
# Keras releases no longer accept.
classifier = Sequential()
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
# Dropout randomly zeroes a fraction of the previous layer's outputs during
# training, which reduces over-adaptation (high variance / overfitting).
classifier.add(Dropout(rate=0.1))
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Compile the ANN for binary classification.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Fit the classifier on the scaled training data.
classifier.fit(X_train, y_train, batch_size=10, epochs=100)

In [8]:
# Turn the sigmoid outputs into boolean class predictions: values strictly
# above 0.5 become True.
y_pred = classifier.predict(X_test) > 0.5

Evaluation - Confusion Matrix

In [9]:
# Confusion matrix of the true test labels against the thresholded
# predictions. `confusion_matrix` is imported in the notebook's import cell
# so that all dependencies are declared in one place.
cm = confusion_matrix(y_test,y_pred)

In [10]:
# Accuracy = correctly classified samples (the diagonal of the confusion
# matrix) divided by the total number of samples.
test_accuracy = np.trace(cm) / cm.sum()
test_accuracy

0.8405

<b> K-Fold Cross Validation with Keras </b>

In [11]:


def build_classifier():
    """Build and compile the ANN used for k-fold cross validation.

    The network takes 11 input features, has two hidden ReLU layers of 6
    units each and a single sigmoid output unit. It is compiled with the
    Adam optimizer, binary cross-entropy loss and the accuracy metric.

    The layers use the Keras 2 argument names (``units`` and
    ``kernel_initializer``) instead of the Keras 1 names (``output_dim`` and
    ``init``), which newer Keras releases no longer accept.

    Returns:
        The compiled ``Sequential`` model.
    """
    classifier = Sequential()
    classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=11))
    classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    # Compile the ANN for binary classification.
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

# Wrap the Keras model builder so scikit-learn can treat it as an estimator.
k_fold_classifier = KerasClassifier(
    build_fn=build_classifier,
    batch_size=10,
    epochs=100,
)
# 10-fold cross validation on the training set.
# n_jobs is the number of CPUs to use; -1 means use all of them.
accuracies = cross_val_score(
    estimator=k_fold_classifier,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)


In [12]:
# Display the per-fold scores from the 10-fold cross validation.
accuracies

array([0.845     , 0.83625   , 0.87499999, 0.82999999, 0.84999999,
       0.83125   , 0.85499999, 0.85875   , 0.83624999, 0.8675    ])

<h4><b>Grid Search for Keras</b></h4>

In [17]:
def build_classifier_for_grid(optimizer,output_dim):
    """Build and compile an ANN whose hyper-parameters are set by grid search.

    The network takes 11 input features, has two hidden ReLU layers and a
    single sigmoid output unit, and is compiled with binary cross-entropy
    loss and the accuracy metric.

    The layers use the Keras 2 argument names (``units`` and
    ``kernel_initializer``) instead of the Keras 1 names (``output_dim`` and
    ``init``), which newer Keras releases no longer accept. The function's
    own parameter names are unchanged, so existing parameter grids keyed on
    ``optimizer`` and ``output_dim`` keep working.

    Args:
        optimizer: Optimizer passed straight to ``compile``.
        output_dim: Number of units in each of the two hidden layers.

    Returns:
        The compiled ``Sequential`` model.
    """
    classifier = Sequential()
    classifier.add(Dense(units=output_dim, kernel_initializer='uniform', activation='relu', input_dim=11))
    classifier.add(Dense(units=output_dim, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    # Compile the ANN with the optimizer chosen by the search.
    classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

# Wrap the parameterised builder; batch_size, epochs and the builder's own
# arguments are all supplied by the grid below.
wrapper_classifier = KerasClassifier(build_fn=build_classifier_for_grid)
parameters = {
    'batch_size': [24, 32],
    'epochs': [100, 500],
    'output_dim': [7, 10],
    'optimizer': ['adam', 'rmsprop'],
}
# Exhaustive search over every combination, scored by accuracy with
# 10-fold cross validation.
grid_search = GridSearchCV(
    estimator=wrapper_classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
)
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and the corresponding accuracy.
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_