In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

In [16]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [34]:
dataset = pd.read_csv('Churn_Modelling.csv', index_col = 'RowNumber')

In [36]:
dataset.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [38]:
# Features and target
X = dataset.iloc[:, 2:12]
X  # drop CustomerId and Surname

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,619,France,Female,42,2,0.00,1,1,1,101348.88
2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
3,502,France,Female,42,8,159660.80,3,1,0,113931.57
4,699,France,Female,39,1,0.00,2,0,0,93826.63
5,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9996,771,France,Male,39,5,0.00,2,1,0,96270.64
9997,516,France,Male,35,10,57369.61,1,1,1,101699.77
9998,709,France,Female,36,7,0.00,1,0,1,42085.58
9999,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


In [40]:
# Features and target
Y = dataset.iloc[:, 12].values  # Churn column
Y

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

We are treating countries with ordinal values(0 < 1 < 2) but they are incomparable.
To solve this we can use one hot encoding.
We will perform some standardization 

In [43]:
# Pipeline: One-hot encode categorical and scale
pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        transformers=[
            ('gender', OneHotEncoder(drop='first'), ['Gender']),
            ('geo', OneHotEncoder(drop='first'), ['Geography'])
        ],
        remainder='passthrough'
    )),
    ('scaler', StandardScaler())
])

In [45]:
#Standardize the features
X = pipeline.fit_transform(X)

In [47]:
#Spilt the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [49]:
# Build ANN
classifier = Sequential()
classifier.add(Dense(6, activation='relu', input_shape=(X_train.shape[1],)))
classifier.add(Dropout(0.1))
classifier.add(Dense(6, activation='relu'))
classifier.add(Dropout(0.1))
classifier.add(Dense(1, activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [55]:
# Train ANN
history = classifier.fit(X_train, y_train, batch_size=32, epochs=100, validation_split=0.1, verbose=2)

ValueError: You must call `compile()` before using the model.

In [57]:
# Predict
y_pred = (classifier.predict(X_test) > 0.5).astype(int)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [59]:
y_pred = classifier.predict(X_test)
print(y_pred[:5])

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[[0.576405  ]
 [0.68890285]
 [0.8079057 ]
 [0.38708237]
 [0.5681131 ]]


In [61]:
#Let us use confusion matrix with cutoff value as 0.5
y_pred = (y_pred > 0.5).astype(int)
print(y_pred[:5])

[[1]
 [1]
 [1]
 [0]
 [1]]


In [None]:
#Making the Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
#Accuracy of our NN
print(((cm[0][0] + cm[1][1])* 100) / len(y_test), '% of data was classified correctly')