In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


Matplotlib is building the font cache; this may take a moment.


In [2]:
df=pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Here we dont want the data of the RowNumber CustomerId Surname 
Because it will not much effect the dependent variable
so, we simply drop that columns

In [3]:
df.drop(columns=['RowNumber','CustomerId','Surname'],axis=1,inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


We are having the categorical columns as Geography, Gender, so we need to change this to numeric 

In [4]:
#Label encoder for gender column
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['Gender']=le.fit_transform(df['Gender'])

In [5]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [14]:
#One Hot encoding for geography col
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder()
ohe_geo=ohe.fit_transform(df[['Geography']])
ohe_geo

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [15]:
#convering the sparse matrix into array
geo=ohe_geo.toarray()
geo

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [19]:
ohe.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [20]:
encoded_df=pd.DataFrame(geo,columns=ohe.get_feature_names_out(['Geography']))
encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [23]:
#drop the original column adn add these ohe cols
final_df=pd.concat([df.drop('Geography',axis=1),encoded_df],axis=1)
final_df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


Dependent and Independent Features

In [27]:
X=final_df.drop('Exited',axis=1)
y=final_df['Exited']
X.shape,y.shape

((10000, 12), (10000,))

Train test split

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=18)
X_train.shape,y_train.shape

((7000, 12), (7000,))

Standerdizing the data

In [29]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)
X_train

array([[ 0.75146204, -1.09349623,  2.77791433, ..., -1.00515615,
         1.71247898, -0.56679212],
       [ 1.45454019, -1.09349623, -0.56205559, ...,  0.9948703 ,
        -0.58394877, -0.56679212],
       [-1.62659642, -1.09349623, -0.18034474, ...,  0.9948703 ,
        -0.58394877, -0.56679212],
       ...,
       [ 1.29944943,  0.9144979 , -0.08491703, ..., -1.00515615,
         1.71247898, -0.56679212],
       [ 0.10008081,  0.9144979 ,  0.29679382, ..., -1.00515615,
        -0.58394877,  1.7643153 ],
       [ 0.46195927,  0.9144979 ,  0.9647878 , ...,  0.9948703 ,
        -0.58394877, -0.56679212]])

Saving the all transformations into pickle files

In [33]:
import pickle
with open('D:\DL\ANN Project\labelencoder.pkl','wb') as file:
    pickle.dump(le,file)

with open('D:\DL\ANN Project\onehotencoder.pkl','wb') as file:
    pickle.dump(ohe,file)

with open('D:\DL\ANN Project\standardscaler.pkl','wb') as file:
    pickle.dump(ss,file)

ANN MODEL

In [34]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [35]:
X_train.shape[1]

12

In [42]:
#creating a model
model=Sequential([Dense(units=12,activation='relu',input_shape=(X_train.shape[1],)),
                  Dense(units=64,activation='relu'),
                  Dense(units=32,activation='relu'),
                  Dense(units=1,activation='sigmoid')
                  ])

In [43]:
model.summary()

In [44]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [45]:
model.fit(X_train, y_train, epochs=50, batch_size=32)

Epoch 1/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7577 - loss: 0.5239
Epoch 2/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8092 - loss: 0.4373
Epoch 3/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8177 - loss: 0.4191
Epoch 4/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8207 - loss: 0.4177
Epoch 5/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8305 - loss: 0.3964
Epoch 6/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8382 - loss: 0.3845
Epoch 7/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8377 - loss: 0.3776
Epoch 8/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8527 - loss: 0.3509
Epoch 9/50
[1m219/219[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x16e6a346090>

In [47]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
hystory=model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.8705 - loss: 0.2952 - val_accuracy: 0.8530 - val_loss: 0.3612
Epoch 2/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8789 - loss: 0.3016 - val_accuracy: 0.8587 - val_loss: 0.3593
Epoch 3/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8797 - loss: 0.2916 - val_accuracy: 0.8563 - val_loss: 0.3553
Epoch 4/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8740 - loss: 0.2982 - val_accuracy: 0.8560 - val_loss: 0.3570
Epoch 5/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8771 - loss: 0.2887 - val_accuracy: 0.8560 - val_loss: 0.3562
Epoch 6/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8807 - loss: 0.2930 - val_accuracy: 0.8563 - val_loss: 0.3569
Epoch 7/50
[1m219/219[0m 

Model prediction on testdata

In [93]:
y_pred=model.predict(X_test)
y_pred

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


array([[0.12627791],
       [0.00953373],
       [0.05064221],
       ...,
       [0.25005534],
       [0.44512725],
       [0.1985737 ]], dtype=float32)

In [102]:
y_pred_proba = y_pred.flatten()

In [103]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Convert probabilities to class labels
y_pred_labels = (y_pred_proba > 0.5).astype(int)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred_labels)
precision = precision_score(y_test, y_pred_labels)
recall = recall_score(y_test, y_pred_labels)
f1 = f1_score(y_test, y_pred_labels)
conf_matrix = confusion_matrix(y_test, y_pred_labels)

# For ROC AUC, use the probabilities (not labels)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'ROC AUC: {roc_auc:.2f}')


Accuracy: 0.86
Precision: 0.73
Recall: 0.50
F1 Score: 0.59
Confusion Matrix:
[[2256  115]
 [ 316  313]]
ROC AUC: 0.85


In [49]:
model.save('model.h5')



Loading the pickle files

In [65]:
from tensorflow.keras.models import load_model

with open('onehotencoder.pkl','rb') as file:
    ohe=pickle.load(file)

with open('labelencoder.pkl','rb') as file:
    le=pickle.load(file)

with open('standardscaler.pkl','rb') as file:
    ss=pickle.load(file)

model=load_model('model.h5')



In [62]:
input_data = {
    'CreditScore': 720,
    'Geography': 'France',  # This would typically be one-hot encoded or mapped to numerical values in practice
    'Gender': 'Female',  # This would typically be one-hot encoded or mapped to numerical values in practice
    'Age': 34,
    'Tenure': 7,
    'Balance': 9500,
    'NumOfProducts': 3,
    'HasCrCard': 1,  # Assuming 1 means Yes and 0 means No
    'IsActiveMember': 0,  # Assuming 1 means Yes and 0 means No
    'EstimatedSalary': 60000
}


In [59]:
encoded_gender=le.transform([input_data['Gender']])
encoded_gender

array([0])

In [66]:
encoded_geo=ohe.transform([[input_data['Geography']]])
encoded_geo



<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1 stored elements and shape (1, 3)>

In [67]:
encoded_geo

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1 stored elements and shape (1, 3)>

In [68]:
encoded_geo.toarray()

array([[1., 0., 0.]])

In [71]:
encoded_df=pd.DataFrame(encoded_geo.toarray(),columns=ohe.get_feature_names_out(['Geography']))
encoded_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0


In [77]:
dict_df=pd.DataFrame([input_data])

In [83]:
dict_df['Gender']=encoded_gender

In [84]:
dict_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,720,France,0,34,7,9500,3,1,0,60000


In [85]:
test_df=pd.concat([dict_df.drop(columns=['Geography'],axis=1),encoded_df],axis=1)

In [86]:
test_df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,720,0,34,7,9500,3,1,0,60000,1.0,0.0,0.0


In [88]:
input_scaled=ss.transform(test_df)
input_scaled

array([[ 0.72044389, -1.09349623, -0.46662788,  0.69088627, -1.07972994,
         2.55467339,  0.64620273, -1.04023747, -0.69160292,  0.9948703 ,
        -0.58394877, -0.56679212]])

In [89]:
prediction=model.predict(input_scaled)
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461ms/step


array([[0.12328208]], dtype=float32)

In [90]:
predict_proba=prediction[0][0]
predict_proba

0.12328208

In [92]:
if predict_proba > 0.5:
    print('The customer is likely to churn')
else:
    print('The customer is not likely to churn')

The customer is not likely to churn
