In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
from sklearn.preprocessing import StandardScaler

In [8]:
# Read the churn CSV from the working directory into a DataFrame and preview the first rows.
df=pd.read_csv("Churn_Modelling.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
# Print column dtypes and non-null counts to spot missing data and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [10]:
# Drop the row-number, customer-id and surname columns before modelling.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

In [11]:
# Preview the frame after the identifier columns were dropped.
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [12]:
# Count missing values per column.
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [13]:
# Convert the categorical columns into numerical columns.
# Gender is mapped to integer codes with a LabelEncoder.
# NOTE: this overwrites df['Gender'] in place, so re-running the cell would
# re-fit the encoder on the already-encoded integers instead of the labels.

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df['Gender'])
df['Gender'] = encoder.transform(df['Gender'])

In [14]:
# Inspect five random rows; no random_state is passed, so the rows shown differ between runs.
df.sample(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
841,818,France,0,31,1,186796.37,1,0,0,178252.63,0
9488,671,Spain,1,32,6,123912.78,2,1,1,146636.44,0
7458,728,Spain,0,43,5,0.0,1,1,1,120088.17,0
6403,850,Germany,1,55,0,98710.89,1,1,1,83617.17,1
2861,525,France,0,25,6,0.0,2,1,0,89566.64,0


In [15]:
# Frequency of each encoded Gender value.
df['Gender'].value_counts()

Gender
1    5457
0    4543
Name: count, dtype: int64

In [16]:
# Geography column using OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

ohencoder = OneHotEncoder()
ohencoder.fit(df[['Geography']])
# The transform result is converted with .toarray() only for display here.
geography_encoder = ohencoder.transform(df[['Geography']])
geography_encoder.toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [17]:
# Column names produced by the fitted encoder, prefixed with 'Geography'.
ohencoder.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [18]:
# Wrap the dense one-hot matrix in a DataFrame with readable column names.
# Reuse df's index so the later column-wise concat aligns row-for-row even if
# df's index is ever not the default 0..n-1 range (e.g. after filtering rows);
# with a fresh default index, a mismatch would silently produce NaN rows.
geo_encoded_df = pd.DataFrame(
    geography_encoder.toarray(),
    columns=ohencoder.get_feature_names_out(['Geography']),
    index=df.index,
)

In [19]:
# Preview the one-hot encoded Geography columns.
geo_encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [20]:
# Swap the raw Geography column for its one-hot encoded columns.
df = pd.concat(
    [df.drop('Geography', axis='columns'), geo_encoded_df],
    axis='columns',
)

In [21]:
# Preview the frame with Geography replaced by its one-hot columns.
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [22]:
# Serialize both fitted encoders to disk, one pickle file per encoder.
for pkl_path, fitted_encoder in (
    ('label_encoder.pkl', encoder),
    ('onehotencoder.pkl', ohencoder),
):
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(fitted_encoder, pkl_file)

In [23]:
# Separate the target (Exited) from the independent features.
y = df['Exited']
x = df.drop(columns='Exited')

# Hold out 20% of the rows for testing; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [24]:
# Sanity-check the dimensions (rows, features) of the scaled training matrix.
x_train_scaled.shape

(8000, 12)

In [25]:
# Serialize the fitted scaler to scaler.pkl.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## ANN implementation

In [26]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [29]:
# Build the model.
# Seed the Python, NumPy and TensorFlow random generators first so that weight
# initialisation (and any later shuffling that relies on these generators) is
# repeatable across Restart & Run All.
tensorflow.keras.utils.set_random_seed(42)

model = Sequential([
    # Hidden layer 1, connected to the input layer (one input per scaled feature)
    Dense(64, activation='relu', input_shape=(x_train_scaled.shape[1],)),
    # Hidden layer 2
    Dense(32, activation='relu'),
    # Output layer: a single sigmoid unit
    Dense(1, activation='sigmoid'),
])

In [30]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                832       
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
# Adam optimizer with an explicit learning rate of 0.01.
opt=tensorflow.keras.optimizers.Adam(learning_rate=0.01)
# Binary cross-entropy loss object.
loss=tensorflow.keras.losses.BinaryCrossentropy()
# Bare expression: shows the loss object's repr as the cell output.
loss

<keras.src.losses.BinaryCrossentropy at 0x17b633b3310>

In [39]:
# Compile the model.
# Pass the BinaryCrossentropy instance created together with the optimizer
# instead of repeating the loss as a string, so the loss is configured in
# exactly one place and the `loss` variable is not left unused.
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [40]:
# Set up the TensorBoard log directory.
# Each run gets its own timestamped sub-directory *inside* logs/fit. The
# trailing slash matters: without it the result is "logs/fit<timestamp>",
# a sibling of logs/fit that a viewer pointed at logs/fit would not see.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [41]:
# TensorBoard callback writing to log_dir, with histogram_freq set to 1.
# NOTE(review): the variable is named tensorflow_callback but holds a TensorBoard
# callback; the fit cell refers to it by this name, so rename both together if at all.
tensorflow_callback=TensorBoard(log_dir=log_dir, histogram_freq=1)

In [42]:
# Set up early stopping on the validation loss: patience is 10 epochs and
# the best weights seen are restored when training stops.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [43]:
# Train for up to 100 epochs; the early-stopping callback may end the run sooner.
# NOTE(review): the test split doubles as validation data here, so it also drives
# early stopping — consider carving out a separate validation split if an
# unbiased test score is needed later.
history = model.fit(
    x_train_scaled,
    y_train,
    epochs=100,
    validation_data=(x_test_scaled, y_test),
    callbacks=[early_stopping_callback, tensorflow_callback],
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


In [44]:
# Save the trained model to model.h5.
# NOTE(review): the cell output shows a warning raised from saving_api.save_model —
# presumably the legacy-HDF5 notice. Switching to a .keras file would require any
# code that loads model.h5 to change as well — confirm before renaming.
model.save('model.h5')

  saving_api.save_model(


In [45]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard