# ANN Project

## Importing Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

## Loading Dataset

In [3]:
# Read the customer churn dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')
data.head(n=5)  # preview the first five rows

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Data Preprocessing

In [4]:
# Drop the row number, customer id and surname columns, which this notebook treats as irrelevant.
# NOTE: `data` is rebound here, so re-running this cell on the already-trimmed frame
# raises KeyError because the three labels are no longer present.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


### Encoding Categorical Data 

In [5]:
# Encode the binary Gender column as integers (the output below shows Female -> 0, Male -> 1).
# Geography is handled separately with one-hot encoding.
label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(data['Gender'])  # learn the set of classes
data['Gender'] = label_encoder_gender.transform(data['Gender'])  # replace the strings with their integer codes
data


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [6]:
# Geography has three values, so it is one-hot encoded rather than label encoded:
# integer codes 0/1/2 would suggest an ordering between countries that the model could
# mistake for magnitude. Gender has only two values, so a 0/1 label encoding is fine there.
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
# The encoder expects 2D input, hence the double brackets that select a one-column DataFrame.
geo_encoder = onehotencoder.fit(data[['Geography']]).transform(data[['Geography']])
geo_encoder


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [7]:
# toarray() converts the sparse matrix into a dense array, the form needed to build a DataFrame.
# Its result is not shown here: a notebook cell displays only its last expression.
geo_encoder.toarray()
# Column names generated for the one-hot encoded Geography values
onehotencoder.get_feature_names_out(input_features=['Geography'])


array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [8]:
# Wrap the dense one-hot array in a DataFrame (a table of rows and columns),
# labelling each column with the name the encoder generated for it.
geo_encoded = pd.DataFrame(
    data=geo_encoder.toarray(),
    columns=onehotencoder.get_feature_names_out(['Geography']),
)
geo_encoded

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [9]:
# Replace the original Geography column with its one-hot columns:
# drop Geography from `data`, then join the two frames side by side (column-wise).
# pd.concat takes a list of frames; a plain pd.concat([data, geo_encoded], axis=1)
# would have kept the original string column as well.
data = pd.concat(objs=[data.drop(columns='Geography'), geo_encoded], axis='columns')
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


* We could have replaced the values in the Geography column directly with encoded values, but instead we use the one-hot encoder to create a separate column for each country and drop the original Geography column. This improves model performance, because models interpret categorical data better when each category is a separate binary feature.

In [10]:
# Persist both fitted encoders so the exact same encodings can be re-applied to new data later.
with open('label_encoders_gender.pkl', 'wb') as gender_file, \
        open('onehot_encoders_geo.pkl', 'wb') as geo_file:
    pickle.dump(label_encoder_gender, gender_file)  # Gender label encoder
    pickle.dump(onehotencoder, geo_file)  # Geography one-hot encoder

### Splitting the dataset into the Training set and Test set

In [11]:
# Separate the target column (Exited) from the feature columns
y = data['Exited']
X = data.drop(columns='Exited')

# Hold out 20% of the rows as the test set; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

### Feature Scaling

In [12]:
# Fit the scaler on the training features only, then apply the same transformation to both
# splits so no statistics from the test set leak into preprocessing.
# NOTE: X_train / X_test are overwritten with their scaled versions; re-running this cell
# without re-running the split would scale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Persist the fitted scaler so new data can be scaled with the training-set statistics
with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

## ANN Implementation

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential # Sequential is used to create a sequential model in which we can add layers one by one
from tensorflow.keras.layers import Dense # Dense layer is used to create a fully connected layer in the neural network , hidden layers are created using dense layer
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard # Early stopping is used to stop the training of the model when the model is not learning anything new
import datetime

In [15]:
# Number of input features: the size of the last (column) axis of the 2D scaled training array
X_train.shape[-1]

12

In [16]:
# Creating the model: a fully connected network built layer by layer with Sequential.
# The input shape is declared with an explicit Input object rather than an `input_shape=`
# argument on the first Dense layer; Keras warns against the latter for Sequential models.
# Only the input needs a declared shape: each later layer infers its input size from the
# output of the layer before it.
model = Sequential(
    [
        tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
        Dense(64, activation='relu'),  # Hidden layer 1 with 64 neurons and relu activation function
        Dense(32, activation='relu'),  # Hidden layer 2 with 32 neurons and relu activation function
        Dense(1, activation='sigmoid'),  # Output layer with 1 neuron and sigmoid activation function
    ]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
model.summary() # print a summary of the model's layers

In [18]:
# Compiling configures the model for training:
#   - loss: binary cross-entropy, the loss function used for binary classification
#   - optimizer: adam, which updates the weights during backpropagation
#   - metrics: accuracy, the percentage of correct predictions, reported while training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [19]:
# Setting up TensorBoard so the training run can be visualized.
# TensorBoard and EarlyStopping are already imported alongside the other Keras imports,
# so they are not imported again in this cell.
# Each run logs to its own sub-directory of logs/fit/, named after the current date and time.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# histogram_freq=1 records weight histograms every epoch
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [20]:
# Early stopping halts training once the model stops improving:
#   - monitor: the validation loss is checked after every epoch
#   - patience: number of epochs without improvement after which training is stopped
#   - restore_best_weights: roll the model back to the weights of its best epoch
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)



### Model Training

In [21]:
# Train the model on the training data and labels.
#   - batch_size: number of samples used for each weight update; 32 follows the common
#     practice of choosing a power of 2.
#   - epochs: upper bound on the number of passes over the training data; early stopping
#     may end training sooner.
#   - validation_data: the model is evaluated on this data after each epoch.
#   - Keras shuffles the training samples before each epoch by default, so a batch is a
#     random group of 32 samples rather than "the first 32, then the next 32".
#   - Repeated passes over the data are what let the model learn its patterns.
# NOTE(review): the test split doubles as validation data here, so early stopping selects
# weights using the test set and the reported val_* figures are optimistic; consider a
# separate validation split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=32,
    callbacks=[tensorboard_callback, early_stopping],
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.8000 - loss: 0.4866 - val_accuracy: 0.8375 - val_loss: 0.4025
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8322 - loss: 0.4057 - val_accuracy: 0.8535 - val_loss: 0.3730
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8507 - loss: 0.3578 - val_accuracy: 0.8630 - val_loss: 0.3449
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8517 - loss: 0.3619 - val_accuracy: 0.8510 - val_loss: 0.3501
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8596 - loss: 0.3405 - val_accuracy: 0.8635 - val_loss: 0.3444
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8628 - loss: 0.3337 - val_accuracy: 0.8675 - val_loss: 0.3375
Epoch 7/100
[1m250/2

In [22]:
# Persist the trained model to disk in HDF5 (.h5) format for later use.
# NOTE(review): recent Keras releases appear to treat HDF5 as a legacy format and recommend
# `.keras`; the file name is kept because other code may load 'model.h5' - confirm before renaming.
model.save(filepath='model.h5')



In [23]:
# Load the TensorBoard notebook extension, which makes the %tensorboard magic available
%load_ext tensorboard

In [25]:
%tensorboard --logdir logs/fit/20250125-191628

Reusing TensorBoard on port 6006 (pid 2492), started 0:01:57 ago. (Use '!kill 2492' to kill it.)