In [25]:
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations and array handling
import matplotlib.pyplot as plt  # Plotting and visualization
import seaborn as sns  # Statistical data visualization
from sklearn import datasets, model_selection, preprocessing, metrics  # Machine learning tools
import tensorflow
import tensorflow as tf  # Deep learning framework
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime
import torch  # Deep learning framework
import torchvision  # Computer vision tools for PyTorch
import plotly.express as px  # Interactive plotting
import plotly.graph_objects as go  # More interactive plotting options
import bokeh.plotting as bkp  # Interactive visualization library
import altair as alt  # Declarative statistical visualization library
import gensim  # Topic modeling and document similarity
from textblob import TextBlob  # Text processing and NLP
import pyLDAvis  # Visualization of topic models
import dask.dataframe as dd  # Parallel computing with pandas-like DataFrames
import pyarrow as pa  # Columnar data format for efficient data interchange
import fastparquet  # Parquet file format for efficient data storage
import cv2  # Computer vision and image processing
from skimage import io, filters, color  # Image processing
import albumentations as A  # Image augmentation library
import gym  # Toolkit for developing and comparing reinforcement learning algorithms
from stable_baselines3 import PPO  # Reinforcement learning algorithms
from tqdm import tqdm  # Progress bar for loops
import joblib  # Serialization and deserialization of Python objects
from pydantic import BaseModel  # Data validation and settings management using Python type annotations
from transformers import pipeline  # State-of-the-art Natural Language Processing
import spacy  # Industrial-strength NLP
import nltk  # Natural Language Toolkit
import re
import openai  # OpenAI API for accessing GPT models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pickle

In [28]:
## Load the churn dataset and drop the RowNumber, CustomerId and Surname columns
data = pd.read_csv("Churn_Modelling.csv").drop(columns=['RowNumber', 'CustomerId', 'Surname'])

## Replace the Gender column with the integer codes produced by a LabelEncoder
label_encoder_gender = LabelEncoder()
data['Gender'] = label_encoder_gender.fit_transform(data['Gender'])

## One-hot encode Geography; the encoder output is converted to a dense array with .toarray()
onehot_encoder_geo = OneHotEncoder()
geo_encoder = onehot_encoder_geo.fit_transform(data[['Geography']]).toarray()

## Generated column names (one per category), computed once and reused as the frame header
geo_feature_names = onehot_encoder_geo.get_feature_names_out(['Geography'])
geo_encoded_df = pd.DataFrame(geo_encoder, columns=geo_feature_names)

## Swap the original Geography column for its one-hot columns, then preview the result
data = pd.concat([data.drop(columns='Geography'), geo_encoded_df], axis=1)
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [15]:
# Separate the regression target (EstimatedSalary) from the remaining feature columns
y = data['EstimatedSalary']
X = data.drop(columns='EstimatedSalary')

## Split the data into training and testing sets (20% held out, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Scale the features: fit the scaler on the training split only,
## then apply the same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Save the encoders and scaler for later use.
# Each fitted object is pickled to its own file, in the order listed below.
preprocessing_artifacts = {
    'label_encoder_gender.pkl': label_encoder_gender,
    'onehot_encoder_geo.pkl': onehot_encoder_geo,
    'scaler.pkl': scaler,
}

for artifact_path, fitted_object in preprocessing_artifacts.items():
    with open(artifact_path, 'wb') as file:
        pickle.dump(fitted_object, file)

ANN Regression problem statement

In [17]:
# Build the model
# The input is declared with an explicit Input layer instead of passing
# `input_shape` to the first Dense layer; recent Keras versions emit a
# UserWarning for the `input_shape` form in Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression & by default it has a linear activation function.
])

## compile the model
## The loss is mean absolute error and the same quantity is also tracked as the 'mae' metric,
## so evaluate() returns two values: [loss, mae].
model.compile(optimizer='adam',loss='mean_absolute_error',metrics=['mae'])

model.summary()

## We can see total weights & bias = 2945.
## 12 * 64 + 64 * 32 + 32 * 1 + (64+32+1) = 2945.

## First Layer (Input to Hidden Layer 1)-------------------->
## Weights: There are 12 input features (the 13 columns of `data` minus the EstimatedSalary target) and 64 neurons in the first hidden layer. Each input connects to each neuron in the hidden layer.
## Calculation: 12 * 64 = 768

## Second Layer (Hidden Layer 1 to Hidden Layer 2) ------------------->
## Weights: There are 64 neurons in the first hidden layer and 32 neurons in the second hidden layer. Each neuron in the first hidden layer connects to each neuron in the second hidden layer.
## Calculation: 64 * 32 = 2048

## Third Layer (Hidden Layer 2 to Output Layer)------------------------>
## Weights: There are 32 neurons in the second hidden layer and 1 output neuron. Each neuron in the second hidden layer connects to the output neuron.
## Calculation: 32 * 1 = 32

## Biases----------------------------->
## Biases: Each layer (except the input layer) has biases. There are biases for the 64 neurons in the first hidden layer, 32 neurons in the second hidden layer, and 1 output neuron.
## Calculation: 64 + 32 + 1 = 97

## Total Calculation------------------>
## Sum of all weights and biases: 768 + 2048 + 32 + 97 = 2945

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
## Set up the Tensorboard

# log_dir: timestamped directory, so every training run writes its logs to a separate folder.
log_dir = "regressionlogs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorflow_callback: writes TensorBoard logs (including histograms, histogram_freq=1) during training.
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

## Training budget. PATIENCE must stay below EPOCHS, otherwise early stopping can never trigger
## (the previous patience of 100 with 20 epochs made the callback a no-op).
EPOCHS = 20
PATIENCE = 5

## Set up Early Stopping: stop when val_loss has not improved for PATIENCE consecutive epochs
## and restore the best weights seen during training.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True,
)

### Train the model
## Trains on (X_train, y_train) for at most EPOCHS epochs, using the TensorBoard and EarlyStopping callbacks.
## NOTE(review): the test split doubles as validation data here, so early stopping and the
## restored "best" weights are selected on the test set -- consider a separate validation split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=EPOCHS,
    callbacks=[tensorflow_callback, early_stopping_callback],
)

Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 101314.9375 - mae: 101314.9375 - val_loss: 98487.9688 - val_mae: 98487.9688
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 101054.9844 - mae: 101054.9844 - val_loss: 96830.9688 - val_mae: 96830.9688
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 97737.8516 - mae: 97737.8516 - val_loss: 92654.9688 - val_mae: 92654.9688
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 93440.6172 - mae: 93440.6172 - val_loss: 85787.8828 - val_mae: 85787.8828
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 85439.1953 - mae: 85439.1953 - val_loss: 77124.2344 - val_mae: 77124.2344
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 77342.2500 - mae: 77342.2500 - val_loss: 68185.4609 - val_mae:

In [19]:
## Training Loss (loss): Measures how well the model is performing on the training data. Lower values indicate better performance.
## Mean Absolute Error (mae): Average of the absolute errors between predicted and actual values. Lower values indicate more accurate predictions.
## Validation Loss (val_loss): Measures how well the model is performing on the validation data. Lower values indicate better generalization.
## Validation Mean Absolute Error (val_mae): Average of the absolute errors between predicted and actual values on the validation data. Lower values indicate more accurate predictions.
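## As the epochs progress, both the loss and MAE values decrease, indicating that the model is learning and improving its performance. Because the model is compiled with mean absolute error as its loss, loss and mae (and likewise val_loss and val_mae) are the same quantity in this notebook.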

In [22]:
## Load Tensorboard Extension
%load_ext tensorboard

# This has all of our model information from scratch.
%tensorboard --logdir regressionlogs/fit

%reload_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 12628), started 0:01:43 ago. (Use '!kill 12628' to kill it.)

In [23]:
## Evaluate the model on the test data.
## evaluate() returns two values here: the loss, then the compiled 'mae' metric.
evaluation_scores = model.evaluate(X_test, y_test)
test_loss, test_mae = evaluation_scores
print(f'Test MAE : {test_mae}')

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 772us/step - loss: 51555.2500 - mae: 51555.2500
Test MAE : 50517.59375


In [24]:
## Persist the trained model to 'regression_model.h5' in the working directory
model.save(filepath='regression_model.h5')

