# Task 2 - Model Building and Training

In [1]:
# Import necessary libraries

# Standard library
import os
import sys

# Third-party
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the 'scripts' directory to the Python path for module imports.
# The membership check keeps this cell idempotent: re-running it does not
# append the same directory to sys.path again.
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Project-local helpers (expected to live in scripts/); they must be imported
# after the sys.path update above.
from logger_setup import SetupLogger
from load_data import LoadData

# Set max rows and columns to display
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Configure logging: one shared logger for the notebook
logger = SetupLogger(log_file='../logs/notebooks.log').get_logger()

In [3]:
# One loader per CSV file; both report through the shared notebook logger
load_fraud = LoadData('../data/processed_fraud_data.csv', logger=logger)
load_credit = LoadData('../data/creditcard.csv', logger=logger)

# Read both datasets; the fraud data is indexed by its 'user_id' column
fraud_data = (
    load_fraud
    .load_dataset()
    .set_index('user_id')
)
credit_data = load_credit.load_dataset()

In [7]:
# Preview the first five rows of the fraud data
fraud_data.head(n=5)

Unnamed: 0_level_0,purchase_value,age,class,purchase_delay,hour_of_day,day_of_week,user_transaction_frequency,device_transaction_frequency,user_transaction_velocity,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
22058,-0.160204,0.679914,0,-0.136057,-1.377455,0.99102,0.0,-0.261514,-0.230128,0.0,1.0,0.0,0.0,0.0,0.0,1.0
333320,-1.142592,2.304476,0,-1.571877,-1.522122,-1.501259,0.0,-0.261514,-0.229874,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1359,-1.197169,2.304476,1,-1.577617,0.937208,-0.005891,0.0,3.941861,4.345476,0.0,1.0,0.0,0.0,1.0,0.0,1.0
150084,0.385567,0.911994,0,-1.420213,0.213876,-1.501259,0.0,-0.261514,-0.23012,0.0,1.0,0.0,0.0,0.0,1.0,1.0
221365,0.112681,1.376155,0,-0.182509,0.937208,-0.504347,0.0,-0.261514,-0.230128,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [8]:
# Dimensions of the fraud data as (rows, columns)
fraud_data.shape

(151112, 16)

In [9]:
# Count missing values per column in the fraud data.
# The equivalent check on credit_data is currently disabled:
# print(credit_data.isna().sum())
print(fraud_data.isna().sum())

purchase_value                  0
age                             0
class                           0
purchase_delay                  0
hour_of_day                     0
day_of_week                     0
user_transaction_frequency      0
device_transaction_frequency    0
user_transaction_velocity       0
source_Direct                   0
source_SEO                      0
browser_FireFox                 0
browser_IE                      0
browser_Opera                   0
browser_Safari                  0
sex_M                           0
dtype: int64


In [10]:
from data_preparation import DataPreparation

# Prepare the credit card data (credit_data); the label column is 'Class'
_creditcard = DataPreparation(credit_data, target_column='Class')
# Hold out 20% of the rows for testing, with a fixed seed for repeatability
_creditcard.train_test_split(test_size=0.2, random_state=42)

# Unpack the resulting train/test features and labels
(
    X_train_cc,
    X_test_cc,
    y_train_cc,
    y_test_cc,
) = _creditcard.get_train_test_data()

Data split into training and testing sets successfully.


In [11]:
# Prepare the e-commerce fraud data (fraud_data); the label column is 'class'
_fraud = DataPreparation(fraud_data, target_column='class')
# Hold out 20% of the rows for testing, with a fixed seed for repeatability
_fraud.train_test_split(test_size=0.2, random_state=42)

# Unpack the resulting train/test features and labels
(
    X_train_fd,
    X_test_fd,
    y_train_fd,
    y_test_fd,
) = _fraud.get_train_test_data()

Data split into training and testing sets successfully.


Model Selection
* Import ModelPipeline class from model_pipeline
* Train multiple models
* Tune hyperparameters
* Evaluate the models
* Compare the models

In [12]:
import os
# Disable CUDA so the models in this notebook run on CPU.
# NOTE(review): this presumably has to be set before TensorFlow is first loaded;
# the model_pipeline import below is what triggers the TensorFlow start-up
# messages in the recorded output — keep this assignment above that import.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Disable CUDA

# Import the pipeline class used to train, tune and evaluate the models
from model_pipeline import ModelPipeline

2025-03-01 14:32:19.918934: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-01 14:32:19.930563: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-01 14:32:20.028371: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-01 14:32:20.115379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740828740.217179   70467 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740828740.24

Train and evaluate the models on the e-commerce fraud dataset

In [13]:
# Build the pipeline on the e-commerce fraud train/test split
model_pipeline = ModelPipeline(
    X_train_fd,
    X_test_fd,
    y_train_fd,
    y_test_fd,
)

# (the explicit model_pipeline.add_models() call is left disabled)
# model_pipeline.add_models()

# Train and evaluate every model (runs are logged with MLflow),
# then persist the best one under the 'fraud' label
best_model, best_model_name = model_pipeline.train_and_evaluate()
model_pipeline.save_best_models(best_model, best_model_name, 'fraud')



2025-03-01 14:32:43.057640: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Tuning hyperparameters for Random Forest...
Random Forest best parameters: {'classifier__max_depth': 5, 'classifier__n_estimators': 50}
Tuning hyperparameters for Gradient Boosting...
Gradient Boosting best parameters: {'classifier__learning_rate': 0.01, 'classifier__n_estimators': 100}
Random Forest took 4.94 seconds to train


Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '4' of model 'random_forest'.


Random Forest model trained and logged with MLflow
Gradient Boosting took 34.96 seconds to train


Registered model 'gradient_boosting' already exists. Creating a new version of this model...
Created version '4' of model 'gradient_boosting'.


Gradient Boosting model trained and logged with MLflow
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step




LSTM took 217.69 seconds to train


Registered model 'lstm' already exists. Creating a new version of this model...
Created version '4' of model 'lstm'.


LSTM model trained and logged with MLflow
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step




CNN took 82.89 seconds to train




CNN model trained and logged with MLflow
Random Forest best model saved.


Registered model 'cnn' already exists. Creating a new version of this model...
Created version '4' of model 'cnn'.


In [14]:
# Fetch the evaluation results of the fraud pipeline: a per-model metrics
# mapping and a second value (presumably predicted probabilities — confirm)
results_fraud, y_probs_fraud = model_pipeline.get_results()

In [15]:
# Show the fraud metrics as a table with one row per model.
# NOTE(review): in the recorded run Accuracy, Precision, Recall and F1 are
# identical for all four models (only ROC AUC differs) — worth verifying that
# the pipeline stores thresholded predictions separately for each model.
pd.DataFrame(results_fraud).transpose()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Random Forest,0.956424,1.0,0.537895,0.699521,0.849777
Gradient Boosting,0.956424,1.0,0.537895,0.699521,0.849551
LSTM,0.956424,1.0,0.537895,0.699521,0.78627
CNN,0.956424,1.0,0.537895,0.699521,0.839557


Train and evaluate the models on the credit card dataset

In [17]:
# Train and evaluate on the credit card split.
# The previous version of this cell called get_results() on `model_pipeline`,
# which was built from the e-commerce fraud split, so on a fresh
# Restart & Run All it returned the fraud metrics under a credit card name.
# A dedicated pipeline is used here so the fraud pipeline is not overwritten.
model_pipeline_cc = ModelPipeline(X_train_cc, X_test_cc, y_train_cc, y_test_cc)

# Train and evaluate every model, then persist the best one.
# NOTE(review): 'creditcard' mirrors the 'fraud' label used for the fraud run;
# confirm the label that save_best_models expects for this dataset.
best_model_cc, best_model_name_cc = model_pipeline_cc.train_and_evaluate()
model_pipeline_cc.save_best_models(best_model_cc, best_model_name_cc, 'creditcard')

# Get the results of the credit card run
results_creditcard, y_probs_creditcard = model_pipeline_cc.get_results()

# Backward-compatible alias for the original, misspelled variable name
y_probs_credicard = y_probs_creditcard

Note: Model training and evaluation have been completed, with all processes tracked using MLflow. Please refer to the document for screenshots showcasing the tracked models across different versions.

In [None]:
# Import the model explanation helper
from model_explainer import ModelExplainer

# Build an explainer from the saved fraud model and the fraud test features.
# NOTE(review): the recorded training run printed "Random Forest best model saved.",
# while this path names a gradient boosting model — confirm that this file exists
# and is the model intended for explanation.
fraud_explainer = ModelExplainer(
    '../app/gradient_boosting_fraud_best_model.pkl',
    X_test_fd,
)

# Explain the instance at index 0 of the test features
fraud_explainer.explain_model(instance_idx=0)