In [1]:
# necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time

# machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# evaluation matrices
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### accuracy_score: A metric that calculates the ratio of correctly predicted instances to the total instances.
#### classification_report: A detailed report showing the precision, recall, f1-score, and support for each class in a classification problem.
#### confusion_matrix: A table used to describe the performance of a classification model by showing the true positives, false positives, true negatives, and false negatives

In [2]:
# Load the datasets
X_resampled = pd.read_csv("X_resampled.csv")
y_resampled = pd.read_csv("y_resampled.csv")
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [3]:
# Check the shape of the datasets
print("Shape of X_resampled:", X_resampled.shape)
print("Shape of y_resampled:", y_resampled.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_resampled: (2461812, 8)
Shape of y_resampled: (2461812, 1)
Shape of X_test: (555719, 8)
Shape of y_test: (555719, 1)


In [4]:
X_resampled.columns

Index(['category', 'amt', 'city', 'state', 'zip', 'merch_lat', 'merch_long',
       'merchant_mean_encoded'],
      dtype='object')

#### Standardization - The process of transforming data features to have a mean of zero and a standard deviation of one

In [5]:
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

#### Normalization - Scaling individual data features to a specific range, typically [0, 1] or [-1, 1], to ensure uniformity

In [6]:
scaler = MinMaxScaler()
X_resampled_scaledN = scaler.fit_transform(X_resampled)
X_test_scaledN = scaler.transform(X_test)

---
## 1) Logistic Regression (LG)

#### - A linear model used for binary classification that estimates the probability of a binary outcome based on one or more predictor variables

#### a) LG with standardize data

In [7]:
# Start the timer
start_time = time.time()

In [8]:
# Initialize the logistic regression model
model = LogisticRegression()

# Fit the model to the training data
model.fit(X_resampled_scaled, y_resampled.values.ravel())

In [9]:
# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"LG (Standardized) Accuracy: {accuracy*100:.2f}%")

# Generate classification report
print("\nLG (Standardized) Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nLG (Standardized) Confusion Matrix :")
print(confusion_matrix(y_test, y_pred))

LG (Standardized) Accuracy: 94.77%

LG (Standardized) Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.05      0.76      0.10      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.85      0.54    555719
weighted avg       1.00      0.95      0.97    555719


LG (Standardized) Confusion Matrix :
[[525012  28562]
 [   525   1620]]


In [10]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Logistic regression with Standardization execution completed in {execution_time:.2f} minutes.")

Logistic regression with Standardization execution completed in 0.06 minutes.


#### b) LG with normalized data

In [11]:
# Start the timer
start_time = time.time()

In [12]:
# Initialize the logistic regression model
model_new = LogisticRegression()

# Fit the model to the training data
model_new.fit(X_resampled_scaledN, y_resampled.values.ravel())

In [13]:
# Make predictions on the test data
y_pred = model_new.predict(X_test_scaledN)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"LG (Normalized) Accuracy: {accuracy*100:.2f}%")

# Generate classification report
print("\nLG (Normalized) Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nLG (Normalized) Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

LG (Normalized) Accuracy: 95.07%

LG (Normalized) Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.06      0.75      0.10      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.85      0.54    555719
weighted avg       1.00      0.95      0.97    555719


LG (Normalized) Confusion Matrix:
[[526725  26849]
 [   545   1600]]


In [14]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Logistic regression with Normalization execution completed in {execution_time:.2f} minutes.")

Logistic regression with Normalization execution completed in 0.08 minutes.


---
## 2) Decision Trees

#### A predictive model that uses a tree-like structure of decisions and their possible consequences.

In [15]:
# Initialize the Decision Tree model
model_DT = DecisionTreeClassifier()

In [16]:
# Start the timer
start_time = time.time()

In [17]:
# Fit the model to the training data
model_DT.fit(X_resampled, y_resampled.values.ravel())

# Make predictions on the test data
y_pred = model_DT.predict(X_test)

In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("DT Accuracy:", accuracy)

# Generate classification report
print("\nDT Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nDT Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

DT Accuracy: 0.9676131282176783

DT Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98    553574
           1       0.08      0.69      0.14      2145

    accuracy                           0.97    555719
   macro avg       0.54      0.83      0.56    555719
weighted avg       1.00      0.97      0.98    555719


DT Confusion Matrix:
[[536248  17326]
 [   672   1473]]


In [19]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"DT execution completed in {execution_time:.2f} minutes.")

DT execution completed in 0.68 minutes.


---
## 3) Random Forest

#### An ensemble learning method that uses multiple decision trees to improve predictive performance and control overfitting.

In [5]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [6]:
# Start the timer
start_time = time.time()

In [7]:
# Fit the model to the training data
rf_model.fit(X_resampled, y_resampled.values.ravel())

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test)

In [8]:
# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy of Random Forest: {accuracy_rf*100:.2f}%")

# Generate classification report
print("\nClassification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

# Generate confusion matrix
print("\nConfusion Matrix for Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))

Accuracy of Random Forest: 99.56%

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.45      0.71      0.55      2145

    accuracy                           1.00    555719
   macro avg       0.73      0.85      0.77    555719
weighted avg       1.00      1.00      1.00    555719


Confusion Matrix for Random Forest:
[[551743   1831]
 [   631   1514]]


In [9]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Random Forest execution completed in {execution_time:.2f} minutes")

Random Forest execution completed in 9.02 minutes


#### Random Forest using NVIDIA RAPIDS cuml library for faster processing of the model

In [25]:
import cuml
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.metrics import accuracy_score

In [26]:
# Initialize the Random Forest Classifier
clf = cuRF(n_estimators=100, random_state=42)

  return func(**kwargs)


In [27]:
# Start the timer
start_time = time.time()

In [28]:
# Fit the model to the training data
clf.fit(X_resampled, y_resampled.values.ravel())

# Make predictions on the test data
y_pred_curf = clf.predict(X_test)

  ret = func(*args, **kwargs)


In [29]:
# Evaluate the model
accuracy_curf = accuracy_score(y_test, y_pred_curf)
print("Accuracy of Random Forest:", accuracy_curf)

# Generate classification report
print("\nClassification Report for Random Forest:")
print(classification_report(y_test, y_pred_curf))

# Generate confusion matrix
print("\nConfusion Matrix for Random Forest:")
print(confusion_matrix(y_test, y_pred_curf))

Accuracy of Random Forest: 0.9616227149963379

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98    553574
           1       0.07      0.71      0.13      2145

    accuracy                           0.96    555719
   macro avg       0.53      0.84      0.55    555719
weighted avg       1.00      0.96      0.98    555719


Confusion Matrix for Random Forest:
[[532861  20713]
 [   614   1531]]


In [30]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Random Forest with cuml execution completed in {execution_time:.2f} minutes")

Random Forest with cuml execution completed in 0.32 minutes


---
## 4) Gradient Boosting Machines (GBM)

#### An ensemble technique that builds models sequentially, each one correcting errors of the previous one, to improve accuracy.

In [36]:
# Initialize the Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier()

In [37]:
# Start the timer
start_time = time.time()

In [38]:
# Fit the model to the training data
gbm_model.fit(X_resampled, y_resampled.values.ravel())

# Make predictions on the test data
y_pred = gbm_model.predict(X_test)

In [39]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("GBM Accuracy:", accuracy)

# Generate classification report
print("\nGBM Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nGBM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

GBM Accuracy: 0.9508708715438843

GBM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.07      0.88      0.12      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.92      0.55    555719
weighted avg       1.00      0.95      0.97    555719


GBM Confusion Matrix:
[[526521  27053]
 [   249   1896]]


In [40]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"GBM execution completed in {execution_time:.2f} minutes.")

GBM execution completed in 8.87 minutes.


---
## 5) XGBoost Classifier

#### - An optimized gradient boosting library designed for speed and performance, widely used for structured or tabular data.

In [31]:
# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [32]:
# Start the timer
start_time = time.time()

In [33]:
# Fit the model to the training data
xgb_model.fit(X_resampled, y_resampled)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

In [34]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("XGBoost Accuracy:", accuracy)

# Generate classification report
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nXGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

XGBoost Accuracy: 0.975426435470581

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.12      0.81      0.20      2145

    accuracy                           0.98    555719
   macro avg       0.56      0.89      0.59    555719
weighted avg       1.00      0.98      0.98    555719


XGBoost Confusion Matrix:
[[540331  13243]
 [   413   1732]]


In [35]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"XGBoost execution completed in {execution_time:.2f} minutes.")

XGBoost execution completed in 0.12 minutes.


#### saving the model file for deployment

In [11]:
import pickle

In [12]:
pickle.dump(rf_model, open('deploy/classy_cc_transaction_fraud_detection_model.pkl', 'wb'))

In [13]:
pickled_model = pickle.load(open('deploy/classy_cc_transaction_fraud_detection_model.pkl', 'rb'))

In [14]:
y_pred_rf_pickled = pickled_model.predict(X_test)

In [15]:
print(f"Random Forest Pickled model Accuracy: {accuracy_score(y_test, y_pred_rf_pickled)*100:.2f}%")

Random Forest Pickled model Accuracy: 99.56%


In [19]:
pred = pickled_model.predict([[12,141.23,222,-1,31630,31.617466,-82.313895,0.00623053]])
# 11,1004.95,148,-1,16114,42.005098,-80.485252,0.0176586
# 8,875.38,148,-1,16114,40.917701,-80.014586,0.018423746
# 4,283.65,148,-1,16114,41.122993,-79.556792,0.017508754
# 2,15.49,765,-1,83869,47.978198,-117.265082,0.004664179
# 11,821.89,148,-1,16114,40.7932,-80.433995,0.021653543

# 7,19.89,148,-1,16114,42.297246,-79.425116,0.003798155
# 13,9.03,692,-1,41254,38.580916,-81.725325,0.004444444
# 2,55.01,725,-1,78214,29.688173,-98.245038,0.005990783
# 9,34.42,427,-1,56152,45.166439,-94.743248,0.001573564
# 12,141.23,222,-1,31630,31.617466,-82.313895,0.00623053

pred[0]



0