In [1]:
# necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import time

# machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# evaluation matrices
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### accuracy_score: A metric that calculates the ratio of correctly predicted instances to the total instances.
#### classification_report: A detailed report showing the precision, recall, f1-score, and support for each class in a classification problem.
#### confusion_matrix: A table used to describe the performance of a classification model by showing the true positives, false positives, true negatives, and false negatives

In [2]:
# Load the datasets
X_resampled = pd.read_csv("X_resampled.csv")
y_resampled = pd.read_csv("y_resampled.csv")
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [3]:
# Check the shape of the datasets
print("Shape of X_resampled:", X_resampled.shape)
print("Shape of y_resampled:", y_resampled.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_resampled: (2461812, 8)
Shape of y_resampled: (2461812, 1)
Shape of X_test: (555719, 8)
Shape of y_test: (555719, 1)


In [4]:
X_resampled.columns

Index(['category', 'amt', 'city', 'state', 'zip', 'merch_lat', 'merch_long',
       'merchant_mean_encoded'],
      dtype='object')

#### Standardization - The process of transforming data features to have a mean of zero and a standard deviation of one

In [5]:
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

#### Normalization - Scaling individual data features to a specific range, typically [0, 1] or [-1, 1], to ensure uniformity

In [6]:
scaler = MinMaxScaler()
X_resampled_scaledN = scaler.fit_transform(X_resampled)
X_test_scaledN = scaler.transform(X_test)

---
## 1) Logistic Regression (LG)

#### - A linear model used for binary classification that estimates the probability of a binary outcome based on one or more predictor variables

#### a) LG with standardize data

In [7]:
# Start the timer
start_time = time.time()

In [8]:
# Initialize the logistic regression model
model = LogisticRegression()

# Fit the model to the training data
model.fit(X_resampled_scaled, y_resampled.values.ravel())

In [9]:
# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"LG (Standardized) Accuracy: {accuracy*100:.2f}%")

# Generate classification report
print("\nLG (Standardized) Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nLG (Standardized) Confusion Matrix :")
print(confusion_matrix(y_test, y_pred))

LG (Standardized) Accuracy: 94.36%

LG (Standardized) Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97    553574
           1       0.05      0.76      0.09      2145

    accuracy                           0.94    555719
   macro avg       0.52      0.85      0.53    555719
weighted avg       1.00      0.94      0.97    555719


LG (Standardized) Confusion Matrix :
[[522737  30837]
 [   518   1627]]


In [10]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Logistic regression with Standardization execution completed in {execution_time:.2f} minutes.")

Logistic regression with Standardization execution completed in 0.11 minutes.


#### b) LG with normalized data

In [11]:
# Start the timer
start_time = time.time()

In [12]:
# Initialize the logistic regression model
model_new = LogisticRegression()

# Fit the model to the training data
model_new.fit(X_resampled_scaledN, y_resampled.values.ravel())

In [13]:
# Make predictions on the test data
y_pred = model_new.predict(X_test_scaledN)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"LG (Normalized) Accuracy: {accuracy*100:.2f}%")

# Generate classification report
print("\nLG (Normalized) Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nLG (Normalized) Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

LG (Normalized) Accuracy: 94.69%

LG (Normalized) Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.05      0.75      0.10      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.85      0.54    555719
weighted avg       1.00      0.95      0.97    555719


LG (Normalized) Confusion Matrix:
[[524619  28955]
 [   538   1607]]


In [14]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Logistic regression with Normalization execution completed in {execution_time:.2f} minutes.")

Logistic regression with Normalization execution completed in 0.23 minutes.


---
## 2) Decision Trees

#### A predictive model that uses a tree-like structure of decisions and their possible consequences.

In [15]:
# Initialize the Decision Tree model
model_DT = DecisionTreeClassifier()

In [16]:
# Start the timer
start_time = time.time()

In [17]:
# Fit the model to the training data
model_DT.fit(X_resampled, y_resampled.values.ravel())

# Make predictions on the test data
y_pred = model_DT.predict(X_test)

In [18]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("DT Accuracy:", accuracy)

# Generate classification report
print("\nDT Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nDT Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

DT Accuracy: 0.9849888162902474

DT Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    553574
           1       0.17      0.74      0.27      2145

    accuracy                           0.98    555719
   macro avg       0.58      0.86      0.63    555719
weighted avg       1.00      0.98      0.99    555719


DT Confusion Matrix:
[[545798   7776]
 [   566   1579]]


In [19]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"DT execution completed in {execution_time:.2f} minutes.")

DT execution completed in 2.19 minutes.


---
## 3) Random Forest

#### An ensemble learning method that uses multiple decision trees to improve predictive performance and control overfitting.

In [20]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [21]:
# Start the timer
start_time = time.time()

In [22]:
# Fit the model to the training data
rf_model.fit(X_resampled, y_resampled.values.ravel())

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test)

In [23]:
# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy of Random Forest:", accuracy_rf)

# Generate classification report
print("\nClassification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

# Generate confusion matrix
print("\nConfusion Matrix for Random Forest:")
print(confusion_matrix(y_test, y_pred_rf))

Accuracy of Random Forest: 0.9935452989730421

Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.34      0.74      0.47      2145

    accuracy                           0.99    555719
   macro avg       0.67      0.87      0.73    555719
weighted avg       1.00      0.99      0.99    555719


Confusion Matrix for Random Forest:
[[550550   3024]
 [   563   1582]]


In [24]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"Random Forest execution completed in {execution_time:.2f} minutes")

Random Forest execution completed in 28.43 minutes


---
## 4) Gradient Boosting Machines (GBM)

#### An ensemble technique that builds models sequentially, each one correcting errors of the previous one, to improve accuracy.

In [25]:
# Initialize the Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier()

In [26]:
# Start the timer
start_time = time.time()

In [27]:
# Fit the model to the training data
gbm_model.fit(X_resampled, y_resampled.values.ravel())

# Make predictions on the test data
y_pred = gbm_model.predict(X_test)

In [28]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("GBM Accuracy:", accuracy)

# Generate classification report
print("\nGBM Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nGBM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

GBM Accuracy: 0.9504713713225569

GBM Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    553574
           1       0.07      0.88      0.12      2145

    accuracy                           0.95    555719
   macro avg       0.53      0.92      0.55    555719
weighted avg       1.00      0.95      0.97    555719


GBM Confusion Matrix:
[[526297  27277]
 [   247   1898]]


In [29]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"GBM execution completed in {execution_time:.2f} minutes.")

GBM execution completed in 18.77 minutes.


---
## 5) XGBoost Classifier

#### - An optimized gradient boosting library designed for speed and performance, widely used for structured or tabular data.

In [30]:
# Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [31]:
# Start the timer
start_time = time.time()

In [32]:
# Fit the model to the training data
xgb_model.fit(X_resampled, y_resampled)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

In [33]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("XGBoost Accuracy:", accuracy)

# Generate classification report
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("\nXGBoost Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

XGBoost Accuracy: 0.9803227890354658

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    553574
           1       0.14      0.81      0.24      2145

    accuracy                           0.98    555719
   macro avg       0.57      0.90      0.62    555719
weighted avg       1.00      0.98      0.99    555719


XGBoost Confusion Matrix:
[[543046  10528]
 [   407   1738]]


In [34]:
end_time = time.time()

execution_time = (end_time - start_time)/60

print(f"XGBoost execution completed in {execution_time:.2f} minutes.")

XGBoost execution completed in 0.19 minutes.
