### Model Development

Import libraries

In [19]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Make the parent of the notebook's working directory importable
# (presumably so that the `scripts` package imported in the next cell resolves).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library warnings raised during
# model training (e.g. non-convergence) — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [20]:
from scripts.model_development_scripts import ModelPipeline

In [21]:
# Read the two input tables from ../data into DataFrames.
# NOTE(review): `credit_data` is not referenced by any other visible cell — the
# credit-card pipeline is handed the CSV path directly — confirm this read is still needed.
fraud_data = pd.read_csv(filepath_or_buffer='../data/proccessed_fraud_data.csv')
credit_data = pd.read_csv(filepath_or_buffer='../data/creditcard.csv')

In [22]:
# Preview the first five rows to sanity-check the loaded columns
fraud_data.head(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,class,ip_int,country,hour_of_day,...,time_diff,transaction_frequency,average_velocity,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M
0,2,2015-01-11 03:47:13,2015-02-21 10:03:37,0.310345,FGBQNDNBETFJJ,0.12069,0,880217484,United States,10,...,0.0,1,0.0,0,1,0,0,0,0,0
1,4,2015-06-02 16:40:57,2015-09-26 21:32:16,0.22069,MKFUIVOHLJBYN,0.344828,0,2785906106,Switzerland,21,...,0.0,1,0.0,1,0,0,0,0,1,0
2,8,2015-05-28 07:53:06,2015-08-13 11:53:07,0.262069,SCQGQALXBUQZJ,0.12069,0,356056736,United States,11,...,0.0,1,0.0,0,1,0,0,0,0,1
3,12,2015-01-10 06:25:12,2015-03-04 20:56:37,0.17931,MSNWCFEHKTIOY,0.017241,0,2985180352,Mexico,20,...,0.0,1,0.0,0,0,0,0,0,1,1
4,16,2015-02-03 13:48:23,2015-03-12 12:46:23,0.0,FROZWSSWOHZBE,0.241379,0,578312545,United States,12,...,0.0,1,0.0,1,0,0,1,0,0,1


In [23]:
# List the distinct values of `transaction_frequency`; a single distinct value
# means the column is constant across all rows.
fraud_data['transaction_frequency'].unique()

array([1], dtype=int64)

In [24]:
# Drop the columns that are not carried forward as model features.
# Note: this cell is not re-runnable on its own — a second run raises KeyError
# because the columns are already gone.
fraud_data = fraud_data.drop(
    columns=[
        'user_id',
        'device_id',
        'transaction_frequency',
        'time_diff',
        'average_velocity',
        'ip_int',
        'signup_time',
        'purchase_time',
    ]
)

#### Frequency Encoding
In this method, I replace each category with its frequency in the fraud dataset. This can help maintain some information about the category without expanding the feature space too much.

In [25]:
# Frequency-encode `country`: every country name is replaced by the number of
# rows in which it occurs, and the original text column is removed.
# NOTE(review): the counts (and the scaler below) are fitted on the whole table,
# i.e. before the pipeline's train/test split — confirm this leakage is acceptable.
frequency = fraud_data['country'].value_counts()
fraud_data = (
    fraud_data
    .assign(country_encoded=fraud_data['country'].map(frequency))
    .drop(columns='country')
)

# Min-max scale the counts so the new feature is on a comparable scale to the
# other numeric columns.
scaler = MinMaxScaler()
fraud_data['country_encoded'] = scaler.fit_transform(fraud_data[['country_encoded']]).ravel()


In [26]:
# Persist the encoded feature table (without the index column) so it can be
# loaded by file path in the modelling cells.
fraud_data.to_csv(path_or_buf='../data/final_preprocessed_fraud_data.csv', index=False)

In [27]:
# Display the final feature table (pandas renders a truncated head/tail preview)
fraud_data

Unnamed: 0,purchase_value,age,class,hour_of_day,day_of_week,source_Direct,source_SEO,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_M,country_encoded
0,0.310345,0.120690,0,10,5,0,1,0,0,0,0,0,1.000000
1,0.220690,0.344828,0,21,5,1,0,0,0,0,1,0,0.013506
2,0.262069,0.120690,0,11,3,0,1,0,0,0,0,1,1.000000
3,0.179310,0.017241,0,20,2,0,0,0,0,0,1,1,0.019294
4,0.000000,0.241379,0,12,3,1,0,0,1,0,0,1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129141,0.503448,0.706897,0,7,2,1,0,0,0,0,1,1,0.054438
129142,0.075862,0.327586,0,7,3,1,0,0,1,0,0,0,0.062793
129143,0.165517,0.293103,0,23,4,1,0,1,0,0,0,0,0.125844
129144,0.393103,0.517241,0,20,2,1,0,0,0,0,0,1,1.000000


### Model Development for Fraud Detection

In [28]:
# Create the pipeline for the 'fraud' dataset, pointing it at the encoded
# feature table written to disk by the earlier to_csv cell.
fraud_data_file_path = '../data/final_preprocessed_fraud_data.csv'
fraud_pipeline = ModelPipeline('fraud', fraud_data_file_path)

In [29]:
# Load the fraud data from the configured CSV path into the pipeline
fraud_pipeline.load_data()

2024-10-22 21:21:22,617 - INFO - Loading fraud data from ../data/final_preprocessed_fraud_data.csv...
2024-10-22 21:21:23,439 - INFO - Data loading complete.


In [30]:
# Split the loaded data into training and test sets
fraud_pipeline.split_data()

2024-10-22 21:21:25,355 - INFO - Data has been split into train and test sets.


In [31]:
# Apply SMOTE oversampling to the fraud training data to fix the class imbalance
# (per the log output, only the training split is resampled).
fraud_pipeline.apply_smote()

2024-10-22 21:21:37,479 - INFO - Applying SMOTE to the training data...
2024-10-22 21:21:39,702 - INFO - SMOTE applied to training data. Classes have been balanced.


In [32]:
# Candidate classifiers for the fraud dataset, each paired with the display name
# that is passed to the pipeline's train / evaluate / log calls.
# random_state is pinned on every estimator so that repeated runs train the
# same models; 42 is an arbitrary seed.
# NOTE(review): the train/test split and SMOTE steps run inside ModelPipeline and
# may need their own seed for the reported metrics to be fully reproducible.
models = [
    (LogisticRegression(random_state=42), 'Logistic Regression'),
    (DecisionTreeClassifier(random_state=42), 'Decision Tree'),
    (RandomForestClassifier(random_state=42), 'Random Forest'),
    (GradientBoostingClassifier(random_state=42), 'Gradient Boosting'),
    (MLPClassifier(random_state=42), 'MLP'),
]

In [33]:
# For every candidate: fit it through the fraud pipeline, produce its
# evaluation report, then log the model together with that report to MLflow.
# `report` is overwritten on each pass, so only the last model's report
# remains in the namespace after the loop.
for model, name in models:
    fraud_pipeline.train_model(model, name)
    report = fraud_pipeline.evaluate_model(model, name)
    fraud_pipeline.log_model(model, name, report)

2024-10-22 21:21:47,983 - INFO - Training Logistic Regression on fraud dataset...
2024-10-22 21:21:51,225 - INFO - Logistic Regression training complete.
2024-10-22 21:21:51,228 - INFO - Evaluating Logistic Regression on fraud dataset...
2024-10-22 21:21:51,696 - INFO - Logistic Regression evaluation report:
              precision    recall  f1-score   support

           0       0.91      0.59      0.72     23389
           1       0.10      0.42      0.16      2441

    accuracy                           0.58     25830
   macro avg       0.50      0.51      0.44     25830
weighted avg       0.83      0.58      0.67     25830

2024-10-22 21:21:51,720 - INFO - Logging Logistic Regression to MLflow...
2024-10-22 21:22:48,286 - INFO - Logistic Regression has been logged and saved in MLflow as version 2.
2024-10-22 21:22:48,532 - INFO - Training Decision Tree on fraud dataset...
2024-10-22 21:22:52,422 - INFO - Decision Tree training complete.
2024-10-22 21:22:52,444 - INFO - Evaluating 

## Model Training and Evaluation Summary

### Logistic Regression
- **Training Completed**: Successfully trained on the fraud dataset.
- **Evaluation**:
  - **Accuracy**: 58%
  - **Precision**: 0.91 (Class 0), 0.10 (Class 1)
  - **Recall**: 0.59 (Class 0), 0.42 (Class 1)
- **Logged to MLflow**: Version 2

### Decision Tree
- **Training Completed**: Successfully trained on the fraud dataset.
- **Evaluation**:
  - **Accuracy**: 89%
  - **Precision**: 0.96 (Class 0), 0.43 (Class 1)
  - **Recall**: 0.92 (Class 0), 0.59 (Class 1)
- **Logged to MLflow**: Version 2

### Random Forest
- **Training Completed**: Successfully trained on the fraud dataset.
- **Evaluation**:
  - **Accuracy**: 94%
  - **Precision**: 0.95 (Class 0), 0.76 (Class 1)
  - **Recall**: 0.98 (Class 0), 0.55 (Class 1)
- **Logged to MLflow**: Version 2

### Gradient Boosting
- **Training Completed**: Successfully trained on the fraud dataset.
- **Evaluation**:
  - **Accuracy**: 76%
  - **Precision**: 0.92 (Class 0), 0.14 (Class 1)
  - **Recall**: 0.80 (Class 0), 0.30 (Class 1)
- **Logged to MLflow**: Version 2

### MLP (Multilayer Perceptron)
- **Training Completed**: Successfully trained on the fraud dataset.
- **Evaluation**:
  - **Accuracy**: 71%
  - **Precision**: 0.94 (Class 0), 0.17 (Class 1)
  - **Recall**: 0.73 (Class 0), 0.53 (Class 1)
- **Logged to MLflow**: Version 1

### Model Comparison Summary

| Model               | Precision (Fraud) | Recall (Fraud) | F1-Score (Fraud) | Accuracy |
|---------------------|-------------------|----------------|------------------|----------|
| Logistic Regression  | 0.10              | 0.42           | 0.16             | 0.58     |
| Decision Tree        | 0.43              | 0.59           | 0.50             | 0.89     |
| Random Forest        | 0.76              | 0.55           | 0.64             | 0.94     |
| Gradient Boosting    | 0.14              | 0.30           | 0.19             | 0.76     |
| MLP                  | 0.17              | 0.53           | 0.26             | 0.71     |

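**Observation**: Random Forest performs best overall, with the highest accuracy (94%) and the highest fraud-class precision (0.76) and F1-score (0.64). Decision Tree reaches a slightly higher fraud recall (0.59 vs. 0.55), but at much lower precision (0.43).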

### General Notes
- **MLflow Logging**: Models logged without signatures and input examples; consider setting `input_example` for better model tracking.

---

### Model development for Credit Card

In [34]:
# Create the pipeline for the 'creditcard' dataset, pointing it at the credit-card CSV
creditcard_file_path = '../data/creditcard.csv'
creditcard_pipeline = ModelPipeline('creditcard', creditcard_file_path)

In [35]:
# Load the credit card fraud dataset into the pipeline
creditcard_pipeline.load_data()  # reads the CSV path given to the constructor (see the log output below)

2024-10-22 21:50:47,402 - INFO - Loading credit card data from ../data/creditcard.csv...
2024-10-22 21:50:52,435 - INFO - Data loading complete.


In [36]:
# Split the data into training and test sets.
# NOTE(review): unlike the fraud pipeline, no apply_smote() call follows for this
# dataset, although the test split contains only 98 fraud rows out of 56,962 —
# confirm that skipping resampling is intentional.
creditcard_pipeline.split_data()  # Divide dataset for model training and evaluation

2024-10-22 21:50:54,812 - INFO - Data has been split into train and test sets.


In [37]:
# Candidate classifiers for the credit-card dataset, each paired with the display
# name that is passed to the pipeline's train / evaluate / log calls.
# random_state is pinned on every estimator so that repeated runs train the
# same models; 42 is an arbitrary seed.
# NOTE(review): the train/test split runs inside ModelPipeline and may need its
# own seed for the reported metrics to be fully reproducible.
credit_models = [
    (LogisticRegression(random_state=42), 'Logistic Regression'),
    (DecisionTreeClassifier(random_state=42), 'Decision Tree'),
    (RandomForestClassifier(random_state=42), 'Random Forest'),
    (GradientBoostingClassifier(random_state=42), 'Gradient Boosting'),
    (MLPClassifier(random_state=42), 'Multi-Layer Perceptron'),
]

In [38]:
# For every candidate: fit it through the credit-card pipeline, produce its
# evaluation report, then log the model together with that report to MLflow.
# `report` is overwritten on each pass, so only the last model's report
# remains in the namespace after the loop.
for model, name in credit_models:
    creditcard_pipeline.train_model(model, name)
    report = creditcard_pipeline.evaluate_model(model, name)
    creditcard_pipeline.log_model(model, name, report)

2024-10-22 21:51:02,192 - INFO - Training Logistic Regression on creditcard dataset...
2024-10-22 21:51:10,910 - INFO - Logistic Regression training complete.
2024-10-22 21:51:10,918 - INFO - Evaluating Logistic Regression on creditcard dataset...
2024-10-22 21:51:11,394 - INFO - Logistic Regression evaluation report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.61      0.56      0.59        98

    accuracy                           1.00     56962
   macro avg       0.81      0.78      0.79     56962
weighted avg       1.00      1.00      1.00     56962

2024-10-22 21:51:11,453 - INFO - Logging Logistic Regression to MLflow...
2024-10-22 21:51:28,351 - INFO - Logistic Regression has been logged and saved in MLflow as version 2.
2024-10-22 21:51:28,374 - INFO - Training Decision Tree on creditcard dataset...
2024-10-22 21:52:10,405 - INFO - Decision Tree training complete.
2024-10-22 21:52:10,407 - INF

### Credit Card Fraud Detection Summary

### Logistic Regression
- **Training Completed**
- **Evaluation**:
  - **Accuracy**: 100%
  - **Precision (Class 1)**: 0.61
  - **Recall (Class 1)**: 0.56

### Decision Tree
- **Training Completed**
- **Evaluation**:
  - **Accuracy**: 100%
  - **Precision (Class 1)**: 0.74
  - **Recall (Class 1)**: 0.78

### Random Forest
- **Training Completed**
- **Evaluation**:
  - **Accuracy**: 100%
  - **Precision (Class 1)**: 0.97
  - **Recall (Class 1)**: 0.77

### Gradient Boosting
- **Training Completed**
- **Evaluation**:
  - **Accuracy**: 100%
  - **Precision (Class 1)**: 0.74
  - **Recall (Class 1)**: 0.60

### Multi-Layer Perceptron (MLP)
- **Training Completed**
- **Evaluation**:
  - **Accuracy**: 100%
  - **Precision (Class 1)**: 0.59
  - **Recall (Class 1)**: 0.68

### Credit Card Model Comparison Summary

| Model          | Precision (Fraud) | Recall (Fraud) | Accuracy | 
|----------------|-------------------|----------------|----------|
| Logistic Reg.  | 0.61              | 0.56           | 100%     | 
| Decision Tree  | 0.74              | 0.78           | 100%     | 
| Random Forest  | 0.97              | 0.77           | 100%     | 
| Grad. Boosting | 0.74              | 0.60           | 100%     | 
| MLP            | 0.59              | 0.68           | 100%     | 

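**Note**: All models show an accuracy that rounds to 100%, but this is not informative here: only 98 of the 56,962 test transactions are fraudulent, so a model can score near-perfect accuracy while missing many frauds. Judged on the fraud class, Random Forest provides the highest precision (0.97), while Decision Tree offers the best recall (0.78), so either of the two is a reasonable choice depending on whether false alarms or missed frauds are more costly.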


---

#### Optionally, we can use `ModelPipeline` to run the whole pipeline in one call
- **For credit card fraud detection**: 

```python
pipeline = ModelPipeline(dataset_type='creditcard', path='path/to/creditcard_data.csv')
pipeline.run_pipeline()
```

- **For bank transaction fraud detection**:

```python
pipeline = ModelPipeline(dataset_type='fraud', path='path/to/fraud_data.csv')
pipeline.run_pipeline()
```