In [12]:
import sys
import os
import numpy as np
# Append the project root path to sys.path
sys.path.append(os.path.abspath(".."))

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow

In [14]:
# --- Task 2: Model Building and Training ---

# 1. Load Merged Data (from Task 1)
try:
    merged_data = pd.read_csv(r'C:\Users\user\Desktop\Kifiya\Adey-Innovations-Fraud-Mgt\data\merged_data.csv')
except FileNotFoundError:
    print("Error: merged_data.csv not found. Please run Task 1 first.")
    exit()  # Stop execution if the file is not found


# Columns to remove (replace with your actual column names)
columns_to_remove = ['signup_time', 'purchase_time','device_id', 'country'] # Add other date/time columns if necessary


# 2. Feature and Target Separation (Fraud Data)
X_fraud = merged_data.drop('class', axis=1)
y_fraud = merged_data['class'].astype(int) # Ensure y is numeric (0 or 1)
X_fraud = X_fraud.drop(columns=columns_to_remove, errors='ignore')

# 3. Train-Test Split (Fraud Data)
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

# 4. Scaling (Fraud Data)
scaler_fraud = StandardScaler()
X_fraud_train = scaler_fraud.fit_transform(X_fraud_train)
X_fraud_test = scaler_fraud.transform(X_fraud_test)

# 5. Model Selection and Training (with MLflow - Fraud Data)
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),  # Increase max_iter if needed
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP": MLPClassifier(max_iter=500, random_state=42)
}

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name} (Fraud Data)"):
        model.fit(X_fraud_train, y_fraud_train)
        y_pred = model.predict(X_fraud_test)

        accuracy = accuracy_score(y_fraud_test, y_pred)
        report = classification_report(y_fraud_test, y_pred)

        mlflow.log_param("model", name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_text(report, "classification_report.txt")
        mlflow.sklearn.log_model(model, "model")

        print(f"{name} (Fraud Data) Results:")
        print(f"Accuracy: {accuracy}")
        print("Classification Report:\n", report)
        print("-" * 50)





Logistic Regression (Fraud Data) Results:
Accuracy: 0.5319127816563545
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.53      0.67     27393
           3       0.10      0.52      0.17      2830

    accuracy                           0.53     30223
   macro avg       0.51      0.52      0.42     30223
weighted avg       0.84      0.53      0.63     30223

--------------------------------------------------




Decision Tree (Fraud Data) Results:
Accuracy: 0.8983886444098865
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94     27393
           3       0.46      0.55      0.50      2830

    accuracy                           0.90     30223
   macro avg       0.71      0.74      0.72     30223
weighted avg       0.91      0.90      0.90     30223

--------------------------------------------------




Random Forest (Fraud Data) Results:
Accuracy: 0.9550342454422129
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           3       1.00      0.52      0.68      2830

    accuracy                           0.96     30223
   macro avg       0.97      0.76      0.83     30223
weighted avg       0.96      0.96      0.95     30223

--------------------------------------------------




Gradient Boosting (Fraud Data) Results:
Accuracy: 0.9066274029712471
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           3       0.90      0.00      0.01      2830

    accuracy                           0.91     30223
   macro avg       0.90      0.50      0.48     30223
weighted avg       0.91      0.91      0.86     30223

--------------------------------------------------




MLP (Fraud Data) Results:
Accuracy: 0.9495417397346392
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     27393
           3       0.89      0.53      0.66      2830

    accuracy                           0.95     30223
   macro avg       0.92      0.76      0.82     30223
weighted avg       0.95      0.95      0.94     30223

--------------------------------------------------


In [15]:
# --- Credit Card Data Processing ---

# 1. Load Data (Credit Card)
try:
    credit_data = pd.read_csv(r'C:\Users\user\Desktop\Kifiya\Adey-Innovations-Fraud-Mgt\data\creditcard.csv')
except FileNotFoundError:
    print("Error: creditcard.csv not found.")
    exit()

# 2. Feature and Target Separation (Credit Card)
X_credit = credit_data.drop('Class', axis=1)
y_credit = credit_data['Class'].astype(int) # Ensure y is numeric (0 or 1)

# 3. Train-Test Split (Credit Card)
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit
)

# 4. Scaling (Credit Card)
scaler_credit = StandardScaler()
X_credit_train = scaler_credit.fit_transform(X_credit_train)
X_credit_test = scaler_credit.transform(X_credit_test)

# 5. Model Selection and Training (with MLflow - Credit Card)
models_cred = {
    "Logistic Regression": LogisticRegression(class_weight='balanced'),  # Increase max_iter if needed
    "Decision Tree": DecisionTreeClassifier(),
}

for name, model in models_cred.items():  # Re-using the same models dictionary
    with mlflow.start_run(run_name=f"{name} (Credit Card Data)"):
        model.fit(X_credit_train, y_credit_train)
        y_pred = model.predict(X_credit_test)

        accuracy = accuracy_score(y_credit_test, y_pred)
        report = classification_report(y_credit_test, y_pred)

        mlflow.log_param("model", name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_text(report, "classification_report.txt")
        mlflow.sklearn.log_model(model, "model")

        print(f"{name} (Credit Card Data) Results:")
        print(f"Accuracy: {accuracy}")
        print("Classification Report:\n", report)
        print("-" * 50)



Logistic Regression (Credit Card Data) Results:
Accuracy: 0.9755275446789088
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962

--------------------------------------------------




Decision Tree (Credit Card Data) Results:
Accuracy: 0.9990344440153085
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.72      0.71      0.72        98

    accuracy                           1.00     56962
   macro avg       0.86      0.86      0.86     56962
weighted avg       1.00      1.00      1.00     56962

--------------------------------------------------
