In [5]:
#import sys#
#!{sys.executable} -m pip install xgboost

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
import pickle
import os

# Create model directory if it doesn't exist
if not os.path.exists('model'):
    os.makedirs('model')

# Load and Preprocess Data
df = pd.read_csv('creditcard.csv')
X = df.drop('Class', axis=1) # Target column is 'Class'
y = df['Class']

# Scale features for better performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the 6 mandatory models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier()
}

results = []

# Loop through and calculate all 6 metrics
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
    
    # Calculate metrics
    results.append({
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })
    
    # Save models for Streamlit
    with open(f'model/{name.replace(" ", "_").lower()}.pkl', 'wb') as f:
        pickle.dump(model, f)

# Display the Comparison Table for README
results_df = pd.DataFrame(results)
print("\n--- Model Comparison Table ---")
print(results_df.to_markdown(index=False))

Training Logistic Regression...
Training Decision Tree...
Training KNN...
Training Naive Bayes...
Training Random Forest...
Training XGBoost...

--- Model Comparison Table ---
| ML Model Name       |   Accuracy |      AUC |   Precision |   Recall |       F1 |      MCC |
|:--------------------|-----------:|---------:|------------:|---------:|---------:|---------:|
| Logistic Regression |   0.999122 | 0.975293 |   0.863636  | 0.581633 | 0.695122 | 0.708353 |
| Decision Tree       |   0.999087 | 0.902771 |   0.705357  | 0.806122 | 0.752381 | 0.753608 |
| KNN                 |   0.999526 | 0.93356  |   0.938272  | 0.77551  | 0.849162 | 0.852794 |
| Naive Bayes         |   0.977827 | 0.967106 |   0.0603774 | 0.816327 | 0.112439 | 0.218423 |
| Random Forest       |   0.999579 | 0.95244  |   0.974359  | 0.77551  | 0.863636 | 0.869075 |
| XGBoost             |   0.999579 | 0.94059  |   0.940476  | 0.806122 | 0.868132 | 0.87051  |
