In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix

# -----------------------
# 1. Load Dataset
# -----------------------
# Two datasets are supported; point file_path at the CSV and set target_col
# to the label column of that file:
#   creditcard.csv -> target_col = "Class"
#   Fraud_Data.csv -> target_col = "class"
# A raw string keeps the Windows backslashes literal without doubling them.
file_path = r"C:\Users\ZAK-TECH\Desktop\KAIM week5\data\raw\Fraud_Data.csv"
target_col = "class"

df = pd.read_csv(file_path)

# Separate the label series from the feature frame (every remaining column).
y = df[target_col]
X = df.drop(columns=[target_col])
# NOTE(review): X is handed to the estimators below unchanged; if the CSV
# holds non-numeric columns they presumably need encoding first -- confirm.

# -----------------------
# 2. Train-Test Split (Stratified)
# -----------------------
# Hold out 20% of the rows for testing. stratify=y preserves the class
# proportions in both partitions; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -----------------------
# 3. Baseline Model: Logistic Regression
# -----------------------
# class_weight="balanced" weights classes inversely to their frequency so the
# rarer class still influences the fit.
lr = LogisticRegression(solver="liblinear", class_weight="balanced", max_iter=1000)
lr.fit(X_train, y_train)

# Predict & Evaluate on the held-out split.
# Hard labels feed F1 and the confusion matrix; the second predict_proba
# column (index 1) feeds the average-precision (AUC-PR) score.
y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:, 1]

cm_lr = confusion_matrix(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
auc_pr_lr = average_precision_score(y_test, y_prob_lr)

# One print call, one line per metric; !s renders each value with str(),
# exactly as passing it to print() as a separate argument would.
print(
    "=== Logistic Regression ===",
    f"AUC-PR: {auc_pr_lr!s}",
    f"F1-Score: {f1_lr!s}",
    f"Confusion Matrix:\n {cm_lr!s}",
    sep="\n",
)

# -----------------------
# 4. Ensemble Model: Random Forest
# -----------------------
# 200 depth-limited trees with balanced class weights; random_state pins the
# forest's randomness and n_jobs=-1 uses every available core.
rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    class_weight="balanced",
    max_depth=10,
    n_estimators=200,
)
rf.fit(X_train, y_train)

# Predict & Evaluate on the held-out split.
# Hard labels feed F1 and the confusion matrix; the second predict_proba
# column (index 1) feeds the average-precision (AUC-PR) score.
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

cm_rf = confusion_matrix(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
auc_pr_rf = average_precision_score(y_test, y_prob_rf)

# One print call, one line per metric; !s renders each value with str(),
# exactly as passing it to print() as a separate argument would.
print(
    "\n=== Random Forest ===",
    f"AUC-PR: {auc_pr_rf!s}",
    f"F1-Score: {f1_rf!s}",
    f"Confusion Matrix:\n {cm_rf!s}",
    sep="\n",
)

# -----------------------
# 5. Cross-Validation (Stratified K-Fold, k=5)
# -----------------------
# Both models are scored on the same five shuffled, stratified folds.
# Note that the folds are drawn from the full X, y (held-out rows included),
# not from the training split alone.
scoring = {"auc_pr": "average_precision", "f1": "f1"}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Logistic Regression CV
# Per-fold scores come back under "test_<scoring key>".
lr_cv = cross_validate(lr, X, y, scoring=scoring, cv=skf)
print(
    "\n=== Logistic Regression CV ===",
    f"AUC-PR Mean: {lr_cv['test_auc_pr'].mean()!s}",
    f"AUC-PR Std: {lr_cv['test_auc_pr'].std()!s}",
    f"F1-Score Mean: {lr_cv['test_f1'].mean()!s}",
    f"F1-Score Std: {lr_cv['test_f1'].std()!s}",
    sep="\n",
)

# Random Forest CV
rf_cv = cross_validate(rf, X, y, scoring=scoring, cv=skf)
print(
    "\n=== Random Forest CV ===",
    f"AUC-PR Mean: {rf_cv['test_auc_pr'].mean()!s}",
    f"AUC-PR Std: {rf_cv['test_auc_pr'].std()!s}",
    f"F1-Score Mean: {rf_cv['test_f1'].mean()!s}",
    f"F1-Score Std: {rf_cv['test_f1'].std()!s}",
    sep="\n",
)

# -----------------------
# 6. Model Comparison Summary
# -----------------------
# Side-by-side held-out metrics for the two fitted models.
print("\n=== Model Comparison Summary ===")
print(f"Logistic Regression - AUC-PR: {auc_pr_lr:.4f}, F1: {f1_lr:.4f}")
print(f"Random Forest       - AUC-PR: {auc_pr_rf:.4f}, F1: {f1_rf:.4f}")

# Derive the recommendation from the metrics instead of hard-coding a winner,
# so the printed claim cannot contradict the numbers above.
# Tuples compare left to right: AUC-PR decides, F1 only breaks an AUC-PR tie.
# On an exact tie in both metrics max() keeps the first-listed model.
candidate_scores = {
    "Logistic Regression": (auc_pr_lr, f1_lr),
    "Random Forest": (auc_pr_rf, f1_rf),
}
best_model_name = max(candidate_scores, key=candidate_scores.get)
print(
    f"\nRecommendation: {best_model_name} selected for production "
    "due to higher performance."
)
