## Task 2a: Data Preparation and Baseline Model

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, auc

# ----------------------------
# Load datasets
# ----------------------------
fraud_df = pd.read_csv("/content/fraud_engineered.csv")
credit_df = pd.read_csv("/content/creditcard.csv")

# ----------------------------
# Fraud dataset: feature-target separation
# ----------------------------
fraud_feat_cols_num = [
    "purchase_value", "age", "hour_of_day", "day_of_week",
    "time_since_signup_hours", "user_txn_count_24h",
    "user_txn_sum_24h", "user_txn_count_6h"
]
fraud_feat_cols_cat = ["source", "browser", "sex"]
if "ip_country" in fraud_df.columns:
    fraud_feat_cols_cat.append("ip_country")

X_fraud = fraud_df[fraud_feat_cols_num + fraud_feat_cols_cat]
y_fraud = fraud_df["class"].astype(int)

# ----------------------------
# Credit dataset: clean target and separate features
# ----------------------------
# Ensure 'Class' column is numeric and drop rows with NaN
credit_df["Class"] = pd.to_numeric(credit_df["Class"], errors="coerce")
credit_df = credit_df.dropna(subset=["Class"])

X_credit = credit_df.drop(columns=["Class"])
y_credit = credit_df["Class"].astype(int)


In [3]:


# ----------------------------
# Stratified train-test split
# ----------------------------
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit
)

# ----------------------------
# Preprocessing
# ----------------------------
fraud_preprocessor = ColumnTransformer([
    ("num", StandardScaler(), fraud_feat_cols_num),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), fraud_feat_cols_cat)
])

credit_preprocessor = ColumnTransformer([
    ("num", StandardScaler(), X_credit.columns.tolist())
])

# ----------------------------
# Baseline Logistic Regression
# ----------------------------
fraud_baseline = Pipeline([
    ("pre", fraud_preprocessor),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

credit_baseline = Pipeline([
    ("pre", credit_preprocessor),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
])

fraud_baseline.fit(Xf_train, yf_train)
credit_baseline.fit(Xc_train, yc_train)

# ----------------------------
# Evaluation function
# ----------------------------
def evaluate_model(model, X_test, y_test, name="Model"):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    auc_pr = auc(recall, precision)
    print(f"{name} - F1: {f1:.3f}, AUC-PR: {auc_pr:.3f}")
    print("Confusion Matrix:\n", cm)
    return {"f1": f1, "auc_pr": auc_pr, "cm": cm}

fraud_results_baseline = evaluate_model(fraud_baseline, Xf_test, yf_test, "Fraud Logistic Regression")
credit_results_baseline = evaluate_model(credit_baseline, Xc_test, yc_test, "Credit Logistic Regression")


Fraud Logistic Regression - F1: 0.273, AUC-PR: 0.448
Confusion Matrix:
 [[17651  9742]
 [  845  1985]]
Credit Logistic Regression - F1: 0.158, AUC-PR: 0.772
Confusion Matrix:
 [[30949   701]
 [    5    66]]


### Task 2b: Ensemble Model, Cross-Validation, and Model Selection

In [4]:
from sklearn.ensemble import RandomForestClassifier

# ----------------------------
# Ensemble Model: Random Forest
# ----------------------------
fraud_rf = Pipeline([
    ("pre", fraud_preprocessor),
    ("clf", RandomForestClassifier(
        n_estimators=200, max_depth=10, random_state=42, class_weight="balanced"
    ))
])

credit_rf = Pipeline([
    ("pre", credit_preprocessor),
    ("clf", RandomForestClassifier(
        n_estimators=200, max_depth=10, random_state=42, class_weight="balanced"
    ))
])

fraud_rf.fit(Xf_train, yf_train)
credit_rf.fit(Xc_train, yc_train)

fraud_results_rf = evaluate_model(fraud_rf, Xf_test, yf_test, "Fraud Random Forest")
credit_results_rf = evaluate_model(credit_rf, Xc_test, yc_test, "Credit Random Forest")

# ----------------------------
# Cross-Validation
# ----------------------------
def cross_validate_model(model, X, y, k=5):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    f1_scores, auc_scores = [], []
    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:,1]
        f1_scores.append(f1_score(y_test, y_pred))
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        auc_scores.append(auc(recall, precision))
    print(f"F1 mean={np.mean(f1_scores):.3f}, std={np.std(f1_scores):.3f}")
    print(f"AUC-PR mean={np.mean(auc_scores):.3f}, std={np.std(auc_scores):.3f}")
    return f1_scores, auc_scores

print("Fraud RF Cross-Validation:")
cross_validate_model(fraud_rf, X_fraud, y_fraud)

print("Credit RF Cross-Validation:")
cross_validate_model(credit_rf, X_credit, y_credit)


Fraud Random Forest - F1: 0.701, AUC-PR: 0.636
Confusion Matrix:
 [[27392     1]
 [ 1301  1529]]
Credit Random Forest - F1: 0.824, AUC-PR: 0.858
Confusion Matrix:
 [[31641     9]
 [   15    56]]
Fraud RF Cross-Validation:
F1 mean=0.699, std=0.013
AUC-PR mean=0.635, std=0.013
Credit RF Cross-Validation:
F1 mean=0.839, std=0.030
AUC-PR mean=0.847, std=0.032


([0.8115942028985508,
  0.8378378378378378,
  0.8970588235294118,
  0.8226950354609929,
  0.8260869565217391],
 [np.float64(0.816556303028539),
  np.float64(0.8830695407240573),
  np.float64(0.8828200848072733),
  np.float64(0.8461846118937358),
  np.float64(0.8080831402305494)])

### Model Comparison and Selection



In [5]:
# Compare baseline vs ensemble
comparison = pd.DataFrame([
    {"Dataset": "Fraud", "Model": "Logistic Regression", **fraud_results_baseline},
    {"Dataset": "Fraud", "Model": "Random Forest", **fraud_results_rf},
    {"Dataset": "Credit", "Model": "Logistic Regression", **credit_results_baseline},
    {"Dataset": "Credit", "Model": "Random Forest", **credit_results_rf},
])

print(comparison[["Dataset","Model","f1","auc_pr"]])

# Selection justification
# Fraud dataset: Random Forest shows higher AUC-PR and F1, but Logistic Regression is more interpretable.
# Credit dataset: Random Forest improves recall, but Logistic Regression remains a strong baseline.


  Dataset                Model        f1    auc_pr
0   Fraud  Logistic Regression  0.272721  0.447519
1   Fraud        Random Forest  0.701376  0.636399
2  Credit  Logistic Regression  0.157518  0.772475
3  Credit        Random Forest  0.823529  0.857653
