# Model Building and Hyperparameter Tuning

This notebook trains baseline models and performs hyperparameter tuning for a fraud detection classifier.


In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [10]:
# Load processed dataset
# Expected columns: category, amt, gender, state, city_pop, job, distance, trans_hour, trans_minute, trans_second, is_fraud
# `Path` and `pd` come from the imports cell at the top of the notebook.

# Resolve the project root so the notebook works whether the kernel was started
# in the project root itself or inside its "notebook" subdirectory.
project_root = Path.cwd()
if project_root.name == "notebook":
    project_root = project_root.parent

data_path = project_root / "data" / "processed_credit_data.csv"
data = pd.read_csv(data_path)

data.head()


Unnamed: 0,category,amt,gender,state,city_pop,job,is_fraud,distance,trans_hour,trans_minute,trans_second
0,personal_care,2.86,M,SC,333497,Mechanical engineer,0,24613.746071,12,14,25
1,personal_care,29.84,F,UT,302,"Sales professional, IT",0,104834.043428,12,14,33
2,health_fitness,41.28,F,NY,34496,"Librarian, public",0,59204.795631,12,14,53
3,misc_pos,60.05,M,FL,54767,Set designer,0,27615.117073,12,15,15
4,travel,3.19,M,MI,1126,Furniture designer,0,104423.174625,12,15,17


In [11]:
# Separate the label from the features.
target = "is_fraud"
X = data.drop(columns=[target])
y = data[target]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Pick the numeric columns by dtype and treat every other column as categorical.
# Selecting only object-dtype columns as categorical would send any text column
# stored with a different dtype (e.g. pandas' dedicated string dtype) to the
# scaler; selecting numerics first avoids that. Column order follows X.columns.
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = [col for col in X.columns if col not in numeric_cols]

print("Train class balance:", y_train.value_counts(normalize=True))
print("Test class balance:", y_test.value_counts(normalize=True))


Train class balance: is_fraud
0    0.997966
1    0.002034
Name: proportion, dtype: float64
Test class balance: is_fraud
0    0.997966
1    0.002034
Name: proportion, dtype: float64


In [12]:
# Preprocessing and baseline models
def build_preprocessor():
    """Return a new, unfitted ColumnTransformer for the feature columns.

    Categorical columns (``categorical_cols``) are one-hot encoded, with
    categories unseen during fit ignored at transform time; numeric columns
    (``numeric_cols``) are standardized. Any column in neither list is dropped.

    A factory is used so that every pipeline owns its own transformer: a
    Pipeline fits the step objects it was given in place, so one shared
    instance would be refitted by whichever pipeline was fitted last.
    """
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
            ("num", StandardScaler(), numeric_cols),
        ],
        remainder="drop",
    )


# Kept as a notebook-level name because later cells build pipelines from it.
preprocess = build_preprocessor()

# Baseline candidates. The logistic regression and the random forest use
# class_weight="balanced"; the gradient boosting model is constructed without
# a class-weight argument.
models = {
    "log_reg": Pipeline(
        steps=[
            ("preprocess", build_preprocessor()),
            ("model", LogisticRegression(max_iter=2000, class_weight="balanced")),
        ]
    ),
    "random_forest": Pipeline(
        steps=[
            ("preprocess", build_preprocessor()),
            (
                "model",
                RandomForestClassifier(
                    n_estimators=300,
                    random_state=42,
                    n_jobs=-1,
                    class_weight="balanced",
                ),
            ),
        ]
    ),
    "gbdt": Pipeline(
        steps=[
            ("preprocess", build_preprocessor()),
            ("model", GradientBoostingClassifier(random_state=42)),
        ]
    ),
}


def evaluate_model(name, pipeline, X_train, y_train, X_test, y_test, threshold=0.5, fit=True):
    """Fit (optionally) and score a classifier pipeline on the held-out set.

    Prints ROC AUC, average precision, the confusion matrix and the
    classification report, then returns the two ranking metrics.

    Parameters
    ----------
    name : str
        Label used in the printed header and in the returned record.
    pipeline : estimator
        Object exposing ``fit`` and ``predict_proba``. It is fitted in place
        when ``fit`` is true.
    X_train, y_train
        Training features and labels; only used when ``fit`` is true.
    X_test, y_test
        Held-out features and labels used for every reported metric.
    threshold : float, default 0.5
        Positive-class probability at or above which a row is predicted as
        class 1. Affects only the confusion matrix and classification report;
        ROC AUC and average precision are computed from the probabilities.
    fit : bool, default True
        Set to False to score a pipeline that is already fitted (for example
        the refitted best estimator of a hyperparameter search) without
        paying for a second fit.

    Returns
    -------
    dict
        ``{"model": name, "roc_auc": float, "avg_precision": float}`` with
        unrounded metric values.
    """
    if fit:
        pipeline.fit(X_train, y_train)

    # Column 1 of predict_proba is taken as the positive-class probability.
    proba = pipeline.predict_proba(X_test)[:, 1]
    preds = (proba >= threshold).astype(int)

    # Compute each ranking metric once and reuse it for printing and returning.
    roc_auc = roc_auc_score(y_test, proba)
    avg_precision = average_precision_score(y_test, proba)

    print(f"{name} metrics")
    print("ROC AUC:", round(roc_auc, 4))
    print("Avg precision:", round(avg_precision, 4))
    print("Confusion matrix:", confusion_matrix(y_test, preds))
    print(classification_report(y_test, preds, digits=4))
    return {
        "model": name,
        "roc_auc": roc_auc,
        "avg_precision": avg_precision,
    }

# Fit and score every baseline pipeline, collecting one metrics record per model.
baseline_results = [
    evaluate_model(model_name, model_pipeline, X_train, y_train, X_test, y_test)
    for model_name, model_pipeline in models.items()
]

# Rank the baselines by average precision, best first.
baseline_summary = pd.DataFrame(baseline_results)
baseline_summary.sort_values(by="avg_precision", ascending=False)


log_reg metrics
ROC AUC: 0.9317
Avg precision: 0.0385
Confusion matrix: [[94140 15757]
 [   29   195]]
              precision    recall  f1-score   support

           0     0.9997    0.8566    0.9226    109897
           1     0.0122    0.8705    0.0241       224

    accuracy                         0.8566    110121
   macro avg     0.5060    0.8636    0.4734    110121
weighted avg     0.9977    0.8566    0.9208    110121

random_forest metrics
ROC AUC: 0.9622
Avg precision: 0.75
Confusion matrix: [[109897      0]
 [   115    109]]
              precision    recall  f1-score   support

           0     0.9990    1.0000    0.9995    109897
           1     1.0000    0.4866    0.6547       224

    accuracy                         0.9990    110121
   macro avg     0.9995    0.7433    0.8271    110121
weighted avg     0.9990    0.9990    0.9988    110121

gbdt metrics
ROC AUC: 0.9664
Avg precision: 0.6777
Confusion matrix: [[109893      4]
 [    97    127]]
              precision    r

Unnamed: 0,model,roc_auc,avg_precision
1,random_forest,0.962228,0.74999
2,gbdt,0.966393,0.677747
0,log_reg,0.931713,0.03853


In [None]:
# Hyperparameter tuning for Random Forest
# Randomized search (20 sampled configurations) over the forest's size and
# shape parameters, scored by average precision on shuffled, stratified
# 5-fold splits of the training set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Keys use the "<step>__<param>" form to address the pipeline's "model" step.
param_dist = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [None, 6, 10, 16, 24],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 4, 8],
    "model__max_features": ["sqrt", "log2", 0.5],
}

rf_estimator = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight="balanced",
)
rf_pipeline = Pipeline(steps=[("preprocess", preprocess), ("model", rf_estimator)])

# NOTE(review): both the forest and the search request n_jobs=-1; confirm this
# nesting does not oversubscribe the machine's cores.
search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring="average_precision",
    cv=cv,
    random_state=42,
    n_jobs=-1,
)
search.fit(X_train, y_train)

print("Best params:", search.best_params_)
best_model = search.best_estimator_

# evaluate_model fits the pipeline on X_train again before scoring it on the
# held-out test set, so this call repeats the fit of the chosen configuration.
_ = evaluate_model("tuned_random_forest", best_model, X_train, y_train, X_test, y_test)


In [None]:
# Save model
# `joblib` is imported in the imports cell at the top of the notebook.
# The output path is anchored on project_root (resolved in the data-loading
# cell) so the model lands in <project>/models whether the kernel was started
# in the project root or in the "notebook" subdirectory, matching how the
# input data path is resolved.
model_path = project_root / "models" / "credit_risk_model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
model_path


In [None]:
# Example inference
# Reload the persisted pipeline from disk to check that the saved artifact
# works end to end. Only load model files you produced yourself: joblib files
# are pickle-based and loading one executes code from the file.
loaded_model = joblib.load(model_path)

# One example transaction using the same feature columns and value encodings
# as the training data. Categorical values must match the training spelling
# exactly (e.g. gender is "M"/"F"): the encoder ignores unknown categories,
# so a mismatched value would be encoded as all zeros without any error.
new_data = pd.DataFrame(
    {
        "category": ["food_dining"],
        "amt": [150.75],
        "gender": ["M"],
        "state": ["CA"],
        "city_pop": [100000],
        "job": ["Mechanical engineer"],
        # NOTE(review): the `distance` values shown by data.head() are in the
        # tens of thousands; confirm the unit and use a realistic value here.
        "distance": [5.0],
        "trans_hour": [12],
        "trans_minute": [30],
        "trans_second": [45],
    }
)

# Column 1 of predict_proba is taken as the fraud-class probability; 0.5 is
# the same decision threshold used as the default during evaluation.
new_proba = loaded_model.predict_proba(new_data)[:, 1]
new_pred = (new_proba >= 0.5).astype(int)
print("Predicted fraud probability:", round(float(new_proba[0]), 4))
print("Predicted class:", int(new_pred[0]))
