# 02_Train & Save Model

This notebook will:

1. Load the preprocessed data  
2. One-hot encode categorical features  
3. Split into training and test sets  
4. Train a baseline Logistic Regression model  
5. Evaluate performance  
6. Save the trained model for the Streamlit app  


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import joblib


In [2]:
# Adjust the path if needed
df = pd.read_csv("../data/credit_card_transactions.csv")

# Drop index column if present
if "Unnamed: 0" in df.columns:
    df = df.drop("Unnamed: 0", axis=1)

df["trans_dt"] = pd.to_datetime(df["trans_date_trans_time"])
df["dob_dt"]   = pd.to_datetime(df["dob"])

# Compute age (in years) at transaction time
df["age"] = (df["trans_dt"] - df["dob_dt"]).dt.days // 365


# Now define our feature list, including age
FEATURES = [
    "merchant",    # categorical
    "category",    # categorical
    "gender",      # categorical
    "job",         # categorical
    "state",       # categorical
    # numeric:
    "amt",
    "city_pop",
    "merch_lat",
    "merch_long",
    "age"          # newly engineered
]
TARGET = "is_fraud"

print("Dataset shape after adding age:", df.shape)
print("Features to use:", FEATURES)
df[FEATURES + [TARGET]].head(3)

Dataset shape after adding age: (1296675, 26)
Features to use: ['merchant', 'category', 'gender', 'job', 'state', 'amt', 'city_pop', 'merch_lat', 'merch_long', 'age']


Unnamed: 0,merchant,category,gender,job,state,amt,city_pop,merch_lat,merch_long,age,is_fraud
0,"fraud_Rippin, Kub and Mann",misc_net,F,"Psychologist, counselling",NC,4.97,3495,36.011293,-82.048315,30,0
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,F,Special educational needs teacher,WA,107.23,149,49.159047,-118.186462,40,0
2,fraud_Lind-Buckridge,entertainment,M,Nature conservation officer,ID,220.11,4154,43.150704,-112.154481,56,0


In [3]:
# 1. Integer‐encode categoricals
cat_cols = ["merchant","category","gender","job","state"]
for c in cat_cols:
    df[c] = df[c].astype("category").cat.codes

# 2. Select numeric + encoded
feature_cols = cat_cols + ["amt","city_pop","merch_lat","merch_long","age"]
X = df[feature_cols]
y = df["is_fraud"]

print("X shape:", X.shape)
X.head()


X shape: (1296675, 10)


Unnamed: 0,merchant,category,gender,job,state,amt,city_pop,merch_lat,merch_long,age
0,514,8,0,370,27,4.97,3495,36.011293,-82.048315,30
1,241,4,0,428,47,107.23,149,49.159047,-118.186462,40
2,390,0,1,307,13,220.11,4154,43.150704,-112.154481,56
3,360,2,1,328,26,45.0,1939,47.034331,-112.561071,52
4,297,9,1,116,45,41.96,99,38.674999,-78.632459,32


In [4]:
from sklearn.model_selection import train_test_split

# Split 80/20, preserving class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"X_train: {X_train.shape}, positives: {y_train.sum()}")
print(f"X_test : {X_test.shape}, positives: {y_test.sum()}")


X_train: (1037340, 10), positives: 6005
X_test : (259335, 10), positives: 1501


In [5]:
from sklearn.preprocessing import StandardScaler

# 1. Instantiate & fit scaler on train set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

print("✓ Features scaled.")




✓ Features scaled.


In [6]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    solver="saga",           # saga handles large data well
    max_iter=500,            # fewer iterations needed once scaled
    class_weight="balanced"
)
model.fit(X_train_scaled, y_train)
print("✅ Scaled model training complete.")

✅ Scaled model training complete.


In [7]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Predict on test set
y_pred  = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:,1]

# 1. Detailed metrics
print("Classification Report:\n", classification_report(y_test, y_pred))

# 2. ROC AUC
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# 3. Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97    257834
           1       0.08      0.76      0.15      1501

    accuracy                           0.95    259335
   macro avg       0.54      0.86      0.56    259335
weighted avg       0.99      0.95      0.97    259335

ROC AUC: 0.8550904137759295
Confusion Matrix:
 [[244969  12865]
 [   357   1144]]


In [8]:
from imblearn.over_sampling import SMOTE

# 1. Instantiate SMOTE
smote = SMOTE(random_state=42)

# 2. Fit on X_train, y_train → get balanced X_res, y_res
X_res, y_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts().to_dict())
print(" After SMOTE:", y_res.value_counts().to_dict())


Before SMOTE: {0: 1031335, 1: 6005}
 After SMOTE: {0: 1031335, 1: 1031335}


In [9]:
lr_smote = LogisticRegression(
    solver="saga",
    max_iter=500,
    class_weight="balanced",  # SMOTE has balanced classes already, but it's fine
    random_state=42
)
lr_smote.fit(X_res, y_res)
print("✅ SMOTE-LR training complete.")

✅ SMOTE-LR training complete.




In [10]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Predict on the original (unmodified) test set
y_pred_sm = lr_smote.predict(X_test)
y_proba_sm = lr_smote.predict_proba(X_test)[:,1]

print("SMOTE-LR Classification Report:\n")
print(classification_report(y_test, y_pred_sm))

print("SMOTE-LR ROC AUC:", roc_auc_score(y_test, y_proba_sm))

cm_sm = confusion_matrix(y_test, y_pred_sm)
print("SMOTE-LR Confusion Matrix:\n", cm_sm)


SMOTE-LR Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.92      0.96    257834
           1       0.05      0.74      0.10      1501

    accuracy                           0.92    259335
   macro avg       0.52      0.83      0.53    259335
weighted avg       0.99      0.92      0.95    259335

SMOTE-LR ROC AUC: 0.8433134242098463
SMOTE-LR Confusion Matrix:
 [[237263  20571]
 [   393   1108]]


In [11]:
from sklearn.ensemble import RandomForestClassifier

# 1. Instantiate RF (trees handle unscaled, integer‐encoded features fine)
rf = RandomForestClassifier(
    n_estimators=100,                    # number of trees
    class_weight="balanced_subsample",   # balance each bootstrap sample
    random_state=42,
    n_jobs=-1                            # use all cores
)

# 2. Fit on the original (unmodified) training set
rf.fit(X_train, y_train)
print("✅ Random Forest training complete.")


✅ Random Forest training complete.


In [12]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# 1. Predictions & probabilities on the test set
y_pred_rf  = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]

# 2. Metrics
print("Random Forest Classification Report:\n")
print(classification_report(y_test, y_pred_rf))

print("Random Forest ROC AUC:", roc_auc_score(y_test, y_proba_rf))

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("\nRandom Forest Confusion Matrix:\n", cm_rf)


Random Forest Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.88      0.65      0.75      1501

    accuracy                           1.00    259335
   macro avg       0.94      0.83      0.87    259335
weighted avg       1.00      1.00      1.00    259335

Random Forest ROC AUC: 0.9714397165414577

Random Forest Confusion Matrix:
 [[257706    128]
 [   524    977]]


In [18]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

# 1. Choose which model’s probabilities to tune on:
probs = y_proba_rf

# 2. Define thresholds and lists to collect metrics
thresholds = np.linspace(0.1, 0.9, 81)
f1_scores    = []
precisions   = []
recalls      = []

# 3. Loop over thresholds
for t in thresholds:
    preds_t = (probs > t).astype(int)
    f1_scores.append(   f1_score(y_test, preds_t)     )
    precisions.append(  precision_score(y_test, preds_t)  )
    recalls.append(     recall_score(y_test, preds_t)     )

# 4. Find best threshold by F1
best_idx    = np.argmax(f1_scores)
best_thresh = thresholds[best_idx]
best_f1     = f1_scores[best_idx]

print(f"🔍 Best threshold: {best_thresh:.2f}, F1 = {best_f1:.3f}")


🔍 Best threshold: 0.28, F1 = 0.774


In [21]:
from sklearn.metrics import classification_report, confusion_matrix

# 1. Use  best threshold
best_thresh = 0.28

# 2. Generate predictions at that threshold
y_pred_best = (y_proba_rf > best_thresh).astype(int)

# 3. Print the classification report
print(f"📊 Classification Report at threshold = {best_thresh:.2f}\n")
print(classification_report(y_test, y_pred_best))

# 4. Show the confusion matrix
print("🧮 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best))


📊 Classification Report at threshold = 0.28

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.78      0.77      0.77      1501

    accuracy                           1.00    259335
   macro avg       0.89      0.88      0.89    259335
weighted avg       1.00      1.00      1.00    259335

🧮 Confusion Matrix:
[[257512    322]
 [   351   1150]]


In [22]:
import os, joblib

# 1. Ensure a models/ folder exists alongside your notebook
os.makedirs("../models", exist_ok=True)

# 2. Dump the RF into a pickle file
joblib.dump(rf, "../models/rf_fraud_model.pkl")
print("✅ Saved Random Forest model to ../models/rf_fraud_model.pkl")


✅ Saved Random Forest model to ../models/rf_fraud_model.pkl


In [23]:
!ls ../models


rf_fraud_model.pkl


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode,trans_dt,dob_dt,age
2449,2019-01-02 01:06:37,4613314721966,543,4,281.06,Jason,Murphy,1,542 Steve Curve Suite 011,Collettsville,...,1988-09-15,e8a81877ae9a0a7f883e15cb39dc4022,1325466397,36.430124,-81.179483,1,28644.0,2019-01-02 01:06:37,1988-09-15,30
2472,2019-01-02 01:47:29,340187018810220,285,2,11.52,Misty,Hart,0,27954 Hall Mill Suite 575,San Antonio,...,1960-10-28,bc7d41c41103877b03232f03f1f8d3f5,1325468849,29.819364,-99.142791,1,78055.0,2019-01-02 01:47:29,1960-10-28,58
2523,2019-01-02 03:05:23,340187018810220,196,4,276.31,Misty,Hart,0,27954 Hall Mill Suite 575,San Antonio,...,1960-10-28,b98f12f4168391b2203238813df5aa8c,1325473523,29.273085,-98.83636,1,78039.0,2019-01-02 03:05:23,1960-10-28,58
2546,2019-01-02 03:38:03,4613314721966,162,2,7.03,Jason,Murphy,1,542 Steve Curve Suite 011,Collettsville,...,1988-09-15,397894a5c4c02e3c61c784001f0f14e4,1325475483,35.909292,-82.09101,1,28777.0,2019-01-02 03:38:03,1988-09-15,30
2553,2019-01-02 03:55:47,340187018810220,328,4,275.73,Misty,Hart,0,27954 Hall Mill Suite 575,San Antonio,...,1960-10-28,7863235a750d73a244c07f1fb7f0185a,1325476547,29.786426,-98.68341,1,78006.0,2019-01-02 03:55:47,1960-10-28,58
