In [1]:
# ==============================================================
# Random Forest Classification (RF) Experiment
# ==============================================================
import sys
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

from preprocessing import prepare_data

# Experiment config
# CSV checkpoint that the experiment loop rewrites after every ratio.
partial_save_path = "results/rf_partial_results.csv"
# Metric log: the experiment loop appends one dict per trained model.
results = []
# Seed passed to every RandomForestClassifier so fits are repeatable.
random_state = 42

In [2]:
# ---
# 1. Load Data
# ---
# Both splits are read from relative paths (resolved against the current
# working directory), training file first, then the test file.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("fraudTrain.csv", "fraudTest.csv")
)

print("Loaded train/test data.")

Loaded train/test data.


In [3]:
# ---
# 2. Prepare Data for TREE Models
# ---
# mode="tree" is mandatory for this notebook: it skips scaling and uses
# OrdinalEncoders.

# Fitting pass over the training frame. Only the fitted encoders are kept
# from this call; the resampled training frames are built later, per ratio.
out_train_init = prepare_data(train_df, mode="tree", training=False, fit=True)
encoders = out_train_init["encoders"]
scalers = {}  # Tree mode does not scale, so an empty mapping is passed along

# Transform the TEST frame in the same "tree" mode, reusing the fitted
# encoders instead of refitting them.
out_test = prepare_data(
    test_df,
    mode="tree",
    training=False,
    fit=False,
    encoders=encoders,
    scalers=scalers,
)
df_test = out_test["df"]
y_test = df_test["is_fraud"]
X_test = df_test.drop(columns="is_fraud")

# Sanitize the features in three explicit steps:
X_test = X_test.replace([np.inf, -np.inf], np.nan)  # 1) +/-inf become NaN
X_test = X_test.fillna(0)                           # 2) every NaN becomes 0
X_test = X_test.clip(lower=-1e6, upper=1e6)         # 3) bound to [-1e6, 1e6]

print(f"Test set prepared for trees. Shape: {X_test.shape}")

Test set prepared for trees. Shape: (555719, 13)


In [6]:
# ---
# 3. Define Experiment Parameters
# ---
# Resampling ratios forwarded to prepare_data(ratio=...). None is passed
# through unchanged (presumably "no resampling" -- confirm against
# preprocessing.prepare_data).
ratios_to_test = [None, 0.05, 0.1, 0.2, 0.3, 0.5, 1.0]

# Keys looked up in the dict returned by prepare_data(training=True); the
# experiment loop trains on every one of them that is present and not None.
resample_types_to_test = ["df_up", "df_down"]

# We will test a pruned vs. unpruned forest.
# Each entry is {"name": label stored in the results, "params": kwargs that
# are splatted into RandomForestClassifier(...)}.
# n_estimators=100 is a standard, robust default.
params_to_test = [
    {
        "name": "RF_Depth_10",
        "params": {"n_estimators": 100, "max_depth": 10, "criterion": "gini"},
    },
    {
        "name": "RF_Full_Depth",
        "params": {"n_estimators": 100, "max_depth": None, "criterion": "gini"},
    },
]

# NOTE: there is intentionally no standalone `resample_type = "df_down"` here.
# The experiment loop binds `resample_type` itself while iterating over
# resample_types_to_test, so a fixed value would be dead code that wrongly
# suggests only down-sampling is evaluated.

In [7]:
# ---
# 4. Run Experiment Loop
# ---
# For every resampling ratio: build the tree-mode training frames, fit each
# forest configuration on each available resampled frame, score it on the
# fixed test set (X_test / y_test), and checkpoint all metrics gathered so
# far to `partial_save_path`.

# Start from an empty log so that re-running this cell does not append
# duplicate rows to results left over from a previous execution.
results = []

# Create the checkpoint directory up front. DataFrame.to_csv does not create
# missing parent directories, and discovering that only after the first ratio
# has finished training would throw away several minutes of compute.
Path(partial_save_path).parent.mkdir(parents=True, exist_ok=True)

print("\nStarting Random Forest (FULL) experiment loop...")
for ratio_idx, ratio in enumerate(ratios_to_test, start=1):
    print(f"\n{'='*70}")
    print(
        f"[{datetime.now().strftime('%H:%M:%S')}] Starting ratio {ratio_idx}/{len(ratios_to_test)} → ratio={ratio}"
    )
    start_ratio_time = time.time()

    # Get the "tree-mode" preprocessed data.
    # fit=False: the encoders fitted earlier are reused, never refitted here.
    out_train = prepare_data(
        train_df,
        mode="tree",
        training=True,
        ratio=ratio,
        fit=False,
        encoders=encoders,
        scalers=scalers,
    )

    # NOTE(review): in the recorded run, ratio=None produced identical sample
    # counts and identical metrics for df_up and df_down, so those two frames
    # look like the same unresampled data trained twice -- consider
    # de-duplicating that case. Behavior is left unchanged here.
    for resample_type in resample_types_to_test:
        # prepare_data may omit a resampled frame or return None for it.
        if resample_type not in out_train or out_train[resample_type] is None:
            print(f"  SKIPPING {resample_type} for ratio={ratio} (no data)")
            continue

        df_train = out_train[resample_type]
        X_train = df_train.drop("is_fraud", axis=1)
        y_train = df_train["is_fraud"]

        # Clean inf/-inf values (same sanitizing as applied to X_test):
        # +/-inf -> NaN -> 0, then bound every value to [-1e6, 1e6].
        X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0).clip(-1e6, 1e6)

        print(
            f"  [{datetime.now().strftime('%H:%M:%S')}] → Training on {resample_type} (samples={len(X_train):,})"
        )
        sys.stdout.flush()

        for p_info in params_to_test:
            model_name = p_info["name"]
            params = p_info["params"]
            start_k_time = time.time()

            # end="" keeps the cursor on this line so " done ..." is appended
            # once the model has been fitted and scored.
            print(f"    ⏳ Running {model_name} ...", end="")
            sys.stdout.flush()

            # n_jobs=-1 uses all your CPU cores
            model = RandomForestClassifier(random_state=random_state, n_jobs=-1, **params)
            model.fit(X_train, y_train)

            # Predict: hard labels for the threshold metrics, and the second
            # predict_proba column as the score for ROC AUC.
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1]

            # Metrics
            f1 = f1_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_prob)
            prec = precision_score(y_test, y_pred)
            acc = accuracy_score(y_test, y_pred)

            results.append(
                {
                    "model": model_name,
                    "ratio": ratio,
                    "resample_type": resample_type.replace("df_", ""),  # Clean name
                    "accuracy": acc,
                    "precision": prec,
                    "recall": rec,
                    "f1": f1,
                    "roc_auc": auc,
                }
            )

            print(
                f" done → F1={f1:.4f}, Recall={rec:.4f}, AUC={auc:.4f} | Time={time.time() - start_k_time:.1f}s"
            )
            sys.stdout.flush()

    # Save partial results after each ratio so an interrupted run keeps
    # everything logged up to the last completed ratio.
    pd.DataFrame(results).to_csv(partial_save_path, index=False)
    print(f"  💾 Saved intermediate results → {partial_save_path}")
    print(f"  ✅ Completed ratio={ratio} in {(time.time() - start_ratio_time)/60:.1f} min")

print("\nAll ratios completed.")
print(f"Total experiments logged: {len(results)}")


Starting Random Forest (FULL) experiment loop...

[22:50:20] Starting ratio 1/7 → ratio=None
  [22:50:24] → Training on df_up (samples=1,296,675)
    ⏳ Running RF_Depth_10 ... done → F1=0.7201, Recall=0.5716, AUC=0.9849 | Time=102.4s
    ⏳ Running RF_Full_Depth ... done → F1=0.8168, Recall=0.7100, AUC=0.9771 | Time=117.4s
  [22:54:05] → Training on df_down (samples=1,296,675)
    ⏳ Running RF_Depth_10 ... done → F1=0.7201, Recall=0.5716, AUC=0.9849 | Time=98.4s
    ⏳ Running RF_Full_Depth ... done → F1=0.8168, Recall=0.7100, AUC=0.9771 | Time=117.6s
  💾 Saved intermediate results → results/rf_partial_results.csv
  ✅ Completed ratio=None in 7.3 min

[22:57:41] Starting ratio 2/7 → ratio=0.05
  [22:57:46] → Training on df_up (samples=1,353,627)
    ⏳ Running RF_Depth_10 ... done → F1=0.7597, Recall=0.7110, AUC=0.9872 | Time=124.6s
    ⏳ Running RF_Full_Depth ... done → F1=0.8299, Recall=0.7380, AUC=0.9791 | Time=153.4s
  [23:02:24] → Training o

In [9]:
# ---
# 5. Show Final Results
# ---
# Rank every logged experiment by F1 (best first), show the top 10, and
# persist the full ranked table.
final_save_path = "results/rf_results.csv"

results_df = pd.DataFrame(results)
if results_df.empty:
    # An empty log has no "f1" column, so sort_values(by="f1") would raise a
    # KeyError; report the situation instead of crashing or writing an empty file.
    print("\nNo experiment results were logged; run the experiment loop first.")
else:
    results_df = results_df.sort_values(by="f1", ascending=False)
    print("\n--- Top Performing Random Forest Models ---")
    print(results_df.head(10))

    # DataFrame.to_csv does not create missing parent directories.
    Path(final_save_path).parent.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(final_save_path, index=False)
    print(f"\nSaved final results to {final_save_path}")


--- Top Performing Random Forest Models ---
            model  ratio resample_type  accuracy  precision    recall  \
9   RF_Full_Depth   0.10            up  0.998843   0.949701  0.739394   
17  RF_Full_Depth   0.30            up  0.998830   0.944676  0.740326   
5   RF_Full_Depth   0.05            up  0.998832   0.947904  0.737995   
13  RF_Full_Depth   0.20            up  0.998825   0.952121  0.732401   
21  RF_Full_Depth   0.50            up  0.998820   0.947147  0.735198   
25  RF_Full_Depth   1.00            up  0.998809   0.944278  0.734732   
1   RF_Full_Depth    NaN            up  0.998771   0.961490  0.710023   
3   RF_Full_Depth    NaN          down  0.998771   0.961490  0.710023   
7   RF_Full_Depth   0.05          down  0.998289   0.763926  0.805594   
4     RF_Depth_10   0.05            up  0.998264   0.815508  0.710956   

          f1   roc_auc  
9   0.831455  0.980991  
17  0.830110  0.981786  
5   0.829882  0.979074  
13  0.827931  0.982225  
21  0.827822  0.982123  
2