In [1]:
# ==============================================================
# Decision Tree Classification (DTC) Experiment
# ==============================================================
import sys
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.tree import DecisionTreeClassifier

from preprocessing import prepare_data

# Shared experiment settings read by the cells below.
partial_save_path = "results/dtc_partial_results.csv"  # checkpoint CSV rewritten after each ratio
results = list()  # one metrics dict is appended per (ratio, model) run
random_state = 42  # seed handed to every DecisionTreeClassifier

In [2]:
# ---
# 1. Load Data
# ---
# Read the train CSV first, then the test CSV, both from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("fraudTrain.csv", "fraudTest.csv")
)

print("Loaded train/test data.")

Loaded train/test data.


In [3]:
# ---
# 2. Prepare Data for TREE Models (CRITICAL STEP)
# ---
# Fitting pass over the training frame in "tree" mode. Only the fitted encoders
# are kept from this call; every later prepare_data call reuses them so that
# train and test share one encoding.
# (Original author's note: "tree" mode skips scaling and uses OrdinalEncoders.)
out_train_init = prepare_data(train_df, mode="tree", training=False, fit=True)
encoders = out_train_init["encoders"]

# Empty placeholder passed through to prepare_data; per the original note,
# "tree" mode neither uses nor returns scalers.
scalers = dict()

# Transform-only pass over the test frame, using the encoders fitted above.
out_test = prepare_data(
    test_df,
    mode="tree",
    training=False,
    fit=False,
    encoders=encoders,
    scalers=scalers,
)
df_test = out_test["df"]

# Separate the label column from the feature columns.
y_test = df_test["is_fraud"]
X_test = df_test.drop(columns=["is_fraud"])

# Sanitize features: +/-inf -> NaN -> 0, then bound every value to [-1e6, 1e6].
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)
X_test = X_test.clip(lower=-1e6, upper=1e6)

print(f"Test set prepared for trees. Shape: {X_test.shape}")

Test set prepared for trees. Shape: (555719, 13)


In [4]:
# ---
# 3. Define Experiment Parameters
# ---
# Only the down-sampled training frame is used; the original author notes that
# df_up was not worth the extra run time.
resample_type = "df_down"

# Ratio values forwarded to prepare_data, one experiment round per value.
ratios_to_test = [0.1, 0.5, 1.0]

# Decision-tree configurations to evaluate: (label, split criterion, depth cap).
# A depth of None lets the tree grow without a depth limit.
params_to_test = [
    {"name": label, "params": {"criterion": criterion, "max_depth": depth}}
    for label, criterion, depth in (
        ("DTC_Gini_Full", "gini", None),
        ("DTC_Entropy_Full", "entropy", None),
        ("DTC_Gini_Depth_10", "gini", 10),
    )
]

In [5]:
# ---
# 4. Run Experiment Loop
# ---
# For each class ratio: build the resampled "tree"-mode training frame with the
# already-fitted encoders, fit every configured decision tree, score it on the
# fixed test set, and checkpoint all metrics collected so far to CSV.

# Start from an empty log so that re-running this cell does not append duplicate
# rows to `results` (the list object itself is created in the config cell).
results.clear()

# DataFrame.to_csv does not create missing parent directories; create the output
# folder up front rather than failing after the first ratio has been trained.
Path(partial_save_path).parent.mkdir(parents=True, exist_ok=True)

print("\nStarting Decision Tree experiment loop...")
for ratio_idx, ratio in enumerate(ratios_to_test, start=1):
    print(f"\n{'='*70}")
    print(
        f"[{datetime.now().strftime('%H:%M:%S')}] Starting ratio {ratio_idx}/{len(ratios_to_test)} → ratio={ratio}"
    )
    start_ratio_time = time.time()

    # Resampled training data for this ratio. fit=False makes prepare_data reuse
    # the encoders fitted earlier instead of refitting them.
    out_train = prepare_data(
        train_df,
        mode="tree",
        training=True,
        ratio=ratio,
        fit=False,
        encoders=encoders,
        scalers=scalers,
    )

    if resample_type not in out_train or out_train[resample_type] is None:
        # Report the skip so a missing frame is not mistaken for a finished run.
        print(
            f"  Skipping ratio={ratio}: prepare_data returned no '{resample_type}' frame."
        )
        continue

    df_train = out_train[resample_type]
    X_train = df_train.drop("is_fraud", axis=1)
    y_train = df_train["is_fraud"]

    # Same sanitisation as the test set: +/-inf -> NaN -> 0, then clip to [-1e6, 1e6].
    X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0).clip(-1e6, 1e6)

    print(
        f"  [{datetime.now().strftime('%H:%M:%S')}] → Training on {resample_type} (samples={len(X_train):,})",
        flush=True,
    )

    for p_info in params_to_test:
        model_name = p_info["name"]
        params = p_info["params"]
        start_k_time = time.time()

        # No newline: the " done ..." summary below completes this line.
        print(f"    ⏳ Running {model_name} ...", end="", flush=True)

        model = DecisionTreeClassifier(random_state=random_state, **params)
        model.fit(X_train, y_train)

        # Hard labels feed the threshold metrics; the second predict_proba
        # column (index 1) is the score used for ROC AUC.
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        # Metrics
        f1 = f1_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)
        prec = precision_score(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)

        results.append(
            {
                "model": model_name,
                "ratio": ratio,
                "resample_type": resample_type,
                "accuracy": acc,
                "precision": prec,
                "recall": rec,
                "f1": f1,
                "roc_auc": auc,
            }
        )

        print(
            f" done → F1={f1:.4f}, Recall={rec:.4f}, AUC={auc:.4f} | Time={time.time() - start_k_time:.1f}s",
            flush=True,
        )

    # Checkpoint everything logged so far after each ratio.
    pd.DataFrame(results).to_csv(partial_save_path, index=False)
    print(f"  💾 Saved intermediate results → {partial_save_path}")
    print(f"  ✅ Completed ratio={ratio} in {(time.time() - start_ratio_time)/60:.1f} min")

print("\nAll ratios completed.")
print(f"Total experiments logged: {len(results)}")


Starting Decision Tree experiment loop...

[22:39:13] Starting ratio 1/3 → ratio=0.1
  [22:39:16] → Training on df_down (samples=82,566)
    ⏳ Running DTC_Gini_Full ... done → F1=0.4513, Recall=0.9082, AUC=0.9500 | Time=0.6s
    ⏳ Running DTC_Entropy_Full ... done → F1=0.4649, Recall=0.9035, AUC=0.9479 | Time=0.6s
    ⏳ Running DTC_Gini_Depth_10 ... done → F1=0.5626, Recall=0.9086, AUC=0.9888 | Time=0.6s
  💾 Saved intermediate results → results/dtc_partial_results.csv
  ✅ Completed ratio=0.1 in 0.1 min

[22:39:18] Starting ratio 2/3 → ratio=0.5
  [22:39:22] → Training on df_down (samples=22,518)
    ⏳ Running DTC_Gini_Full ... done → F1=0.2581, Recall=0.9473, AUC=0.9632 | Time=0.3s
    ⏳ Running DTC_Entropy_Full ... done → F1=0.2409, Recall=0.9506, AUC=0.9638 | Time=0.3s
    ⏳ Running DTC_Gini_Depth_10 ... done → F1=0.3022, Recall=0.9501, AUC=0.9821 | Time=0.3s
  💾 Saved intermediate results → results/dtc_partial_results.csv
  ✅ Completed

In [6]:
# ---
# 5. Show Final Results
# ---
# Rank every logged run by F1 (best first), show the top rows, and persist the
# full ranked table.
final_save_path = "results/dtc_results.csv"

results_df = pd.DataFrame(results)
if results_df.empty:
    # An empty `results` list yields a frame with no columns, on which
    # sort_values(by="f1") raises KeyError (e.g. when every ratio was skipped).
    print("\nNo experiment results were logged; nothing to rank or save.")
else:
    results_df = results_df.sort_values(by="f1", ascending=False)
    print("\n--- Top Performing Decision Tree Models ---")
    print(results_df.head(10))

    # DataFrame.to_csv does not create missing parent directories.
    Path(final_save_path).parent.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(final_save_path, index=False)
    print(f"\nSaved final results to {final_save_path}")


--- Top Performing Decision Tree Models ---
               model  ratio resample_type  accuracy  precision    recall  \
2  DTC_Gini_Depth_10    0.1       df_down  0.994548   0.407485  0.908625   
1   DTC_Entropy_Full    0.1       df_down  0.991973   0.312984  0.903497   
0      DTC_Gini_Full    0.1       df_down  0.991478   0.300293  0.908159   
5  DTC_Gini_Depth_10    0.5       df_down  0.983065   0.179686  0.950117   
3      DTC_Gini_Full    0.5       df_down  0.978980   0.149412  0.947319   
4   DTC_Entropy_Full    0.5       df_down  0.976880   0.137947  0.950583   
8  DTC_Gini_Depth_10    1.0       df_down  0.972067   0.118164  0.965035   
6      DTC_Gini_Full    1.0       df_down  0.970255   0.111363  0.960839   
7   DTC_Entropy_Full    1.0       df_down  0.969911   0.110226  0.960839   

         f1   roc_auc  
2  0.562644  0.988815  
1  0.464915  0.947906  
0  0.451344  0.949980  
5  0.302217  0.982113  
3  0.258114  0.963211  
4  0.240931  0.963783  
8  0.210548  0.983357  
6 