## K-Nearest Neighbors (KNN)

In [1]:
import os
import sys
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.neighbors import KNeighborsClassifier

from preprocessing import prepare_data

# Shared notebook configuration.
# NOTE(review): `random_state` is not referenced by any code visible in this
# notebook -- confirm whether prepare_data is supposed to receive it.
random_state = 42
# The experiment loop appends one dict of metrics per fitted model to this list.
results = []
# Intermediate results are checkpointed to this CSV at the end of each ratio.
partial_save_path = "results/knn_partial_results.csv"

# Create the output directory up front: DataFrame.to_csv and plt.savefig do not
# create missing parent directories, so without this the first checkpoint would
# raise only after a long training run has already completed.
os.makedirs(os.path.dirname(partial_save_path), exist_ok=True)

### 1. Load Data

In [2]:
# Read the train and test CSV files from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("fraudTrain.csv", "fraudTest.csv")
)

### 2. Prepare Data for DISTANCE Models

In [3]:

# Fit pass: run prepare_data once over the training frame with fit=True and
# keep the "encoders" / "scalers" entries it returns for every later call.
print("Fitting encoders and scalers on train data...")
out_train_init = prepare_data(train_df, mode="distance", training=False, fit=True)
encoders, scalers = out_train_init["encoders"], out_train_init["scalers"]
print("Fit complete.")

# Test pass: hand the stored encoders/scalers back in with fit=False.
print("Preparing test set...")
out_test = prepare_data(
    test_df,
    encoders=encoders,
    scalers=scalers,
    fit=False,
    training=False,
    mode="distance",
)
df_test = out_test["df"]
y_test = df_test["is_fraud"]
X_test = df_test.drop(columns="is_fraud")

# Sanitise the features step by step: +/-inf -> NaN, NaN -> 0, then bound
# every value to the interval [-1e6, 1e6].
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X_test = X_test.fillna(0)
X_test = X_test.clip(-1e6, 1e6)

print(f"Test set prepared for distance models. Shape: {X_test.shape}")

Fitting encoders and scalers on train data...
Fit complete.
Preparing test set...
Test set prepared for distance models. Shape: (555719, 25)


### 3. Define Experiment Parameters

In [None]:
# Experiment grid consumed by the loop in the next section.
# `ratio` values forwarded to prepare_data; None is one of the tested settings.
ratios_to_test = [
    None,
    0.1,
    0.5,
    1.0,
]
# Keys looked up in the dict returned by prepare_data.
resample_types_to_test = [f"df_{direction}" for direction in ("up", "down")]
# Odd neighbour counts 3, 5, 7.
k_values_to_test = list(range(3, 8, 2))

# Base estimator settings; n_neighbors is filled in per run by the loop.
param = dict(
    name="KNeighborsClassifier",
    params=dict(algorithm="ball_tree", n_jobs=-1),
)

### 4. Run Experiment Loop

In [None]:
# Grid over resampling ratio x resampling frame x k.
# Every model is scored on the same prepared test split (X_test / y_test) and
# one dict of metrics per run is appended to the shared `results` list.
# After each ratio the accumulated results are checkpointed to partial_save_path.
print("\nStarting KNN experiment loop...")
for ratio_idx, ratio in enumerate(ratios_to_test, start=1):
    print(f"\n{'='*70}")
    print(
        f"[{datetime.now().strftime('%H:%M:%S')}] Starting ratio {ratio_idx}/{len(ratios_to_test)} → ratio={ratio}"
    )
    start_ratio_time = time.time()

    # The stored encoders/scalers are passed back in with fit=False; presumably
    # prepare_data reuses them instead of refitting -- confirm in preprocessing.
    out_train = prepare_data(
        train_df,
        mode="distance",
        training=True,
        ratio=ratio,
        fit=False,
        encoders=encoders,
        scalers=scalers,
    )

    for resample_type in resample_types_to_test:
        # Skip frames that prepare_data did not return (or returned as None).
        if resample_type not in out_train or out_train[resample_type] is None:
            continue

        df_train = out_train[resample_type]
        X_train = df_train.drop("is_fraud", axis=1)
        y_train = df_train["is_fraud"]

        # Same sanitisation as applied to the test features:
        # +/-inf -> NaN -> 0, then clip to [-1e6, 1e6].
        X_train = X_train.replace([np.inf, -np.inf], np.nan).fillna(0).clip(-1e6, 1e6)

        print(
            f"  [{datetime.now().strftime('%H:%M:%S')}] → Training on {resample_type} (samples={len(X_train):,})"
        )
        sys.stdout.flush()

        for k_idx, k in enumerate(k_values_to_test, start=1):

            model_name = f"{param['name']}_k={k}"
            # Copy so the shared base params are not mutated between runs.
            params = param["params"].copy()
            params["n_neighbors"] = k

            start_k_time = time.time()

            # end="" keeps the cursor on this line so the metrics are appended to it.
            print(f"    ⏳ Running {model_name} [{k_idx}/{len(k_values_to_test)}] ...", end="")
            sys.stdout.flush()

            model = KNeighborsClassifier(**params)

            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)

            # Metrics on the held-out test split.
            f1 = f1_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            acc = accuracy_score(y_test, y_pred)

            results.append(
                {
                    "model": "KNN",
                    "k": k,
                    "ratio": ratio,
                    "resample_type": resample_type.replace("df_", ""),
                    "accuracy": acc,
                    "precision": prec,
                    "recall": rec,
                    "f1": f1,
                }
            )

            # Report only metrics that are actually computed above. No AUC is
            # calculated in this loop, so formatting one raised NameError.
            print(
                f" done → F1={f1:.4f}, Recall={rec:.4f}, Precision={prec:.4f} | Time={time.time() - start_k_time:.1f}s"
            )
            sys.stdout.flush()

    # Checkpoint everything logged so far (all ratios up to and including this one).
    pd.DataFrame(results).to_csv(partial_save_path, index=False)
    print(f"  💾 Saved intermediate results → {partial_save_path}")
    print(f"  ✅ Completed ratio={ratio} in {(time.time() - start_ratio_time)/60:.1f} min")


Starting KNN experiment loop...

[20:48:57] Starting ratio 1/4 → ratio=None
  [20:49:08] → Training on df_up (samples=1,296,675)
    ⏳ Running KNeighborsClassifier_k=3 [1/3] ...

In [None]:
# ---
# 5. Show Final Results
# ---
# Rank every logged experiment by F1 (best first), print the ten best rows,
# then write the complete ranked table to disk.
print("\nAll ratios completed.")
print(f"Total experiments logged: {len(results)}")

results_df = pd.DataFrame(results)
results_df = results_df.sort_values("f1", ascending=False)

print("\n--- Top Performing KNN Models ---")
print(results_df.iloc[:10])

results_df.to_csv("results/knn_results.csv", index=False)
print("\nSaved final results to results/knn_results.csv")

### 6. Plot Results

In [None]:
print("\nGenerating plots...")
plt.figure(figsize=(9, 6))

# Arguments common to both sets of curves: same frame, same x-axis, and the
# same hue (resample type) / style (k) groupings.
_shared_line_kwargs = dict(
    data=results_df,
    x="ratio",
    hue="resample_type",
    style="k",
    linewidth=2,
)

# F1 curves with circle markers; this call is the one that contributes legend entries.
sns.lineplot(y="f1", marker="o", **_shared_line_kwargs)

# Recall curves on the same axes with x markers and linestyle="--" requested;
# legend=False avoids duplicate legend entries.
sns.lineplot(y="recall", marker="x", linestyle="--", legend=False, **_shared_line_kwargs)

plt.gca().set(
    title="F1 and Recall vs Resampling Ratio (KNN)",
    xlabel="Fraud:Legit Ratio",
    ylabel="Score",
)
plt.grid(True, alpha=0.3)
plt.legend(title="Resample / k")
# Save before show() so the written file contains the rendered figure.
plt.savefig("results/knn_f1_recall_plot.png")
plt.show()

# ---- Summary Table ----
# Mean precision / recall / F1 for each (k, ratio, resample_type) combination,
# rounded to three decimals.
# dropna=False keeps groups whose key is missing: the ratio=None baseline runs
# end up as a missing value in the "ratio" column, and groupby discards such
# keys by default, which silently removed the baseline from this table.
summary = (
    results_df
    .groupby(["k", "ratio", "resample_type"], dropna=False)[["precision", "recall", "f1"]]
    .mean()
    .round(3)
)
print("\n--- Mean Scores Summary ---")
print(summary)

print("\nPlots and summary complete.")