# 3. Supervised Fraud Detection: Gradient Boosting

This notebook trains a supervised classifier on labeled transactions, where `Class = 1` marks a known fraud case.

Tasks:
1. Load the transaction data (including the anomaly scores).
2. Split the data into train and test sets.
3. Train a Gradient Boosting classifier.
4. Generate fraud probabilities.

In [None]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# File locations, relative to the notebook's working directory.
MODEL_FILE = '../outputs/gb_model.pkl'                     # pickled classifier
OUTPUT_FILE = '../data/transactions_with_predictions.csv'  # scored transactions
INPUT_FILE = '../data/cleaned_transactions_with_iso.csv'   # transactions + anomaly scores

# Read the whole transaction table into memory and report its dimensions.
df = pd.read_csv(INPUT_FILE)
print(f"Data loaded. Shape: {df.shape}")

In [None]:
# Prepare Data
# Columns that never reach the model: the label itself plus 'Time' and 'UserID'.
# All three must be present in df; a missing one raises KeyError.
non_feature_cols = ['Class', 'Time', 'UserID']
features = df.drop(columns=non_feature_cols)

# Design decision: 'anomaly_score' is deliberately kept OUT of the supervised
# features. Feeding it in (stacking) would be a valid alternative, but the
# hybrid fraud score combines fraud_probability and anomaly_score in a later
# step, so the two are kept as independent signals here.
# errors='ignore' makes this a no-op when the column is absent.
X = features.drop(columns=['anomaly_score'], errors='ignore')

# Target: the 'Class' label column.
y = df['Class']

# 70/30 split, stratified on the label so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

In [None]:
# Train Gradient Boosting
# Fixed random_state keeps the fitted model reproducible between runs.
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)

print("Training Gradient Boosting...")
gb.fit(X_train, y_train)

# Evaluate on the held-out split. Without this step X_test / y_test and the
# classification_report import are unused and the model is saved without any
# measurement of its quality on unseen data.
print("Hold-out classification report:")
print(classification_report(y_test, gb.predict(X_test), digits=4))

# Score EVERY row (train and test) so the output CSV covers the full dataset.
# NOTE: probabilities for rows that were part of X_train are in-sample and
# therefore optimistic; only the report above reflects unseen data.
# predict_proba column 1 corresponds to gb.classes_[1] (Class = 1, assuming
# the labels are 0/1).
df['fraud_probability'] = gb.predict_proba(X)[:, 1]

print("Training Complete.")

In [None]:
# Save Model and Data
# Create the destination folders up front: open(..., 'wb') and to_csv() both
# raise if the parent directory is missing, which would otherwise discard the
# freshly trained model on a checkout that has no ../outputs folder yet.
for target in (MODEL_FILE, OUTPUT_FILE):
    Path(target).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier. NOTE: pickle files execute code on load, so
# only unpickle this file from a trusted location.
with open(MODEL_FILE, 'wb') as f:
    pickle.dump(gb, f)

# Write the full table, including the new 'fraud_probability' column, without
# the pandas index.
df.to_csv(OUTPUT_FILE, index=False)
print(f"Predictions saved to {OUTPUT_FILE}")