# In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler

# Synthetic fraud-detection data: four random transaction features plus a
# rule-based binary label.
#
# Every draw below consumes the seeded global NumPy random stream, so the
# order of the np.random calls is part of the dataset definition -- reordering
# them produces different data.
n_samples = 1000
np.random.seed(42)

transaction_amount = np.random.exponential(scale=100, size=n_samples)
time_of_day = np.random.uniform(0, 24, size=n_samples)
location_risk = np.random.uniform(0, 1, size=n_samples)
merchant_category = np.random.uniform(0, 10, size=n_samples)
label_noise = np.random.randn(n_samples)

# Boolean risk flags; each one adds a fixed weight to the fraud score.
large_amount = transaction_amount > 200
late_night = time_of_day > 22
risky_location = location_risk > 0.7

# Weighted sum of the flags plus Gaussian noise.  Despite the name this is an
# unbounded score (the noise can push it below zero), not a true probability.
# merchant_category does not contribute to the score.
fraud_probability = (
    0.1 * large_amount
    + 0.15 * late_night
    + 0.2 * risky_location
    + 0.05 * label_noise
)

# A transaction is labelled as fraud (1) when its score exceeds 0.3.
is_fraud = fraud_probability > 0.3
fraud = is_fraud.astype(int)

df = pd.DataFrame(
    dict(
        transaction_amount=transaction_amount,
        time_of_day=time_of_day,
        location_risk=location_risk,
        merchant_category=merchant_category,
        fraud=fraud,
    )
)

# Separate the feature columns from the binary fraud label.
X = df.drop('fraud', axis=1)
y = df['fraud']

# Split BEFORE scaling so the held-out rows never influence preprocessing.
# stratify=y keeps the fraud / non-fraud ratio equal in both partitions; the
# labelling rule generally needs more than one risk flag to fire, so positives
# are expected to be rare and an unstratified 20% split could misrepresent
# them.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows.  Fitting on the full dataset would leak test-set mean/variance
# into the features the model is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix in the same scaled space, kept so any later code that
# refers to X_scaled keeps working.  The scaler has only ever seen training
# rows, so this does not reintroduce leakage.
X_scaled = scaler.transform(X)

# Shallow random forest; random_state pins the bootstrap/feature sampling so
# reruns give the same model.
model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Single test set accuracy: {test_accuracy:.4f}")

# Accuracy of always predicting the more common class, printed for context:
# with rare positives a high accuracy alone says little about fraud recall.
positive_rate = y_test.mean()
baseline_accuracy = max(positive_rate, 1 - positive_rate)
print(f"Majority-class baseline accuracy: {baseline_accuracy:.4f}")

# Cross-validate on the training partition only; the test split stays untouched.
# StratifiedKFold keeps the class ratio of y_train in every fold (plain KFold
# can produce folds with very few fraud rows), and five folds give a steadier
# estimate than three.  cross_val_score fits clones of `model`, so the
# estimator fitted earlier is not modified.
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')

# Keep the per-fold metrics and their summary instead of only printing them,
# so they can be inspected, compared, or saved later.
cv_results = pd.DataFrame({
    'fold': np.arange(1, len(scores) + 1),
    'accuracy': scores,
})
cv_summary = {
    'n_splits': n_splits,
    'mean_accuracy': scores.mean(),
    # Sample standard deviation (ddof=1): the folds are a sample of possible splits.
    'std_accuracy': scores.std(ddof=1),
}

print(f"Cross-validation scores: {scores}")
print(f"Mean accuracy: {scores.mean():.4f}")
print(f"Std accuracy: {cv_summary['std_accuracy']:.4f}")
