In [5]:
# -------------------- FULL FRAUD DETECTION PROJECT + DASHBOARD --------------------
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import joblib

from ipywidgets import widgets, VBox, HBox, Output
from IPython.display import display

# -------------------- 1. CREATE OUTPUT FOLDER --------------------
# Every artefact written by this script (CSVs, pickles, PNG plots) goes here.
output_folder = "Fraud_Detection_Output"
os.makedirs(output_folder, exist_ok=True)

# -------------------- 2. LOAD OR CREATE DATA --------------------
# Fall back to synthetic data ONLY when the input file is genuinely missing.
# Any other failure (malformed CSV, permission error, Ctrl-C, ...) must
# surface instead of being silently replaced by random data.
try:
    df = pd.read_csv("transactions.csv")
except FileNotFoundError:
    print("transactions.csv not found. Creating synthetic dataset...")
    n_rows = 5000
    # Fixed seed so the synthetic dataset is reproducible between runs.
    np.random.seed(42)
    # NOTE: every column, including the "fraud" label, is drawn independently
    # at random, so no model can be expected to beat chance on this data.
    df = pd.DataFrame({
        "transaction_id": range(1, n_rows + 1),
        "amount": np.random.uniform(1, 5000, n_rows),
        "age": np.random.randint(18, 75, n_rows),
        "is_international": np.random.randint(0, 2, n_rows),
        "merchant_cat": np.random.randint(1, 20, n_rows),
        "hour": np.random.randint(0, 24, n_rows),
        "device_change": np.random.randint(0, 2, n_rows),
        "fraud": np.random.randint(0, 2, n_rows),
    })
else:
    print("Loaded dataset successfully!")

# -------------------- 3. CREATE AGE GROUP --------------------
def get_age_group(age):
    """Return the age-bracket label for ``age``.

    Brackets are checked in ascending order and the first one whose
    exclusive upper bound is greater than ``age`` wins.  Anything below 25
    (including values under 18) is labelled "18-24"; anything that is not
    below 65 is labelled "65+".
    """
    # (exclusive upper bound, label) pairs, in ascending order.
    brackets = (
        (25, "18-24"),
        (35, "25-34"),
        (45, "35-44"),
        (55, "45-54"),
        (65, "55-64"),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return "65+"

# Label every transaction with its age bracket.
df["age_group"] = df["age"].map(get_age_group)

# -------------------- 4. FRAUD % BY HOUR --------------------
# Mean of the 0/1 fraud flag per hour, i.e. a fraction in [0, 1] (despite the
# "percent" in the column name), attached to every row sharing that hour.
fraud_rate_per_hour = df.groupby("hour")["fraud"].mean()
hourly_stats = fraud_rate_per_hour.rename("fraud_percent_by_hour").reset_index()
df = pd.merge(df, hourly_stats, on="hour", how="left")

# -------------------- 5. SAVE BASE CSVs --------------------
# Enriched transaction table plus the per-hour summary, both without the index.
powerbi_csv_path = f"{output_folder}/fraud_data_for_powerbi.csv"
hourly_csv_path = f"{output_folder}/fraud_by_hour.csv"
df.to_csv(powerbi_csv_path, index=False)
hourly_stats.to_csv(hourly_csv_path, index=False)
print("Saved: fraud_data_for_powerbi.csv & fraud_by_hour.csv")

# -------------------- 6. PREPROCESS & TRAIN MODELS --------------------
# Features: everything except the label, the row identifier and the two
# derived columns (age_group is a string; fraud_percent_by_hour is computed
# from the label itself and would leak the target).
X = df.drop(["fraud", "transaction_id", "age_group", "fraud_percent_by_hour"], axis=1)
y = df["fraud"]

# Split BEFORE scaling so the scaler never sees the held-out rows; fitting it
# on the full dataset would leak test-set statistics into training.
# stratify=y keeps the fraud / non-fraud ratio equal in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std on training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics
# Kept for backward compatibility with code that expects the fully scaled matrix.
X_scaled = scaler.transform(X)

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=150, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=120, learning_rate=0.1, max_depth=5, eval_metric='logloss')
}

trained_models = {}
roc_scores = {}

# Fit every candidate and score it by ROC-AUC on the held-out partition.
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model
    prob = model.predict_proba(X_test)[:, 1]  # probability of the positive (fraud) class
    roc_scores[name] = roc_auc_score(y_test, prob)

# The model with the highest ROC-AUC wins.
best_model_name = max(roc_scores, key=roc_scores.get)
best_model = trained_models[best_model_name]
print(f"Best Model: {best_model_name} | ROC-AUC: {roc_scores[best_model_name]:.4f}")

joblib.dump(best_model, f"{output_folder}/fraud_detection_model.pkl")
print(f"Saved model: {output_folder}/fraud_detection_model.pkl")

# The model was trained on scaled inputs, so the fitted scaler has to be
# persisted too; without it the pickled model cannot be applied to raw data.
joblib.dump(scaler, f"{output_folder}/fraud_detection_scaler.pkl")
print(f"Saved scaler: {output_folder}/fraud_detection_scaler.pkl")

# -------------------- 7. SAVE PREDICTIONS CSV --------------------
# Hard class labels and fraud-class probabilities for the held-out rows.
pred_test = best_model.predict(X_test)
prob_test = best_model.predict_proba(X_test)[:, 1]

# Columns exported next to the predictions, in output order.
prediction_columns = [
    "transaction_id", "amount", "age", "age_group",
    "is_international", "merchant_cat", "hour",
    "device_change", "fraud", "fraud_percent_by_hour",
]

# y_test keeps the original row labels, so they select the matching rows of df.
predictions_df = df.loc[y_test.index, prediction_columns].copy().assign(
    predicted_fraud=pred_test,
    fraud_probability=prob_test,
)
predictions_csv_path = f"{output_folder}/fraud_predictions.csv"
predictions_df.to_csv(predictions_csv_path, index=False)
print("Saved: fraud_predictions.csv")

# -------------------- 8. SAVE FEATURE IMPORTANCE CSV (ALL MODELS) --------------------
# Build one importance value per column of X, whichever model type won.
if isinstance(best_model, XGBClassifier):
    # The models are fitted on the scaler's NumPy output, so the booster has
    # no column names and reports scores under positional keys "f0", "f1", ...
    # get_score() also leaves out features that were never used in a split.
    # Map the keys back to the real column names and give unused features 0
    # so the output has the same shape as for the other model types.
    split_counts = best_model.get_booster().get_score(importance_type='weight')
    importance_values = [
        split_counts.get(f"f{position}", split_counts.get(column, 0))
        for position, column in enumerate(X.columns)
    ]
elif isinstance(best_model, RandomForestClassifier):
    importance_values = best_model.feature_importances_
else:  # Logistic Regression
    # Magnitude of each coefficient (inputs are standardised before fitting).
    importance_values = np.abs(best_model.coef_[0])

feat_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": importance_values
}).sort_values(by="importance", ascending=False)

feat_importance.to_csv(f"{output_folder}/feature_importance.csv", index=False)
print("Saved: feature_importance.csv")

# -------------------- 9. INTERACTIVE DASHBOARD --------------------
# Output area that every plot callback clears and draws into.
out = Output()

# "All" first, followed by the age brackets present in the data, sorted.
age_group_options = ["All"] + sorted(df["age_group"].unique())
age_dropdown = widgets.Dropdown(
    description="Age Group:",
    options=age_group_options,
    value="All",
)

# Class filter; read only by the amount-distribution plot.
fraud_dropdown = widgets.Dropdown(
    description="Class:",
    options=["All", "Fraud", "Non-Fraud"],
    value="All",
)

def filtered_df(age_group):
    """Return a copy of the global ``df`` restricted to ``age_group``.

    ``"All"`` selects every row; any other value keeps only the rows whose
    ``age_group`` column equals it.  A copy is always returned, so callers
    may modify the result without touching ``df``.
    """
    selection = df if age_group == "All" else df[df["age_group"] == age_group]
    return selection.copy()

def save_and_show_plot(fig, filename):
    """Save ``fig`` as ``<output_folder>/<filename>.png``, then display it.

    ``filename`` is given without extension.  The figure is saved with a
    tight bounding box before ``plt.show()`` is called.
    """
    png_path = f"{output_folder}/{filename}.png"
    fig.savefig(png_path, bbox_inches='tight')
    plt.show()

def plot_fraud_count(age_group):
    """Draw a count plot of the ``fraud`` column for the chosen age group.

    The plot replaces whatever is currently shown in the ``out`` widget and
    is also saved as ``fraud_count_<age_group>.png``.
    """
    with out:
        out.clear_output(wait=True)
        subset = filtered_df(age_group)
        figure, axes = plt.subplots(figsize=(6, 4))
        sns.countplot(data=subset, x="fraud", ax=axes)
        title = f"Fraud vs Non-Fraud Count - Age Group: {age_group}"
        axes.set_title(title)
        save_and_show_plot(figure, f"fraud_count_{age_group}")

def plot_correlation(age_group):
    """Draw an annotated correlation heatmap of the numeric columns.

    Only rows of the chosen age group are used.  The plot replaces the
    content of the ``out`` widget and is saved as
    ``correlation_heatmap_<age_group>.png``.
    """
    with out:
        out.clear_output(wait=True)
        subset = filtered_df(age_group)
        # Non-numeric columns (e.g. the age_group label) are excluded.
        numeric_columns = subset.select_dtypes(include=np.number).columns
        correlations = subset[numeric_columns].corr()
        figure, axes = plt.subplots(figsize=(10, 6))
        sns.heatmap(correlations, annot=True, cmap="Blues", ax=axes)
        axes.set_title(f"Correlation Heatmap - Age Group: {age_group}")
        save_and_show_plot(figure, f"correlation_heatmap_{age_group}")

def plot_amount_distribution(age_group, fraud_class):
    """Draw a filled KDE of transaction amounts for the chosen filters.

    ``fraud_class`` is ``"All"`` (one curve per fraud value, coloured by
    ``fraud``), ``"Fraud"`` (only rows with ``fraud == 1``) or anything else
    (only rows with ``fraud == 0``).  The plot replaces the content of the
    ``out`` widget and is saved as
    ``amount_distribution_<fraud_class>_<age_group>.png``.
    """
    with out:
        out.clear_output(wait=True)
        subset = filtered_df(age_group)
        figure, axes = plt.subplots(figsize=(8, 5))

        # Only the data source and the hue differ between the two modes.
        if fraud_class == "All":
            kde_options = {"data": subset, "hue": "fraud"}
        else:
            wanted_label = 1 if fraud_class == "Fraud" else 0
            kde_options = {"data": subset[subset["fraud"] == wanted_label]}
        sns.kdeplot(x="amount", fill=True, ax=axes, **kde_options)

        axes.set_title(f"Transaction Amount Distribution - {fraud_class} - Age Group: {age_group}")
        save_and_show_plot(figure, f"amount_distribution_{fraud_class}_{age_group}")

def plot_feature_importance(age_group):
    """Draw a bar chart of the global ``feat_importance`` table.

    ``age_group`` appears only in the title and in the output file name
    (``feature_importance_<age_group>.png``); the plotted values themselves
    are not filtered by age group.
    """
    with out:
        out.clear_output(wait=True)
        figure, axes = plt.subplots(figsize=(10, 5))
        sns.barplot(data=feat_importance, x="feature", y="importance", palette="Greens", ax=axes)
        title = f"Feature Importance - {best_model_name} - Age Group: {age_group}"
        axes.set_title(title)
        # Tilt the feature names on the x axis by 45 degrees.
        tick_labels = axes.get_xticklabels()
        axes.set_xticklabels(tick_labels, rotation=45)
        save_and_show_plot(figure, f"feature_importance_{age_group}")

def plot_fraud_by_hour(age_group):
    """Draw the fraud percentage per transaction hour for the chosen age group.

    The per-hour mean of the 0/1 ``fraud`` flag is scaled by 100 so that the
    plotted values really are percentages, matching the "Fraud %" title and
    axis label.  The plot replaces the content of the ``out`` widget and is
    saved as ``fraud_by_hour_<age_group>.png``.
    """
    with out:
        out.clear_output(wait=True)
        data = filtered_df(age_group)
        # mean() of a 0/1 flag is a fraction in [0, 1]; convert it to percent.
        hourly = data.groupby("hour")["fraud"].mean().mul(100).reset_index()
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.barplot(data=hourly, x="hour", y="fraud", palette="Reds", ax=ax)
        ax.set_title(f"Fraud % by Transaction Hour - Age Group: {age_group}")
        ax.set_ylabel("Fraud %")
        save_and_show_plot(fig, f"fraud_by_hour_{age_group}")

def plot_confusion_matrix(age_group):
    """Draw the best model's confusion matrix on the held-out test set.

    The matrix is computed from the global ``y_test`` and ``pred_test`` and
    is therefore the same for every age group; ``age_group`` only selects
    the output file name (``confusion_matrix_<age_group>.png``).
    """
    with out:
        out.clear_output(wait=True)
        matrix = confusion_matrix(y_test, pred_test)
        figure, axes = plt.subplots(figsize=(5, 4))
        # fmt="d" prints the cell counts as plain integers.
        sns.heatmap(matrix, annot=True, fmt="d", cmap="Greens", ax=axes)
        axes.set_xlabel("Predicted")
        axes.set_ylabel("Actual")
        axes.set_title(f"Confusion Matrix - {best_model_name}")
        save_and_show_plot(figure, f"confusion_matrix_{age_group}")

# Buttons: one per plot type.
button_fraud_count = widgets.Button(description="Fraud Count")
button_correlation = widgets.Button(description="Correlation Heatmap")
button_amount_dist = widgets.Button(description="Amount Distribution")
button_feature_importance = widgets.Button(description="Feature Importance")
button_fraud_hour = widgets.Button(description="Fraud % by Hour")
button_conf_matrix = widgets.Button(description="Confusion Matrix")

# Each button paired with its click handler, in display order.  The handlers
# read the dropdowns at click time, so they always use the current selection.
button_actions = [
    (button_fraud_count, lambda _btn: plot_fraud_count(age_dropdown.value)),
    (button_correlation, lambda _btn: plot_correlation(age_dropdown.value)),
    (button_amount_dist, lambda _btn: plot_amount_distribution(age_dropdown.value, fraud_dropdown.value)),
    (button_feature_importance, lambda _btn: plot_feature_importance(age_dropdown.value)),
    (button_fraud_hour, lambda _btn: plot_fraud_by_hour(age_dropdown.value)),
    (button_conf_matrix, lambda _btn: plot_confusion_matrix(age_dropdown.value)),
]
for _button, _action in button_actions:
    _button.on_click(_action)

# Layout: the dropdown row on top, the buttons stacked below it, and the
# shared output area underneath.
dropdown_row = HBox([age_dropdown, fraud_dropdown])
controls = VBox([dropdown_row] + [_button for _button, _ in button_actions])

dashboard = VBox([controls, out])
display(dashboard)


transactions.csv not found. Creating synthetic dataset...
Saved: fraud_data_for_powerbi.csv & fraud_by_hour.csv
Best Model: Logistic Regression | ROC-AUC: 0.5190
Saved model: Fraud_Detection_Output/fraud_detection_model.pkl
Saved: fraud_predictions.csv
Saved: feature_importance.csv


VBox(children=(VBox(children=(HBox(children=(Dropdown(description='Age Group:', options=('All', '18-24', '25-3…