# **Fraud Detection**

In [None]:
# Standard Imports
from typing import Callable
from functools import partial
from pprint import pprint

# Third Party Imports
import optuna
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import (
    train_test_split,
    cross_val_predict,
    cross_val_score,
    StratifiedKFold,
)
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    log_loss,
    brier_score_loss,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    balanced_accuracy_score,
    precision_recall_curve,
)
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from optuna.importance import get_param_importances

## *Read data*

In [None]:
# Load the credit-card transactions CSV with polars.
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
creditcard_csv_path = r"D:\Codebase\fraud-detection\data\input\creditcard.csv"
df = pl.read_csv(
    creditcard_csv_path,
    infer_schema_length=1_000_000,  # rows scanned for dtype inference
    ignore_errors=False,  # do not skip over parse errors
)

# Show the loaded frame as the cell output
df

# **Data Engineering**

## *Data Split*

In [None]:
# Separate the feature columns from the "Class" target and convert both to pandas
x = df.drop("Class").to_pandas()
y = df.get_column("Class").to_pandas()

# Declare the types of the four splits before unpacking them
X_train: pd.DataFrame
X_test: pd.DataFrame
y_train: pd.Series
y_test: pd.Series

# Stratified hold-out split: 20% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)

# Class distribution of the training target: absolute counts, then percentages
train_class_counts = y_train.value_counts(normalize=False)
train_class_share_pct = y_train.value_counts(normalize=True).round(4) * 100
display(train_class_counts)
display(train_class_share_pct)

# Human-readable names for the two target values
target_mapping = {0: "Legitimate", 1: "Fraudulent"}

# **Modelling**

## Helper Functions

In [None]:
def better_classification_report(
    model,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    threshold: float = 0.5,
) -> np.ndarray:
    """Print a cross-validated classification report and return the probabilities.

    The model is evaluated with 5-fold stratified cross validation
    (``cross_val_predict`` with ``method="predict_proba"``). Column 1 of the
    resulting probability array is treated as the positive-class score and is
    turned into hard labels with ``threshold``.

    Parameters
    ----------
    model
        Estimator passed to ``cross_val_predict``; it must support
        ``predict_proba``.
    X_train : pd.DataFrame
        Feature matrix.
    y_train : pd.Series
        Binary target.
    threshold : float, default 0.5
        A sample is labelled 1 when its column-1 probability is strictly
        greater than this value. The default reproduces the previous
        hard-coded cut-off.

    Returns
    -------
    np.ndarray
        The probability array returned by ``cross_val_predict`` (one row per
        training sample), so callers can explore other thresholds.
    """
    # Set up Cross Validation Policy
    cv_policy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Get class probabilities using Cross Validation
    y_train_proba = cross_val_predict(
        model, X_train, y_train, cv=cv_policy, method="predict_proba", verbose=0
    )

    # Convert the positive-class probability into hard predictions.
    # NOTE(review): the comparison is strict (`>`), whereas the Optuna objective
    # functions in this notebook use `>=`; the two differ only when a
    # probability equals the threshold exactly.
    y_train_pred = np.where(y_train_proba[:, 1] > threshold, 1, 0)

    # Show Basic Metrics
    print(f"Accuracy         : {accuracy_score(y_train, y_train_pred):.2%}")
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_train, y_train_pred):.2%}")
    print(f"F1 Score         : {f1_score(y_train, y_train_pred):.2%}")
    print(f"Precision        : {precision_score(y_train, y_train_pred):.2%}")
    print(f"Recall           : {recall_score(y_train, y_train_pred):.2%}")
    # Probability-based metrics do not depend on the threshold
    print(f"AP Score    : {average_precision_score(y_train, y_train_proba[:, 1]):.2%}")
    print(f"Log Loss    : {log_loss(y_train, y_train_proba):.4f}")
    print(f"Brier Score : {brier_score_loss(y_train, y_train_proba[:, 1]):.4f}")

    # Show Confusion Matrix
    display(
        pd.DataFrame(
            confusion_matrix(y_train, y_train_pred),
            columns=["Pred-Normal-Transaction", "Pred-Fraud-Transaction"],
            index=["Actual-Normal-Transaction", "Actual-Fraud-Transaction"],
        )
    )

    # Show Classification Report
    print(classification_report(y_train, y_train_pred, zero_division=0))

    # Return the cross-validated probabilities
    return y_train_proba

In [None]:
# Value triple returned by the multi-objective functions in this notebook:
# (AP score, precision, recall)
ObjectiveScores = tuple[float, float, float]

# Objective as Optuna calls it: only the trial is passed in
MultiObjectiveFunction = Callable[[optuna.trial.Trial], ObjectiveScores]

# Objective before the training data is bound with functools.partial
FullMultiObjectiveFunction = Callable[
    [optuna.trial.Trial, pd.DataFrame, pd.Series], ObjectiveScores
]

In [None]:
def full_dt_multi_objective_func(
    trial: optuna.trial.Trial, X_train: pd.DataFrame, y_train: pd.Series
) -> tuple[float, float, float]:
    """Optuna objective for a decision tree: (AP score, precision, recall).

    Hyper-parameters are sampled from ``trial``, the tree is scored with
    5-fold stratified cross-validated probabilities, and hard labels are
    obtained with a ``>= 0.5`` cut-off on the positive-class probability.
    """
    # Sample this trial's hyper-parameters (the dict literal keeps the
    # suggestion order: tree policy, pre-pruning, post-pruning)
    sampled_params = {
        # Tree policy
        "criterion": trial.suggest_categorical(
            "criterion", ["gini", "entropy", "log_loss"]
        ),
        "splitter": trial.suggest_categorical("splitter", ["random", "best"]),
        "class_weight": trial.suggest_categorical("class_weight", ["balanced", None]),
        # Pre-pruning
        "max_depth": trial.suggest_int("max_depth", 1, 500, log=True),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 100, log=True),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100, log=True),
        # Post-pruning
        "ccp_alpha": trial.suggest_float("ccp_alpha", 1e-12, 1.5, log=True),
    }

    # Build the candidate tree with a fixed seed
    candidate_tree = DecisionTreeClassifier(random_state=42, **sampled_params)

    # Out-of-fold positive-class probabilities
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1729)
    fold_probabilities = cross_val_predict(
        candidate_tree,
        X_train,
        y_train,
        cv=folds,
        n_jobs=-1,
        method="predict_proba",
    )
    positive_proba = fold_probabilities[:, 1]

    # Hard labels at the 0.5 cut-off (inclusive)
    hard_labels = (positive_proba >= 0.5).astype(int)

    # Objective values: average precision on scores, precision/recall on labels
    ap_value = average_precision_score(y_train, positive_proba)
    precision_value = precision_score(y_train, hard_labels, zero_division=0)
    recall_value = recall_score(y_train, hard_labels, zero_division=0)

    # Return the three objectives as plain floats
    return float(ap_value), float(precision_value), float(recall_value)


# An objective function for the Random Forest Classifier
def full_rf_multi_objective_func(
    trial: optuna.trial.Trial, X_train: pd.DataFrame, y_train: pd.Series
) -> tuple[float, float, float]:
    """Optuna objective for a random forest: (AP score, precision, recall).

    Hyper-parameters are sampled from ``trial``, the forest is scored with
    5-fold stratified cross-validated probabilities, and hard labels are
    obtained with a ``>= 0.5`` cut-off on the positive-class probability.
    """
    # Sample this trial's hyper-parameters (the dict literal keeps the
    # suggestion order: class weighting, pre-pruning, post-pruning)
    sampled_params = {
        # Tree policy
        "class_weight": trial.suggest_categorical(
            "class_weight", ["balanced", "balanced_subsample", None]
        ),
        # Pre-pruning
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 100, log=True),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 1000, log=True),
        # Post-pruning
        "ccp_alpha": trial.suggest_float("ccp_alpha", 1e-9, 2, log=True),
    }

    # Build the candidate forest with a fixed seed
    candidate_forest = RandomForestClassifier(
        random_state=42, n_jobs=-1, **sampled_params
    )

    # Out-of-fold positive-class probabilities
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1729)
    fold_probabilities = cross_val_predict(
        candidate_forest,
        X_train,
        y_train,
        cv=folds,
        n_jobs=-1,
        method="predict_proba",
    )
    positive_proba = fold_probabilities[:, 1]

    # Hard labels at the 0.5 cut-off (inclusive)
    hard_labels = (positive_proba >= 0.5).astype(int)

    # Objective values: average precision on scores, precision/recall on labels
    ap_value = average_precision_score(y_train, positive_proba)
    precision_value = precision_score(y_train, hard_labels, zero_division=0)
    recall_value = recall_score(y_train, hard_labels, zero_division=0)

    # Return the three objectives as plain floats
    return float(ap_value), float(precision_value), float(recall_value)

## *1. Dummy Classifier* : Setting Baselines

In [None]:
# Baseline: cross-validated report for a seeded "stratified" dummy classifier
better_classification_report(
    model=DummyClassifier(random_state=42, strategy="stratified"),
    X_train=X_train,
    y_train=y_train,
)

## *2. Decision Tree*

In [None]:
# Configuration of the baseline (untuned) decision tree
baseline_dt_settings = {
    # Tree policy
    "criterion": "gini",
    "splitter": "best",
    # Class weighting
    "class_weight": "balanced",
    # Pre-pruning: no depth limit, smallest allowed split/leaf sizes
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    # Post-pruning: disabled
    "ccp_alpha": 0.0,
    # Random seed
    "random_state": 42,
}
dt = DecisionTreeClassifier(**baseline_dt_settings)

# Cross-validated report; keep the returned probabilities for later cells
y_train_proba = better_classification_report(dt, X_train, y_train)

In [None]:
# SQLite storage URL for the Optuna studies
# NOTE(review): absolute, machine-specific path — consider making it configurable.
db_path = "sqlite:///D:/Codebase/fraud-detection/data/db/optuna-fraud-detection.db"

# Create the three-objective study, or resume it if it already exists in storage
dt_study_settings = dict(
    storage=db_path,  # use storage=None for an in-memory study
    study_name="MOO-Decision-Tree-V02",
    directions=["maximize", "maximize", "maximize"],
    load_if_exists=True,
)
study_dt = optuna.create_study(**dt_study_settings)

# Bind the training data so that Optuna only has to pass the trial
dt_multi_objective_func: MultiObjectiveFunction = partial(
    full_dt_multi_objective_func, X_train=X_train, y_train=y_train
)

# Run the optimization: one trial per execution, capped at three hours
three_hours_in_seconds = int(3 * 3600)
study_dt.optimize(
    dt_multi_objective_func,
    n_trials=1,
    timeout=three_hours_in_seconds,
    n_jobs=1,
    gc_after_trial=True,
    show_progress_bar=True,
)

In [None]:
# Understand the important parameters, one importance ranking per objective
def _objective_at(position):
    """Build a target function that reads one objective value from a trial."""
    return lambda finished_trial: finished_trial.values[position]


importances_dt_ap = get_param_importances(study_dt, target=_objective_at(0))
importances_dt_pr = get_param_importances(study_dt, target=_objective_at(1))
importances_dt_re = get_param_importances(study_dt, target=_objective_at(2))

# Trials reported by the study as best (study_dt.best_trials)
best_trials_dt = study_dt.best_trials

# One column per objective, importances scaled by 100, sorted by AP-Score
importance_by_objective = {
    "AP-Score": importances_dt_ap,
    "Precision-Score": importances_dt_pr,
    "Recall-Score": importances_dt_re,
}
importances_dt_df = pd.concat(
    [
        pd.Series(importance, name=label) * 100
        for label, importance in importance_by_objective.items()
    ],
    axis=1,
).sort_values(by="AP-Score", ascending=False)

# Show the importances table
display(importances_dt_df)

# Show the best trials as the cell output
best_trials_dt

In [None]:
# Objective values of the best trials, indexed by trial number
objective_columns = ["AP-Score", "Precision-Score", "Recall-Score"]
score_dt_df = pd.DataFrame(
    [trial.values for trial in best_trials_dt],
    columns=objective_columns,
    index=[trial.number for trial in best_trials_dt],
)

# Dense rank per objective (rank 1 = highest score)
for objective_column in objective_columns:
    score_dt_df[f"{objective_column}-Rank"] = (
        score_dt_df[objective_column].rank(ascending=False, method="dense").astype(int)
    )

# Rank of the mean of the precision and recall ranks
# (the AP-Score rank is not part of this mean)
precision_recall_mean_rank = score_dt_df[
    ["Precision-Score-Rank", "Recall-Score-Rank"]
].mean(axis=1)
score_dt_df["Mean-Rank"] = precision_recall_mean_rank.rank(
    ascending=True, method="dense"
).astype(int)

# Show the full table ordered by Mean-Rank
display(score_dt_df.sort_values(by="Mean-Rank"))

# Keep only the trials with Precision >= 70% and Recall >= 70%
meets_precision = score_dt_df["Precision-Score"] >= 0.70
meets_recall = score_dt_df["Recall-Score"] >= 0.70
score_dt_df[meets_precision & meets_recall].sort_values(by="Mean-Rank")

In [None]:
# Trial chosen by hand from the scores table above
BEST_DT_TRIAL_NUMBER = 97

# Look the trial up among the best trials; fail with a clear message instead of
# an opaque IndexError when it is not there (e.g. a fresh or different study)
best_dt_parms = next(
    (
        trial.params
        for trial in best_trials_dt
        if trial.number == BEST_DT_TRIAL_NUMBER
    ),
    None,
)
if best_dt_parms is None:
    raise ValueError(
        f"Trial {BEST_DT_TRIAL_NUMBER} is not among the best trials; "
        f"available trial numbers: {[trial.number for trial in best_trials_dt]}"
    )
pprint(best_dt_parms)

# Configure the Decision Tree with the selected parameters
best_dt = DecisionTreeClassifier(random_state=42)
best_dt = best_dt.set_params(**best_dt_parms)

# Get the classification report for best decision tree
better_classification_report(best_dt, X_train, y_train)

In [None]:
# Fit the selected Decision Tree on the full training split
best_dt.fit(X_train, y_train)

# Draw the top of the fitted tree (first 5 levels), labelled via target_mapping
fig, ax = plt.subplots(figsize=(20, 10))
tree_plot_options = dict(
    filled=True,
    class_names=[target_mapping[label] for label in (0, 1)],
    feature_names=best_dt.feature_names_in_,
    max_depth=5,
    fontsize=8,
    ax=ax,
)
plot_tree(best_dt, **tree_plot_options)
plt.show()

In [None]:
# Fit the model on the training split
best_dt.fit(X_train, y_train)

# Labels used in the drawing
feature_labels = X_train.columns.tolist()
class_labels = ["Non-Fraud", "Fraud"]

# Create the matplotlib figure
fig, ax = plt.subplots(figsize=(20, 10))

# Plot the first 4 levels of the tree with rounded, colour-filled nodes
plot_tree(
    best_dt,
    ax=ax,
    max_depth=4,
    fontsize=8,
    filled=True,
    rounded=True,
    feature_names=feature_labels,
    class_names=class_labels,
)

# Show plot
plt.show()

## *3. Random Forest*

### Simple RF Fit

In [None]:
# Untuned Random Forest: 100 trees, fixed seed, all cores
simple_rf_settings = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
rf = RandomForestClassifier(**simple_rf_settings)

# Cross-validated report; keep the returned probabilities for the threshold analysis
y_train_proba = better_classification_report(model=rf, X_train=X_train, y_train=y_train)

In [None]:
# Confusion matrix of the cross-validated predictions at a custom decision
# threshold (inclusive) instead of the default 0.5
decision_threshold = 0.52
y_train_pred_at_threshold = np.where(
    y_train_proba[:, 1] >= decision_threshold, 1, 0
)

confusion_matrix(y_train, y_train_pred_at_threshold)

In [None]:
# Get Precision-Recall for each threshold
precision, recall, thresholds = precision_recall_curve(y_train, y_train_proba[:, 1])

# precision/recall carry one more entry than thresholds; drop the final point,
# which has no threshold, so that all arrays line up
precision_at_threshold = precision[:-1]
recall_at_threshold = recall[:-1]

# Compute F1 for each threshold. Where precision + recall == 0 the F1 is set to
# 0 instead of NaN, because np.argmax would otherwise pick the NaN entry.
precision_plus_recall = precision_at_threshold + recall_at_threshold
f1 = np.divide(
    2 * precision_at_threshold * recall_at_threshold,
    precision_plus_recall,
    out=np.zeros_like(precision_plus_recall, dtype=float),
    where=precision_plus_recall > 0,
)

# Select the threshold with the highest F1 (f1 and thresholds have equal length)
best_thresh = thresholds[np.argmax(f1)]
# Print the best threshold
print(f"Best threshold for F1 = {best_thresh:.3f}")

# Precision Recall DataFrame (one row per threshold)
pr_df = pd.DataFrame(
    {
        "Precision": precision_at_threshold,
        "Recall": recall_at_threshold,
        "F1-Score": f1,
        "Threshold": thresholds,
    }
).sort_values(["Recall", "Precision"], ascending=[True, False])

# Lets plot the Precision-Recall Curve
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=pr_df["Recall"],
        y=pr_df["Precision"],
        mode="lines+markers",
        name="Precision-Recall Curve",
        line=dict(color="blue", width=2),
        # Extra columns made available to the hover template
        customdata=pr_df[["F1-Score", "Threshold"]].values,
        # Add hover info
        hovertemplate=(
            "Recall: %{x:.2%}<br>"
            "Precision: %{y:.2%}<br>"
            "F1: %{customdata[0]:.2%}<br>"
            "Threshold: %{customdata[1]:.2%}"
            "<extra></extra>"
        ),
    )
)

# Add one Point for Max F1 Score
max_f1_idx = pr_df["F1-Score"].idxmax()
max_f1_row = pr_df.loc[max_f1_idx]
fig.add_trace(
    go.Scatter(
        x=[max_f1_row["Recall"]],
        y=[max_f1_row["Precision"]],
        mode="markers+text",
        name="Max F1 Score Point",
        marker=dict(color="red", size=10, symbol="circle"),
        text=[f"Max F1: {max_f1_row['F1-Score']:.2%}"],
        textposition="top center",
        # F1 is read from customdata; using %{text} here would render
        # "F1: Max F1: ..." in the hover box
        hovertemplate=(
            "Recall: %{x:.2%}<br>"
            "Precision: %{y:.2%}<br>"
            "F1: %{customdata[0]:.2%}<br>"
            "Threshold: %{customdata[1]:.4%}"
            "<extra></extra>"
        ),
        customdata=np.array([[max_f1_row["F1-Score"], max_f1_row["Threshold"]]]),
    )
)

# Update Layout (the returned figure is the cell output)
fig.update_layout(
    title=dict(text="<b>Precision-Recall Curve</b>", x=0.5, font=dict(size=20)),
    xaxis_title="<b>Recall</b>",
    yaxis_title="<b>Precision</b>",
    template="plotly_white",
)

In [None]:
# Fit the untuned Random Forest on the full training split
rf.fit(X=X_train, y=y_train)

In [None]:
# Feature importances of the fitted forest, scaled by 100, largest first
rf_importance_pct = pd.Series(
    rf.feature_importances_ * 100, index=rf.feature_names_in_
).sort_values(ascending=False)

# Individual importances
display(rf_importance_pct)

# Running total of the sorted importances
display(rf_importance_pct.cumsum())

### Optimized RF Fit

In [None]:
# Set up a single-objective study for Random Forest
study_rf = optuna.create_study(
    # storage=db_path,
    storage=None,
    study_name="Random-Forest-V04",
    direction="maximize",
    load_if_exists=True,
)


def rf_objective(trial: optuna.trial.Trial) -> float:
    """Single-objective wrapper around ``full_rf_multi_objective_func``.

    ``study_rf`` has exactly one direction, so the objective must return one
    value. Returning the (AP score, precision, recall) triple would make every
    trial fail and leave the study without a best trial. The AP score is the
    optimized value; precision and recall are kept as user attributes on the
    trial for later inspection.
    """
    ap_score, precision, recall = full_rf_multi_objective_func(
        trial, X_train=X_train, y_train=y_train
    )
    trial.set_user_attr("precision", precision)
    trial.set_user_attr("recall", recall)
    return ap_score


# Start Optimization
study_rf.optimize(
    rf_objective,
    n_trials=1000,
    timeout=int(5 * 3600),
    n_jobs=1,
    gc_after_trial=True,
    show_progress_bar=True,
)

In [None]:
# Rebuild the Random Forest with the study's best parameters and fit it
rf = RandomForestClassifier(random_state=42, n_jobs=-1).set_params(
    **study_rf.best_params
)
rf.fit(X=X_train, y=y_train)

In [None]:
# Depth of every tree in the fitted forest (computed once and reused)
tree_depths = [tree.get_depth() for tree in rf.estimators_]

# Create Figure Object
fig = go.Figure()

# Draw the count of trees by depth; the maximum depth is passed as nbinsx
fig.add_trace(
    go.Histogram(
        x=tree_depths,
        nbinsx=max(tree_depths),
        name="Count of Trees by Depth",
    )
)

In [None]:
# Feature importances of the current forest, scaled by 100, largest first
tuned_rf_importance_pct = pd.Series(
    rf.feature_importances_ * 100, index=rf.feature_names_in_
)
tuned_rf_importance_pct.sort_values(ascending=False)

In [None]:
# Get the hyper-parameter importances of the Random Forest study
param_importance = get_param_importances(study_rf)
importance_share = pd.Series(param_importance)

# Show the importances scaled by 100
display(importance_share * 100)
# Show the cumulative importance scaled by 100
display(importance_share.cumsum() * 100)

# Plot the cumulative importance against the parameter names
cumulative_trace = go.Scatter(
    x=importance_share.cumsum().index,
    y=(importance_share * 100).cumsum().values,
    name="Cumulative Importance",
)
fig = go.Figure()
fig.add_trace(cumulative_trace)

In [None]:
# Cumulative parameter importance, scaled by 100
pd.Series(param_importance).mul(100).cumsum()

In [None]:
# Half the difference between the next and the previous cumulative importance
# (the first and last rows have no neighbour on one side and become NaN)
cumulative_importance_pct = pd.Series(param_importance).mul(100).cumsum()
(cumulative_importance_pct.shift(-1) - cumulative_importance_pct.shift(1)) / 2

In [None]:
# Cross-validated classification report for the current Random Forest
better_classification_report(model=rf, X_train=X_train, y_train=y_train)