# **Fraud Detection**

In [None]:
# Standard Imports
from typing import Callable
from functools import partial

# Third Party Imports
import optuna
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.base import clone
from sklearn.model_selection import (
    train_test_split,
    cross_val_predict,
    cross_val_score,
    RepeatedStratifiedKFold,
    StratifiedKFold,
)
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    log_loss,
    brier_score_loss,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
    balanced_accuracy_score,
)
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from optuna.importance import get_param_importances
from xgboost.sklearn import XGBClassifier

## *Read data*

In [None]:
# Load the credit-card transactions CSV into a polars DataFrame.
# Parse errors are not ignored, and up to 1,000,000 rows are scanned to infer dtypes.
df = pl.read_csv(
    source=r"D:\Codebase\fraud-detection\data\input\creditcard.csv",
    infer_schema_length=1000_000,
    ignore_errors=False,
)

# Render the frame as the cell output
df

## *Data Exploration*

In [None]:
# Compute the Pearson correlation matrix once; it supplies both the axis labels
# and the cell values, so there is no need to convert and correlate three times.
corr_matrix = df.to_pandas().corr(method="pearson")

# Create Figure Object
fig = go.Figure()

# Add the heatmap trace (correlations expressed in percent)
fig.add_trace(
    go.Heatmap(
        x=corr_matrix.index,
        y=corr_matrix.columns,
        z=corr_matrix.values * 100,
    ),
)

# Layout settings
fig.update_layout(
    title=dict(text="<b>Correlation Heatmap</b>", x=0.5, font=dict(size=24)),
    xaxis_nticks=36,
    yaxis_nticks=36,
    height=800,
    width=800,
)

# Show plot
fig.show()

In [None]:
# One subplot row per column, each titled after the column it shows
columns = df.columns
subplot_titles = [f"<B>{col} Distribution</B>" for col in columns]
fig = make_subplots(rows=len(columns), cols=1, subplot_titles=subplot_titles)

# Draw a histogram (nbinsx=100) of every column in its own row
for row_number, column_name in enumerate(columns, start=1):
    histogram = go.Histogram(
        x=df[column_name].to_pandas(),
        name=f"{column_name} Histogram",
        nbinsx=100,
    )
    fig.add_trace(histogram, row=row_number, col=1)

# Scale the figure height with the number of rows and hide the per-trace legend
fig.update_layout(
    title=dict(text="<b>Feature Distributions</b>", x=0.5, font=dict(size=24)),
    height=300 * len(columns),
    width=800,
    showlegend=False,
)

# Show
fig.show()

# **Data Engineering**

## *Data Split*

In [None]:
# Features are every column except "Class"; the target is the "Class" column itself
x = df.drop("Class").to_pandas()
y = df.get_column("Class").to_pandas()

# Declare the types of the four splits for editors and type checkers
X_train: pd.DataFrame
X_test: pd.DataFrame
y_train: pd.Series
y_test: pd.Series

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Absolute number of training rows per class
display(y_train.value_counts(normalize=False))
# Share of training rows per class, in percent
display(y_train.value_counts(normalize=True).round(4).mul(100))

# **Modelling**

In [None]:
# Define the Objective Function type
ObjectiveFunction = Callable[[optuna.trial.Trial], float]
FullObjectiveFunction = Callable[[optuna.trial.Trial, pd.DataFrame, pd.Series], float]

In [None]:
def better_classification_report(
    model,
    X_train: pd.DataFrame,
    y_train: pd.Series,
) -> None:
    """Print and display cross-validated classification metrics for ``model``.

    Out-of-fold predictions and probabilities are produced with a seeded,
    shuffled 5-fold stratified split. Each fold fits ONE clone of ``model`` and
    takes both ``predict`` and ``predict_proba`` from it, so the estimator is
    trained five times rather than ten, and the labels and probabilities always
    come from the same fitted model.

    Parameters
    ----------
    model
        Unfitted estimator exposing ``fit``, ``predict``, ``predict_proba`` and
        ``classes_``. It is cloned per fold and never fitted in place.
    X_train : pd.DataFrame
        Feature matrix; rows are selected positionally.
    y_train : pd.Series
        Target labels. The probability-based metrics read column 1 of the
        probability matrix, i.e. they assume a binary target.

    Returns
    -------
    None
        Results are printed / displayed, not returned.
    """
    # Set up Cross Validation Policy
    cv_policy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Pre-allocate the out-of-fold containers; probability columns follow the
    # sorted unique labels of the full target.
    classes = np.unique(y_train)
    n_samples = len(y_train)
    y_train_pred = np.empty(n_samples, dtype=classes.dtype)
    y_train_proba = np.zeros((n_samples, len(classes)), dtype=float)

    # Fit once per fold and collect both kinds of prediction from that fit
    for fit_idx, holdout_idx in cv_policy.split(X_train, y_train):
        fold_model = clone(model)
        fold_model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])

        X_holdout = X_train.iloc[holdout_idx]
        y_train_pred[holdout_idx] = fold_model.predict(X_holdout)

        # Map the fold's class columns onto the global class order so a class
        # missing from a training fold keeps probability 0 instead of being
        # silently broadcast into the wrong column.
        class_cols = np.searchsorted(classes, fold_model.classes_)
        y_train_proba[np.ix_(holdout_idx, class_cols)] = fold_model.predict_proba(
            X_holdout
        )

    # Show Basic Metrics
    print(f"Accuracy         : {accuracy_score(y_train, y_train_pred):.2%}")
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_train, y_train_pred):.2%}")
    print(f"F1 Score         : {f1_score(y_train, y_train_pred):.2%}")
    print(f"Precision        : {precision_score(y_train, y_train_pred):.2%}")
    print(f"Recall           : {recall_score(y_train, y_train_pred):.2%}")
    print(f"AP Score    : {average_precision_score(y_train, y_train_proba[:, 1]):.2%}")
    print(f"Log Loss    : {log_loss(y_train, y_train_proba):.4f}")
    print(f"Brier Score : {brier_score_loss(y_train, y_train_proba[:, 1]):.4f}")

    # Show Confusion Matrix (rows are actual classes, columns are predicted classes)
    display(
        pd.DataFrame(
            confusion_matrix(y_train, y_train_pred),
            columns=["Pred-Normal-Transaction", "Pred-Fraud-Transaction"],
            index=["Actual-Normal-Transaction", "Actual-Fraud-Transaction"],
        )
    )

    # Show Classification Report; undefined ratios are reported as NaN
    print(classification_report(y_train, y_train_pred, zero_division=np.nan))

    # Return nothing
    return None

In [None]:
def full_dt_objective(
    trial: optuna.trial.Trial, X_train: pd.DataFrame, y_train: pd.Series
) -> float:
    """Optuna objective for a decision tree: mean cross-validated average precision.

    Samples one set of tree hyper-parameters from ``trial``, evaluates a seeded
    ``DecisionTreeClassifier`` with 4-fold stratified CV repeated twice, and
    returns the NaN-ignoring mean of the fold scores.
    """
    # Tree policy
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
    splitter = trial.suggest_categorical("splitter", ["random", "best"])
    class_weight = trial.suggest_categorical("class_weight", ["balanced", None])
    # Pre-pruning
    max_depth = trial.suggest_int("max_depth", 1, 100)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
    # Post-pruning
    ccp_alpha = trial.suggest_float("ccp_alpha", 0.0, 4, step=0.0001)

    # Build the tree for this trial with a fixed seed
    candidate_tree = DecisionTreeClassifier(
        random_state=42,
        criterion=criterion,
        splitter=splitter,
        class_weight=class_weight,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        ccp_alpha=ccp_alpha,
    )

    # Score every fold in parallel with average precision
    folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=2, random_state=42)
    fold_scores = cross_val_score(
        candidate_tree,
        X_train,
        y_train,
        scoring="average_precision",
        cv=folds,
        n_jobs=-1,
    )

    # Return the mean average-precision score, ignoring NaN folds
    return float(np.nanmean(fold_scores))

## *Dummy Classifier*

In [None]:
# Baseline using the "uniform" dummy strategy, seeded for repeatability
dummy_clf = DummyClassifier(random_state=42, strategy="uniform")

# Cross-validated report for this baseline
better_classification_report(model=dummy_clf, X_train=X_train, y_train=y_train)

In [None]:
# Baseline using the "most_frequent" dummy strategy
dummy_clf = DummyClassifier(random_state=42, strategy="most_frequent")

# Cross-validated report for this baseline
better_classification_report(model=dummy_clf, X_train=X_train, y_train=y_train)

In [None]:
# Baseline using the "stratified" dummy strategy, seeded for repeatability
dummy_clf = DummyClassifier(random_state=42, strategy="stratified")

# Cross-validated report for this baseline
better_classification_report(model=dummy_clf, X_train=X_train, y_train=y_train)

## *Decision Tree*

In [None]:
# Hyper-parameters of the untuned reference tree, spelled out explicitly
baseline_dt_params = dict(
    # Split policy
    criterion="gini",
    splitter="best",
    # Class weighting
    class_weight="balanced",
    # Pre-pruning: no depth limit, smallest allowed split / leaf sizes
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    # Post-pruning disabled
    ccp_alpha=0.0,
    # Random seed
    random_state=42,
)
dt = DecisionTreeClassifier(**baseline_dt_params)

# Cross-validated report for the reference tree
better_classification_report(model=dt, X_train=X_train, y_train=y_train)

In [None]:
# SQLite storage URL shared by every Optuna study in this notebook
db_path = "sqlite:///D:/Codebase/fraud-detection/data/db/optuna-fraud-detection.db"

# Open the decision-tree study, creating it if the storage does not hold it yet
study_dt = optuna.create_study(
    study_name="Decision-Tree-V01",
    storage=db_path,
    load_if_exists=True,
    direction="maximize",
)

# Bind the training data so Optuna only has to supply the trial
dt_objective: ObjectiveFunction = partial(
    full_dt_objective,
    X_train=X_train,
    y_train=y_train,
)

# Run a single trial, capped at three hours of wall-clock time
study_dt.optimize(
    func=dt_objective,
    n_trials=1,
    timeout=3 * 3600,
    n_jobs=1,
    gc_after_trial=True,
    show_progress_bar=True,
)

In [None]:
# Rebuild a seeded tree from the best hyper-parameters recorded in the study
best_dt = DecisionTreeClassifier(random_state=42).set_params(**study_dt.best_params)

# Cross-validated report for the tuned tree
better_classification_report(model=best_dt, X_train=X_train, y_train=y_train)

In [None]:
# Train the tuned tree on the full training split
best_dt.fit(X=X_train, y=y_train)

# Wide canvas so the upper levels of the tree stay readable
fig, ax = plt.subplots(figsize=(20, 10))

# Draw only the first levels of the tree (max_depth=4), colouring the nodes
plot_tree(
    best_dt,
    max_depth=4,
    feature_names=list(X_train.columns),
    class_names=["Non-Fraud", "Fraud"],
    filled=True,
    rounded=True,
    fontsize=8,
    ax=ax,
)

# Render the figure
plt.show()

## *Random Forest*

### Simple RF Fit

In [None]:
# Untuned forest of 100 trees, seeded, trained on all available cores
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# Cross-validated report for the untuned forest
better_classification_report(model=rf, X_train=X_train, y_train=y_train)

In [None]:
# Fit the untuned forest on the full training split
rf.fit(X=X_train, y=y_train)

In [None]:
# Feature importances in percent, largest first
rf_importance_ranked = pd.Series(
    data=rf.feature_importances_ * 100,
    index=rf.feature_names_in_,
).sort_values(ascending=False)

# Per-feature importance
display(rf_importance_ranked)

# Running total of importance down the ranking
display(rf_importance_ranked.cumsum())

### Optimized RF Fit

In [None]:
# Optuna objective function for the Random Forest classifier
def full_rf_objective(
    trial: optuna.trial.Trial, X_train: pd.DataFrame, y_train: pd.Series
) -> float:
    """Optuna objective for a random forest: mean cross-validated average precision.

    Samples one set of forest hyper-parameters from ``trial``, evaluates a seeded
    ``RandomForestClassifier`` with shuffled 5-fold stratified CV, and returns
    the NaN-ignoring mean of the fold scores.
    """
    # Class weighting
    class_weight = trial.suggest_categorical(
        "class_weight", ["balanced", "balanced_subsample", None]
    )
    # Pre-pruning (all sampled on a log scale)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 100, log=True)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 100, log=True)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 10, 1000, log=True)
    # Post-pruning
    ccp_alpha = trial.suggest_float("ccp_alpha", 1e-9, 2, log=True)

    # Build the forest for this trial with a fixed seed
    candidate_forest = RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight=class_weight,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        ccp_alpha=ccp_alpha,
    )

    # Score every fold with average precision
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        candidate_forest,
        X_train,
        y_train,
        scoring="average_precision",
        cv=folds,
        n_jobs=-1,
    )

    # Return the mean average-precision score, ignoring NaN folds
    return float(np.nanmean(fold_scores))

In [None]:
# Open the random-forest study, creating it if the storage does not hold it yet
study_rf = optuna.create_study(
    study_name="Random-Forest-V04",
    storage=db_path,
    load_if_exists=True,
    direction="maximize",
)

# Bind the training data so Optuna only has to supply the trial
rf_objective: ObjectiveFunction = partial(
    full_rf_objective,
    X_train=X_train,
    y_train=y_train,
)

# Run up to 1000 trials, capped at five hours of wall-clock time
study_rf.optimize(
    func=rf_objective,
    n_trials=1000,
    timeout=5 * 3600,
    n_jobs=1,
    gc_after_trial=True,
    show_progress_bar=True,
)

In [None]:
# Rebuild a seeded forest from the study's best hyper-parameters and fit it
rf = RandomForestClassifier(random_state=42, n_jobs=-1, **study_rf.best_params)
rf.fit(X=X_train, y=y_train)

In [None]:
# Depth of every fitted tree in the forest, computed once and reused below.
# The original cell began with the same list as a bare expression, which was
# never displayed because it was not the last statement, and then rebuilt the
# list twice more inside the trace.
tree_depths = [tree.get_depth() for tree in rf.estimators_]

# Create Figure Object
fig = go.Figure()

# Draw the count of trees by depth; nbinsx is set to the deepest tree's depth
fig.add_trace(
    go.Histogram(
        x=tree_depths,
        nbinsx=max(tree_depths),
        name="Count of Trees by Depth",
    )
)

In [None]:
# Feature importances of the tuned forest in percent, largest first
tuned_rf_importance_pct = pd.Series(
    data=rf.feature_importances_ * 100,
    index=rf.feature_names_in_,
)
tuned_rf_importance_pct.sort_values(ascending=False)

In [None]:
# Hyper-parameter importances estimated from the random-forest study
param_importance = get_param_importances(study_rf)
importance_series = pd.Series(param_importance)

# Importance of each hyper-parameter, in percent
display(importance_series * 100)
# Running total of importance, in percent
display(importance_series.cumsum() * 100)

# Line plot of the cumulative importance across the hyper-parameters
fig = go.Figure()
cumulative_trace = go.Scatter(
    x=importance_series.index,
    y=(importance_series * 100).cumsum().values,
    name="Cumulative Importance",
)
fig.add_trace(cumulative_trace)

In [None]:
# Cumulative hyper-parameter importance, in percent
pd.Series(param_importance).mul(100).cumsum()

In [None]:
# Half the gap between the next and the previous cumulative-importance value
# for each hyper-parameter; the first and last entries have no neighbour and are NaN.
cumulative_pct = (pd.Series(param_importance) * 100).cumsum()
(cumulative_pct.shift(-1) - cumulative_pct.shift(1)) / 2

In [None]:
# Cross-validated report for the tuned random forest
better_classification_report(model=rf, X_train=X_train, y_train=y_train)

In [None]:
# Untuned XGBoost baseline. The seed is pinned like every other model in this
# notebook so the baseline is set up the same way as the tuned XGBoost runs.
xgb = XGBClassifier(random_state=42, n_jobs=-1)
# Cross-validated report for the untuned model
better_classification_report(xgb, X_train, y_train)

In [None]:
# Objective function for XGBoost Classifier
def full_xgb_objective(
    trial: optuna.trial.Trial, X_train: pd.DataFrame, y_train: pd.Series
) -> float:
    """Optuna objective for XGBoost: mean cross-validated average precision.

    Samples one set of booster hyper-parameters from ``trial``, evaluates a
    seeded ``XGBClassifier`` with shuffled 5-fold stratified CV, and returns the
    NaN-ignoring mean of the fold scores.
    """
    # Parameter Dict for this trial
    params_dict = dict(
        ## Booster parameters
        learning_rate=trial.suggest_float("learning_rate", 1e-6, 1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 50, 1000, log=True),
        # PrePruning
        max_depth=trial.suggest_int("max_depth", 3, 20),
        max_leaves=trial.suggest_int("max_leaves", 0, 1000),
        # Regularization
        gamma=trial.suggest_float("gamma", 0.0, 5.0),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 5.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 5.0),
    )

    # Build the XGBoost classifier for this trial. `use_label_encoder` is no
    # longer passed: it is a deprecated/removed option that current XGBoost
    # releases ignore with a warning on every fit. Model-level n_jobs stays None
    # because the folds are already evaluated in parallel below.
    xgb = XGBClassifier(random_state=42, n_jobs=None)
    xgb = xgb.set_params(**params_dict)

    # Set Cross Validation Policy & Cross Val Score
    cv_policy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = np.nanmean(
        cross_val_score(
            xgb, X_train, y_train, cv=cv_policy, scoring="average_precision", n_jobs=-1
        )
    )

    # Return the mean average-precision score, ignoring NaN folds
    return float(score)

In [None]:
# Open the XGBoost study, creating it if the storage does not hold it yet
study_xgb = optuna.create_study(
    study_name="XGBoost-V01",
    storage=db_path,
    load_if_exists=True,
    direction="maximize",
)

# Bind the training data so Optuna only has to supply the trial
xgb_objective: ObjectiveFunction = partial(
    full_xgb_objective,
    X_train=X_train,
    y_train=y_train,
)

# Run up to 1000 trials, capped at five hours of wall-clock time
study_xgb.optimize(
    func=xgb_objective,
    n_trials=1000,
    timeout=5 * 3600,
    n_jobs=1,
    gc_after_trial=True,
    show_progress_bar=True,
)

In [None]:
# Build a seeded XGBoost classifier from the study's best hyper-parameters.
# `use_label_encoder` is no longer passed: it is a deprecated/removed option
# that current XGBoost releases ignore with a warning.
xgb = XGBClassifier(random_state=42, n_jobs=-1)
xgb = xgb.set_params(**study_xgb.best_params)

# Cross-validated report for the tuned model
better_classification_report(xgb, X_train, y_train)

In [None]:
# Show the hyper-parameters of the best trial in the XGBoost study
study_xgb.best_params

In [None]:
# Fit on the full training split, then predict labels and probabilities for the test split
xgb.fit(X=X_train, y=y_train)
y_test_pred = xgb.predict(X_test)
y_test_proba = xgb.predict_proba(X_test)

# Average precision on the held-out test set, using the second probability column
test_ap_score = average_precision_score(y_test, y_test_proba[:, 1])
print(f"Test AP Score: {test_ap_score:.2%}")