# **Fraud Detection**

In [None]:
# Standard Imports
import sys

# Add the sibling `src` directory to the import path; presumably this is where
# the `fraud_detection` package imported in the next cell lives.
sys.path.append("../src")

In [None]:
# Third Party Imports
import numpy as np
import polars as pl
import plotly.graph_objects as go
from sklearn.metrics import (
    average_precision_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_auc_score,
)

# Internal Imports
from fraud_detection.metrics import precision_at_K, recall_at_K, lift_at_K
from fraud_detection.utils.plotting import (
    plot_anomaly_score,
    plot_precision_recall_curve,
    plot_roc_curve,
)

## *Read data*

In [None]:
# Read the credit-card transactions CSV into a polars DataFrame
df = pl.read_csv(
    r"../data/input/creditcard.csv",
    ignore_errors=False,  # do not silently continue past parse errors
    infer_schema_length=1000_000,  # use up to 1,000,000 rows for dtype inference
)

# Display data (the bare last expression is rendered by the notebook)
df

# **Feature/Target & Train/Test/Validation Split**

In [None]:
# Features: every column except the timestamp (`Time`) and the label (`Class`)
X = df.drop("Time", "Class")
# Target: the `Class` column as a polars Series
y = df.get_column("Class")

In [None]:
# Time based split: first two quarters -> train, third quarter -> validation,
# remainder -> test.
# NOTE(review): this relies on the rows already being in chronological order — confirm.
quarter_X, quarter_y = X.height // 4, y.len() // 4

## Train Data set
X_train, y_train = X[: 2 * quarter_X], y[: 2 * quarter_y]

## Validation Data set
X_val, y_val = X[2 * quarter_X : 3 * quarter_X], y[2 * quarter_y : 3 * quarter_y]

## Test Data set
X_test, y_test = X[3 * quarter_X :], y[3 * quarter_y :]

In [None]:
def class_distribution(labels: pl.Series) -> pl.DataFrame:
    """Return the per-class row count and percentage share of a label Series.

    Args:
        labels: polars Series named "Class" holding the class labels.

    Returns:
        DataFrame sorted by "Class" with the columns produced by
        `value_counts()` plus a "percentage" column rounded to 4 decimals.
    """
    return (
        labels.value_counts()
        .sort(by="Class")
        .with_columns(
            (pl.col("count") * 100 / pl.col("count").sum())
            .round(4)
            .alias("percentage")
        )
    )


# Display class distribution in the training, validation and test sets (in that order)
for split_labels in (y_train, y_val, y_test):
    display(class_distribution(split_labels))

# **Outlier Detection**

## *Approach A — Isolation-Based Anomaly Detection*

In [None]:
# Keep only the class-0 rows of the training split; the outlier detectors and
# the autoencoder in the cells below are fitted on this subset.
X_train_class_0 = X_train.filter(y_train == 0)

In [None]:
def compute_ranking_metrics_at_k(
    y_true: np.ndarray, y_pred_proba: np.ndarray, K_list: list[int]
) -> pl.DataFrame:
    """Compute Precision@K, Recall@K and Lift@K for several cut-offs K.

    Args:
        y_true: 1D numpy array of ground-truth labels.
        y_pred_proba: 1D numpy array of scores, same length as `y_true`.
        K_list: Non-empty list of integer cut-offs, each >= 1.

    Returns:
        A polars DataFrame with one row per entry of `K_list` and the columns
        "K", "Precision@K", "Recall@K" and "Lift@K".

    Raises:
        ValueError: If an input is not a 1D numpy array, the two arrays differ
            in length, `K_list` is empty, or a K is not an integer >= 1.
    """
    arrays = (("y_true", y_true), ("y_pred_proba", y_pred_proba))
    # Has to be numpy array
    for name, array in arrays:
        if not isinstance(array, np.ndarray):
            raise ValueError(f"{name} must be a numpy array")
    # Has to be one-dimensional
    for name, array in arrays:
        if array.ndim != 1:
            raise ValueError(f"{name} must be a 1D array")
    # Labels and scores must describe the same rows
    if y_true.shape[0] != y_pred_proba.shape[0]:
        raise ValueError("y_true and y_pred_proba must have the same length")

    # Constraints on K. The emptiness and type checks run before min() so that
    # an empty list or a non-numeric K yields a clear ValueError instead of an
    # opaque "min() arg is an empty sequence" / TypeError.
    if len(K_list) == 0:
        raise ValueError("K_list must not be empty")
    if not all(isinstance(k, int) for k in K_list):
        raise ValueError("K must be an integer")
    if min(K_list) < 1:
        raise ValueError("K must be greater than 0")

    # Calculate metrics
    metrics = {
        "K": K_list,
        "Precision@K": [precision_at_K(y_true, y_pred_proba, K) for K in K_list],
        "Recall@K": [recall_at_K(y_true, y_pred_proba, K) for K in K_list],
        "Lift@K": [lift_at_K(y_true, y_pred_proba, K) for K in K_list],
    }
    return pl.DataFrame(metrics)

In [None]:
# Third Party Imports
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline


# Create the Isolation Forest pipeline: robust scaling followed by the forest
if_pipeline = Pipeline(
    [
        ("scaler", RobustScaler()),
        ("iso_forest", IsolationForest(n_estimators=1_000, random_state=42, n_jobs=5)),
    ]
)
# Train the Isolation Forest on the non-fraudulent (class 0) training rows only
if_pipeline.fit(X_train_class_0)
# Score the validation set. The decision function is negated so that larger
# values mean "more anomalous" (sklearn's decision_function is lower for outliers).
anomaly_scores = -if_pipeline.decision_function(X_val)

# Plot the Anomaly Scores
plot_anomaly_score(anomaly_scores).show()

# Plot the PR Curve; the helper returns the figure and a threshold
_, if_threshold = plot_precision_recall_curve(y_val.to_numpy(), anomaly_scores)
_.show()

# Plot the ROC curve
plot_roc_curve(y_val.to_numpy(), anomaly_scores).show()

# Report average precision, ROC AUC and the threshold returned by the PR-curve helper
print(f"Average Precision: {average_precision_score(y_val, anomaly_scores):.2%}")
print(f"ROC AUC Score: {roc_auc_score(y_true=y_val, y_score=anomaly_scores):.2%}")
print(f"Isolation Forest Threshold: {if_threshold}")

# Display the confusion matrix for "score above threshold => predicted anomaly"
_ = ConfusionMatrixDisplay(
    confusion_matrix(y_true=y_val, y_pred=anomaly_scores > if_threshold)
).plot()

# Ranking metrics at several cut-offs K
display(
    compute_ranking_metrics_at_k(
        y_val.to_numpy(), anomaly_scores, [10, 15, 20, 25, 30, 40, 50, 100, 150, 200]
    )
)

- For a threshold > 0.16, the Isolation Forest has a precision of 30% and a recall of 30%.
- This is much more deliberate than random choice.
- This will help us reduce the false positives.

In [None]:
# Third Party Imports
from sklearn.neighbors import LocalOutlierFactor


# Create the Local Outlier Factor pipeline: robust scaling followed by LOF in
# novelty mode (novelty=True is what allows scoring unseen data).
lof_pipeline = Pipeline(
    [
        ("scaler", RobustScaler()),
        ("lof", LocalOutlierFactor(n_neighbors=200, novelty=True)),
    ]
)
# Train the LOF on the non-fraudulent (class 0) training rows only
lof_pipeline.fit(X_train_class_0)
# Score the validation set; negated so that larger values mean "more anomalous"
anomaly_scores = -lof_pipeline.decision_function(X_val)

# Plot the Anomaly Scores
plot_anomaly_score(anomaly_scores).show()

# Plot the PR Curve; the helper returns the figure and a threshold
_, lof_threshold = plot_precision_recall_curve(y_val.to_numpy(), anomaly_scores)
_.show()

# Plot the ROC curve
plot_roc_curve(y_val.to_numpy(), anomaly_scores).show()

# Report average precision, ROC AUC and the LOF threshold
print(f"Average Precision: {average_precision_score(y_val, anomaly_scores):.2%}")
print(f"ROC AUC Score: {roc_auc_score(y_true=y_val, y_score=anomaly_scores):.2%}")
print(f"LOF Threshold: {lof_threshold}")

# Display the confusion matrix using the LOF threshold. The Isolation Forest
# threshold lives on a different score scale and must not be used here.
_ = ConfusionMatrixDisplay(
    confusion_matrix(y_true=y_val, y_pred=anomaly_scores > lof_threshold)
).plot()

# Ranking metrics at several cut-offs K
display(
    compute_ranking_metrics_at_k(
        y_val.to_numpy(), anomaly_scores, [10, 15, 20, 25, 30, 40, 50, 100, 150, 200]
    )
)

# **Autoencoder with ANN**

In [None]:
# Third Party improts
from omegaconf import OmegaConf

# Internal Imports
from fraud_detection.config.unsupervised import AutoEncoderConfig

In [None]:
# Load the autoencoder YAML with OmegaConf and unpack its top-level keys as
# keyword arguments into the AutoEncoderConfig model
cfg = AutoEncoderConfig(**OmegaConf.load("../config/autoencoder/autoencoder.yaml"))

# Display the parsed configuration as a plain dict
cfg.model_dump()

In [None]:
# Standard Imports
from typing import NamedTuple


# Third Party Imports
import jax
import optax
import jax.numpy as jnp


class Params(NamedTuple):
    """Weights and biases of the four dense layers of the autoencoder.

    The shapes noted below are the ones produced by `init_params`.
    """
    # Layer 1 (input -> hidden1)
    w1: jax.Array  # (input_size, hidden1_size)
    b1: jax.Array  # (hidden1_size,)
    # Layer 2 (hidden1 -> bottleneck)
    w2: jax.Array  # (hidden1_size, bottleneck_size)
    b2: jax.Array  # (bottleneck_size,)
    # Layer 3 (bottleneck -> hidden2)
    w3: jax.Array  # (bottleneck_size, hidden2_size)
    b3: jax.Array  # (hidden2_size,)
    # Layer 4 (hidden2 -> output)
    w4: jax.Array  # (hidden2_size, output_size)
    b4: jax.Array  # (output_size,)

We are going for a simple symmetric architecture for the autoencoder: Input -> Hidden1 -> Bottleneck -> Hidden2 -> Output

In [None]:
def init_params(cfg) -> "Params":
    """Initialise the weights and biases of the four-layer autoencoder.

    Each weight matrix is drawn from a standard normal and scaled by
    sqrt(2 / fan_in) (He initialisation), where fan_in is the input width of
    that particular layer. Biases are drawn from a standard normal.

    Args:
        cfg: Configuration exposing `seed` and `model.input_size`,
            `model.hidden1_size`, `model.bottleneck_size`,
            `model.hidden2_size` and `model.output_size`.

    Returns:
        A `Params` tuple (w1, b1, ..., w4, b4) with w_i of shape
        (fan_in, fan_out) and b_i of shape (fan_out,).
    """
    # Layer widths in forward order: Input -> Hidden1 -> Bottleneck -> Hidden2 -> Output
    layer_sizes = (
        cfg.model.input_size,
        cfg.model.hidden1_size,
        cfg.model.bottleneck_size,
        cfg.model.hidden2_size,
        cfg.model.output_size,
    )
    num_layers = len(layer_sizes) - 1

    # One key per weight and per bias (8 in total), ordered w1, b1, w2, b2, ...
    # The count is derived from the layers actually built rather than from the
    # number of fields in the config model.
    keys = jax.random.split(jax.random.key(cfg.seed), 2 * num_layers)

    tensors = []
    for layer, (fan_in, fan_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        w_key, b_key = keys[2 * layer], keys[2 * layer + 1]
        # He scaling must use this layer's own fan-in, not the network input size
        weight = jax.random.normal(w_key, (fan_in, fan_out)) * jnp.sqrt(2 / fan_in)
        bias = jax.random.normal(b_key, (fan_out,))
        tensors.extend((weight, bias))

    # Return the weights
    return Params(*tensors)

In [None]:
@jax.jit
def encoder_predict(params: Params, inputs: jax.Array) -> jax.Array:
    """Encode a 2D batch of rows into the bottleneck representation.

    Applies dense layer 1 (w1, b1) and dense layer 2 (w2, b2), each followed
    by a ReLU, and returns the output of layer 2.
    """
    activations = inputs
    for weight, bias in ((params.w1, params.b1), (params.w2, params.b2)):
        # Affine map over the feature axis, then ReLU
        pre_activation = jnp.einsum("ih,hj->ij", activations, weight) + bias
        activations = jax.nn.relu(pre_activation)
    return activations


@jax.jit
def decoder_predict(params: Params, inputs: jax.Array) -> jax.Array:
    """Decode a 2D batch of bottleneck codes back to the output space.

    Applies dense layer 3 (w3, b3) followed by a ReLU, then dense layer 4
    (w4, b4) with no activation, and returns the output of layer 4.
    """
    # Layer 3: affine map + ReLU
    hidden_pre = jnp.einsum("ih,hj->ij", inputs, params.w3) + params.b3
    hidden = jax.nn.relu(hidden_pre)
    # Layer 4: affine map only (linear output)
    reconstruction = jnp.einsum("ih,hj->ij", hidden, params.w4) + params.b4
    return reconstruction


@jax.jit
def decision_function(params: Params, inputs: jax.Array) -> jax.Array:
    """Return the reconstruction error of each row of `inputs`.

    The inputs are encoded and decoded, and the squared difference between
    input and reconstruction is averaged over the last axis.
    """
    bottleneck = encoder_predict(params=params, inputs=inputs)
    residual = inputs - decoder_predict(params=params, inputs=bottleneck)
    # Mean squared error along the last axis
    return jnp.square(residual).mean(axis=-1)


def reconstruction_error(inputs: jax.Array, decodes: jax.Array) -> jax.Array:
    """Return the mean squared difference between `inputs` and `decodes`.

    The mean is taken over every element, so the result is a scalar array.
    """
    residual = inputs - decodes
    return jnp.square(residual).mean()


@jax.jit
def forward_pass(params: Params, inputs: jax.Array) -> jax.Array:
    """Return the scalar reconstruction loss of the autoencoder on `inputs`.

    Encodes then decodes the inputs and returns `reconstruction_error`
    between the inputs and their reconstruction.
    """
    reconstruction = decoder_predict(
        params=params,
        inputs=encoder_predict(params=params, inputs=inputs),
    )
    return reconstruction_error(inputs=inputs, decodes=reconstruction)


@jax.jit
def value_and_grad(params: Params, inputs: jax.Array) -> tuple[jax.Array, jax.Array]:
    """Return `forward_pass(params, inputs)` and its gradient with respect to `params`."""
    # jax.value_and_grad differentiates with respect to the first positional argument
    loss_and_gradient = jax.value_and_grad(forward_pass)
    return loss_and_gradient(params, inputs=inputs)

In [None]:
# Initialize the Parameters
params = init_params(cfg)

# Fit a RobustScaler on the class-0 training rows and reuse it for validation
scaler = RobustScaler()
# Scaled class-0 training rows (the scaler is fitted here)
X_train_class_0_sc = jnp.array(
    scaler.fit_transform(X_train_class_0.to_numpy()), dtype=jnp.float32
)
# Scaled class-0 validation rows (transform only, no refit)
X_val_class_0_sc = jnp.array(
    scaler.transform(X_val.filter(y_val == 0).to_numpy()), dtype=jnp.float32
)

# Learning-rate schedule: a linear schedule followed by a cosine decay, joined
# at the configured boundaries; all values come from the config
lr_scheduler = optax.schedules.join_schedules(
    [
        optax.schedules.linear_schedule(
            init_value=cfg.training.schedulers.linear_schedule.init_value,
            end_value=cfg.training.schedulers.linear_schedule.end_value,
            transition_steps=cfg.training.schedulers.linear_schedule.transition_steps,
        ),
        optax.schedules.cosine_decay_schedule(
            init_value=cfg.training.schedulers.cosine_decay_schedule.init_value,
            decay_steps=cfg.training.schedulers.cosine_decay_schedule.decay_steps,
            alpha=cfg.training.schedulers.cosine_decay_schedule.alpha,
        ),
    ],
    boundaries=cfg.training.schedulers.boundaries,
)
# Initialize the Adam optimizer with the schedule above
optimizer = optax.adam(lr_scheduler)
opt_state = optimizer.init(params)

# History of the per-epoch losses
history = {"train_loss": [], "val_loss": []}

# Training loop: each epoch is one gradient step on the whole scaled
# class-0 training array
for i in range(cfg.training.epochs):
    # Get value and gradients
    train_loss, gradients = value_and_grad(params, X_train_class_0_sc)
    # The validation loss, evaluated with the parameters from before this epoch's update
    val_loss = forward_pass(params, X_val_class_0_sc)
    # Update the parameters
    updates, opt_state = optimizer.update(gradients, opt_state)
    params = optax.apply_updates(params, updates)
    # Store the loss
    history["train_loss"].append(float(train_loss))
    history["val_loss"].append(float(val_loss))


# Set Figure
fig = go.Figure()

# Add one line per loss curve
fig.add_trace(go.Scatter(y=history["train_loss"], name="Train Loss"))
fig.add_trace(go.Scatter(y=history["val_loss"], name="Validation Loss"))

# Update Layout
fig.update_layout(
    title=dict(text="Training and Validation Loss", x=0.5, font=dict(size=30)),
    xaxis_title="Epoch",
    yaxis_title="Loss",
    template="plotly_dark",
)

# Show Figure
fig.show()

In [None]:
class BatchState(NamedTuple):
    """State carried between mini-batch draws."""
    # PRNG key stored alongside the shuffled order
    key: jax.random.PRNGKey
    # Start offset of the next batch within `indices`
    idx: int
    # Shuffled row indices of the data set
    indices: jax.Array


def init_batch_state(key: jax.random.PRNGKey, data_size: int) -> BatchState:
    """Create the initial batching state for a data set of `data_size` rows.

    Args:
        key: PRNG key used to derive the shuffle.
        data_size: Number of rows to shuffle.

    Returns:
        A `BatchState` positioned at index 0 with a random permutation of
        `range(data_size)` and a fresh, not yet consumed PRNG key.
    """
    # 1. Split before use: one sub-key draws the permutation, the other is
    #    stored, so the key kept in the state is never one already consumed.
    next_key, shuffle_key = jax.random.split(key)
    shuffled_indices = jax.random.permutation(shuffle_key, data_size)
    # 2. Return the batch state
    return BatchState(key=next_key, idx=0, indices=shuffled_indices)


def get_batch(batch_state: BatchState, X: jax.Array, batch_size: int) -> BatchState:
    """Gather the rows of `X` for the batch starting at `batch_state.idx`.

    NOTE(review): this function is unfinished. It computes `epoch_ended` and
    `X_batch` but has no `return` statement, so it currently returns None
    despite the `BatchState` annotation.
    """
    # Get Data Size
    data_size = X.shape[0]
    # Position one past the last element of the current batch
    next_idx = batch_state.idx + batch_size
    # Check if the epoch has ended
    epoch_ended = next_idx >= data_size

    # 1. Get the indices for the current batch
    # NOTE(review): when next_idx > data_size the arange runs past the end of
    # `indices`; confirm how the final, shorter batch should be handled.
    batch_indices = jnp.take(batch_state.indices, jnp.arange(batch_state.idx, next_idx))
    X_batch = jnp.take(X, batch_indices, axis=0)

    # 2. TODO: not implemented — nothing is returned and `epoch_ended` is unused.

In [None]:
# Batching key
key = jax.random.key(101)
# Get the initial batching state.
# NOTE(review): `X` is the full feature frame, not the scaled class-0 training
# array used for training — confirm which row count is intended here.
batch = init_batch_state(key, X.shape[0])

In [None]:
# Score the validation set with the autoencoder; the anomaly score is the
# per-row reconstruction error. The model was trained on RobustScaler output,
# so the validation features go through the same fitted scaler first.
X_val_sc = jnp.array(scaler.transform(X_val.to_numpy()), dtype=jnp.float32)
anomaly_scores = np.array(decision_function(params, X_val_sc))

# Plot the Anomaly Scores
plot_anomaly_score(anomaly_scores).show()

# Plot the PR Curve; the helper returns the figure and a threshold.
# The threshold gets its own name so the LOF threshold is not overwritten.
_, ae_threshold = plot_precision_recall_curve(y_val.to_numpy(), anomaly_scores)
_.show()

# Plot the ROC curve
plot_roc_curve(y_val.to_numpy(), anomaly_scores).show()

# Report average precision, ROC AUC and the autoencoder threshold
print(f"Average Precision: {average_precision_score(y_val, anomaly_scores):.2%}")
print(f"ROC AUC Score: {roc_auc_score(y_true=y_val, y_score=anomaly_scores):.2%}")
print(f"Autoencoder Threshold: {ae_threshold}")

# Display the confusion matrix using the autoencoder threshold. The Isolation
# Forest threshold lives on a different score scale and must not be used here.
_ = ConfusionMatrixDisplay(
    confusion_matrix(y_true=y_val, y_pred=anomaly_scores > ae_threshold)
).plot()

# Ranking metrics at several cut-offs K
display(
    compute_ranking_metrics_at_k(
        y_val.to_numpy(), anomaly_scores, [10, 15, 20, 25, 30, 40, 50, 100, 150, 200]
    )
)

In [None]:
# Debug: boolean mask of the current `anomaly_scores` above the Isolation Forest threshold.
# NOTE(review): `anomaly_scores` was last assigned from the autoencoder, so this
# compares one model's scores with another model's threshold — confirm intent.
anomaly_scores > if_threshold

In [None]:
# Debug: inspect the most recently computed anomaly scores
anomaly_scores

In [None]:
# Debug: inspect the threshold obtained in the Isolation Forest cell
if_threshold