In [None]:
import torch

# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Echo the choice as the cell output.
device

In [None]:
import torch
import pandas as pd 

# Positional (iloc) column indexes used throughout the notebook.
#
# Model inputs:
#   1 -> distance_from_last_transaction
#   2 -> ratio_to_median_purchase_price
#   4 -> used_chip
#   5 -> used_pin_number
#   6 -> online_order
feature_indexes = [1, 2, 4, 5, 6]

# Target column:
#   7 -> fraud
label_indexes = [7]

# Read the training CSV, then split it by column position into the label frame
# and the feature frame (train_df is rebound to the feature-only view).
train_df = pd.read_csv('data/train.csv')
labels_df, train_df = (
    train_df.iloc[:, label_indexes],
    train_df.iloc[:, feature_indexes],
)

# float32 tensor copies of both frames, placed on the selected device.
train_df_tensor = torch.tensor(train_df.to_numpy(), dtype=torch.float32).to(device)
labels_df_tensor = torch.tensor(labels_df.to_numpy(), dtype=torch.float32).to(device)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch

# Reload the training CSV and keep only the label / feature columns.
train_df = pd.read_csv('data/train.csv')
labels_df, train_df = (
    train_df.iloc[:, label_indexes],
    train_df.iloc[:, feature_indexes],
)

# Reference statistics from scikit-learn, shown as the cell output so they can
# be compared with the torch-based scaler fitted in the next cell.
sk_scaler = StandardScaler().fit(train_df)
(sk_scaler.mean_, sk_scaler.scale_)

In [None]:
import torch


# like scikit learn standard scaler
class TorchStandardScaler:
    """Column-wise standardisation for tensors, mirroring sklearn's StandardScaler.

    After ``fit``:
      * ``mean`` holds the per-column mean (reduction over dim 0).
      * ``std`` holds the per-column *population* standard deviation (no Bessel
        correction), which is the estimator scikit-learn's ``scale_`` uses.
        Columns whose spread is numerically zero get a scale of 1 so that
        ``transform`` never divides by zero.
    """

    def __init__(self):
        self.mean = None
        self.std = None

    def fit(self, tensor):
        """Compute per-column mean and scale from ``tensor`` (reduced over dim 0)."""
        self.mean = tensor.mean(dim=0, keepdim=False)
        # unbiased=False -> divide by N, not N - 1, to agree with sklearn.
        std = tensor.std(dim=0, unbiased=False, keepdim=False)
        # A constant column would otherwise yield 0/0 = NaN in transform().
        tiny = 10 * torch.finfo(std.dtype).eps
        self.std = torch.where(std < tiny, torch.ones_like(std), std)

    def transform(self, tensor):
        """Return ``(tensor - mean) / std`` using the fitted statistics.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.mean is None or self.std is None:
            raise RuntimeError("TorchStandardScaler.transform called before fit")
        return (tensor - self.mean) / self.std

    def fit_transform(self, tensor):
        """Fit on ``tensor`` and return its standardised version."""
        self.fit(tensor)
        return self.transform(tensor)


# float32 copy of the feature frame on the selected device.
train_df_tensor = torch.tensor(train_df.to_numpy(), dtype=torch.float32).to(device)

# Fit the torch scaler on the training features and show its statistics.
scaler = TorchStandardScaler()
scaler.fit(train_df_tensor)
(scaler.mean, scaler.std)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader


class CSVDataset(Dataset):
    """Map-style dataset that loads a whole CSV into device tensors up front.

    Feature and label columns are selected by position using the notebook-level
    ``feature_indexes`` / ``label_indexes`` lists, converted to float tensors and
    moved to the notebook-level ``device`` at construction time.

    Args:
        csv_file: Path of the CSV file to load.
        pyarrow_fs: Optional filesystem object exposing ``open_input_file``;
            when given, the file is read through ``pyarrow.csv`` instead of
            ``pandas.read_csv``.
        transform: Optional callable applied once to the full feature tensor.
        target_transform: Optional callable applied once to the full label tensor.
    """

    def __init__(self, csv_file, pyarrow_fs=None, transform=None, target_transform=None):
        self.feature_indexes = feature_indexes
        self.label_indexes = label_indexes

        if pyarrow_fs:
            # Imported lazily: `pv` was never bound anywhere in the notebook, so
            # this branch used to fail with NameError. Only callers that pass a
            # pyarrow filesystem need pyarrow installed.
            from pyarrow import csv as pv

            with pyarrow_fs.open_input_file(csv_file) as file:
                training_table = pv.read_csv(file)
            self.data = training_table.to_pandas()
        else:
            self.data = pd.read_csv(csv_file)

        self.features = self.data.iloc[:, self.feature_indexes].values
        self.labels = self.data.iloc[:, self.label_indexes].values
        self.features = torch.tensor(self.features, dtype=torch.float).to(device)
        self.labels = torch.tensor(self.labels, dtype=torch.float).to(device)

        self.transform = transform
        self.target_transform = target_transform

        # Transforms run once over the whole tensor here, not per item.
        if self.transform:
            self.features = self.transform(self.features)
        if self.target_transform:
            self.labels = self.target_transform(self.labels)

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair(s) at ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        features = self.features[idx]
        label = self.labels[idx]
        return features, label


# Build the training and validation datasets (in that order) from their CSVs.
training_data, validation_data = (
    CSVDataset(csv_path) for csv_path in ('data/train.csv', 'data/validate.csv')
)

In [None]:
from torch.utils.data import DataLoader

batch_size = 64

# Reshuffle the training samples every epoch so mini-batches are not tied to
# the row order of the CSV; validation order does not affect its metrics, so
# that loader stays sequential.
training_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)

In [None]:
from torch import nn


class NeuralNetwork(nn.Module):
    """Fully connected binary classifier with built-in input scaling.

    The network is 5 -> 32 -> 32 -> 32 -> 1 with ReLU between the linear layers
    and a final Sigmoid, so ``forward`` returns values in (0, 1).

    Args:
        scaler: Object exposing ``transform(tensor)``; it is applied to every
            input batch before the layers run.
    """

    def __init__(self, scaler):
        super().__init__()

        # Input width followed by the hidden widths; consecutive pairs become
        # Linear + ReLU stages, in order.
        widths = (5, 32, 32, 32)
        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        # Output head: a single unit squashed by a sigmoid.
        stages.extend((nn.Linear(widths[-1], 1), nn.Sigmoid()))

        self.linear_relu_stack = nn.Sequential(*stages)
        self.scaler = scaler

    def forward(self, x):
        # Scaling is pure preprocessing, so it is kept out of the autograd graph.
        with torch.no_grad():
            scaled = self.scaler.transform(x)
        return self.linear_relu_stack(scaled)


# Build the classifier around the fitted scaler, then move it to the device.
model = NeuralNetwork(scaler)
model.to(device)  # nn.Module.to updates the module in place and returns it

# Show the layer summary as the cell output.
model

In [None]:
from sklearn.metrics import precision_score, recall_score


def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader`` and print periodic loss lines.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; must support ``len()`` and
            expose ``.dataset``.
        model: Module called as ``model(X)``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer stepping ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    # Log roughly ten times per epoch. Clamped to 1 so short loaders do not
    # trigger a modulo-by-zero, and derived from the loader itself instead of a
    # notebook-level batch_size variable.
    log_every = max(1, round(len(dataloader) / 10))

    model.train()
    seen = 0  # running count of samples processed so far
    for batch, (X, y) in enumerate(dataloader):
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        # Clear gradients after the step so the next batch starts from zero.
        optimizer.zero_grad()

        seen += len(X)
        if batch % log_every == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")


def eval_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and return summary metrics.

    Predictions are rounded to the nearest integer to obtain class labels.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; must support ``len()`` and
            expose ``.dataset``.
        model: Module called as ``model(X)``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.

    Returns:
        dict with keys ``"accuracy"`` (percent), ``"loss"`` (mean per-batch
        loss), ``"precision"`` and ``"recall"``.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    if size == 0 or num_batches == 0:
        # The averages below would otherwise fail with a bare ZeroDivisionError.
        raise ValueError("eval_loop requires a non-empty dataloader")

    eval_loss, correct = 0, 0

    # Collect per-batch results and concatenate once at the end; concatenating
    # inside the loop re-copies everything gathered so far on every batch.
    pred_chunks, label_chunks = [], []

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            eval_loss += loss_fn(pred, y).item()

            pred_labels = torch.round(pred)
            correct += torch.eq(pred_labels, y).sum().item()

            # Moved to CPU so the sklearn metrics below can consume them.
            pred_chunks.append(pred_labels.cpu())
            label_chunks.append(y.cpu())

    all_preds = torch.cat(pred_chunks)
    all_labels = torch.cat(label_chunks)

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    eval_loss /= num_batches
    accuracy = correct / size * 100

    return {
        "accuracy": accuracy,
        "loss": eval_loss,
        "precision": precision,
        "recall": recall
    }



In [None]:
%%time

# Binary cross-entropy on probabilities; the model's last layer is a Sigmoid.
loss_fn = nn.BCELoss().to(device)

learning_rate = 1e-3
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Each epoch: one full pass over the training loader, then a validation pass.
# `metrics` keeps the validation results of the most recent epoch and is read
# by later cells.
num_epochs = 2
for t in range(num_epochs):
    print(f"\nEpoch {t+1}\n-------------------------------")
    train_loop(training_dataloader, model, loss_fn, optimizer)
    metrics = eval_loop(validation_dataloader, model, loss_fn)
    print(f"Eval Metrics: \n Accuracy: {(metrics['accuracy']):>0.1f}%, Avg loss: {metrics['loss']:>8f}, "
          f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f} \n")



In [None]:
# Validation accuracy (in percent) from the last completed epoch.
metrics.get("accuracy")

In [None]:
# Re-print the validation metrics of the last completed epoch.
print(f"Eval Metrics: \n Accuracy: {(metrics['accuracy']):>0.1f}%, Avg loss: {metrics['loss']:>8f}, "
      f"Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f} \n")


### Save Model

In [None]:
import os

model_dir = "models/fraud/1/"
os.makedirs(model_dir, exist_ok=True)

# os.path.join avoids the doubled separator that string formatting produced
# because model_dir already ends with "/".
# NOTE(review): torch.save(model) pickles the whole module object, so loading
# it elsewhere presumably needs the NeuralNetwork and TorchStandardScaler
# classes importable under the same names - confirm with the consumer of this
# file before switching to a state_dict-based format.
torch.save(model, os.path.join(model_dir, "model.pth"))

### Test Model

In [None]:
def run_inference(test_data):
    """Classify a single transaction with the notebook-level ``model``.

    Args:
        test_data: Input tensor for one transaction; the rounded model output
            must hold exactly one element because ``.item()`` is called on it.

    Returns:
        ``"fraud"`` when the rounded model output equals 1, otherwise
        ``"NOT fraud"``.
    """
    model.eval()
    with torch.inference_mode():
        rounded = model(test_data).round()

    return "fraud" if rounded.item() == 1 else "NOT fraud"

In [None]:
# valid transaction: one hand-written row with five values, presumably in
# feature_indexes column order (distance_from_last_transaction,
# ratio_to_median_purchase_price, used_chip, used_pin_number, online_order)
valid_tx = torch.tensor([[0.0, 1.0, 1.0, 1.0, 0.0]]).to(device)
prediction = run_inference(valid_tx)
print(f"The model thinks the valid transaction is {prediction}")

In [None]:
# fraudulent use case: one hand-written row with five values, presumably in
# feature_indexes column order (distance_from_last_transaction,
# ratio_to_median_purchase_price, used_chip, used_pin_number, online_order)
fraud_tx = torch.tensor([[100, 1.2, 0.0, 0.0, 1.0]]).to(device)
prediction = run_inference(fraud_tx)
# The message previously called this sample "the valid transaction".
print(f"The model thinks the fraudulent transaction is {prediction}")

In [None]:
# Alternative input kept from the original cell: 'data/test_sample.csv'
test_df = pd.read_csv('data/test.csv')

# Split by column position into labels and features.
test_labels_df, test_data_df = (
    test_df.iloc[:, label_indexes],
    test_df.iloc[:, feature_indexes],
)

# float32 tensor copies of both frames, placed on the selected device.
test_data_df_tensor = torch.tensor(test_data_df.to_numpy(), dtype=torch.float32).to(device)
test_labels_df_tensor = torch.tensor(test_labels_df.to_numpy(), dtype=torch.float32).to(device)

In [None]:
# Score the whole test set in a single forward pass with autograd disabled;
# y_pred holds the model's sigmoid outputs, one row per test sample.
model.eval()
with torch.inference_mode():
    y_pred = model(test_data_df_tensor)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from matplotlib import pyplot as plt

# sklearn converts its inputs to NumPy, which fails for tensors that live on
# CUDA/MPS. Bring the rounded predictions and the labels to the CPU once and
# reuse them for every metric (previously only the confusion matrix did this).
pred_labels = torch.round(y_pred).cpu()
true_labels = test_labels_df_tensor.cpu()

correct = torch.eq(pred_labels, true_labels).sum().item()
acc = (correct / len(y_pred)) * 100
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)

print(f"Eval Metrics: \n Accuracy: {acc:>0.1f}%, "
      f"Precision: {precision:.4f}, Recall: {recall:.4f} \n")

c_matrix = confusion_matrix(true_labels, pred_labels)
ConfusionMatrixDisplay(c_matrix).plot()
