In [10]:
#import libraries
import pandas as pd
import numpy as np
#sklearn for preprocessing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
#matplotlib and seaborn to graph
import matplotlib.pyplot as plt
import seaborn as sns
#pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [11]:
#load in dataset
df = pd.read_csv('creditcard_fraud_detection.csv')
print(df.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [12]:
#check nas
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [13]:
#calculate proportion of fraud in dataset
print(sum(df['Class'])/len(df))
#dataset is sparse, much more non fraud than fraud

0.001727485630620034


In [14]:
#preprocess and split for train test

#X is everything but "Class" (PCA features, time, amount)
X = df.drop('Class', axis = 1).values
#y is class (0 = no fraud, 1 = fraud)
y= df['Class'].values

scalar = StandardScaler()
#PCA features are already scaled, but time and amount aren't
X_scaled = scalar.fit_transform(X)

#train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size= 0.2, random_state= 50, 
                                                    #stratify y since dataset is sparse and we want to ensureki
                                                    stratify= y)

In [15]:
#wrap dataset in classes for pytorch
class CreditFraudDataset(Dataset):
    def __init__(self, X, y):
        #convert np arrays to tensors
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    #return len dataset
    def __len__(self):
        return len(self.X)

    #return item of X, y at specific index
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 2048

#wrap datasets
train_dataset = CreditFraudDataset(X_train, y_train)
test_dataset  = CreditFraudDataset(X_test, y_test)

#initialize loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [16]:
#set up neural net
class FraudNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(

            #layer 1, 64 neurons
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),

            #layer 2, 32 neurons
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3),

            #only one output logit, for fraud or not
            nn.Linear(32, 1)
        )

    def forward(self, x):
        #x: (batch_size, input_dim)
        logits = self.net(x).squeeze(1)  #(batch_size,)
        return logits

input_dim = X_train.shape[1]
model = FraudNet(input_dim)
print(model)

FraudNet(
  (net): Sequential(
    (0): Linear(in_features=30, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [17]:
#set up device
#todo: set up cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

FraudNet(
  (net): Sequential(
    (0): Linear(in_features=30, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [18]:
#set up bce for model
#count number of each class
class_counts = np.bincount(y_train)
neg, pos = class_counts[0], class_counts[1]
#set weights
pos_weight = torch.tensor([neg / pos], dtype=torch.float32).to(device)

#use BCE to weight fraud more in the model. Model treats missed fraud transactions more harshly in loss function
#without weights model can get high performance just by predicting no fraud always, since 99%+ of samples aren't fraud
#BCE fixes this by penalizing no fraud much more harshly than no fraud
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [19]:
#set number of epochs to train for
num_epochs = 20


#run training loop for each epoch
for epoch in range(num_epochs):
    #train
    model.train()
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        logits = model(X_batch)               #output scores as logits instead of probabilities for bce
        loss = criterion(logits, y_batch)     #calc loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * X_batch.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)

    #evaluate for each epoch
    model.eval()
    all_logits = []
    all_targets = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            logits = model(X_batch)
            all_logits.append(logits.cpu())
            all_targets.append(y_batch)

    all_logits = torch.cat(all_logits)
    all_targets = torch.cat(all_targets)

    #converts logits to p
    probs = torch.sigmoid(all_logits).numpy()
    targets = all_targets.numpy()

    #calculate roc auc score
    auc = roc_auc_score(targets, probs)

    #print output for epoch
    print(f"Epoch {epoch+1:02d} | Loss: {epoch_loss:.4f} | ROC-AUC: {auc:.4f}")

Epoch 01 | Loss: 0.8629 | ROC-AUC: 0.9851
Epoch 02 | Loss: 0.5430 | ROC-AUC: 0.9861
Epoch 03 | Loss: 0.4192 | ROC-AUC: 0.9856
Epoch 04 | Loss: 0.3481 | ROC-AUC: 0.9868
Epoch 05 | Loss: 0.3125 | ROC-AUC: 0.9853
Epoch 06 | Loss: 0.2964 | ROC-AUC: 0.9865
Epoch 07 | Loss: 0.2745 | ROC-AUC: 0.9870
Epoch 08 | Loss: 0.2381 | ROC-AUC: 0.9878
Epoch 09 | Loss: 0.2389 | ROC-AUC: 0.9863
Epoch 10 | Loss: 0.2367 | ROC-AUC: 0.9864
Epoch 11 | Loss: 0.2217 | ROC-AUC: 0.9838
Epoch 12 | Loss: 0.2373 | ROC-AUC: 0.9856
Epoch 13 | Loss: 0.2006 | ROC-AUC: 0.9872
Epoch 14 | Loss: 0.2154 | ROC-AUC: 0.9858
Epoch 15 | Loss: 0.1886 | ROC-AUC: 0.9839
Epoch 16 | Loss: 0.1985 | ROC-AUC: 0.9832
Epoch 17 | Loss: 0.1876 | ROC-AUC: 0.9843
Epoch 18 | Loss: 0.1783 | ROC-AUC: 0.9855
Epoch 19 | Loss: 0.1683 | ROC-AUC: 0.9857
Epoch 20 | Loss: 0.1564 | ROC-AUC: 0.9837


In [20]:
#threshold for confusion matrix
threshold = 0.5
#make predictions
preds = (probs >= threshold).astype(int)

#print confusion matrix
print("Confusion matrix:")
print(confusion_matrix(targets, preds))

print("\nClassification report:")
print(classification_report(targets, preds, digits=4))


Confusion matrix:
[[55774  1090]
 [    8    90]]

Classification report:
              precision    recall  f1-score   support

         0.0     0.9999    0.9808    0.9903     56864
         1.0     0.0763    0.9184    0.1408        98

    accuracy                         0.9807     56962
   macro avg     0.5381    0.9496    0.5655     56962
weighted avg     0.9983    0.9807    0.9888     56962

