In [1]:
import shared_functions
#下面用到什么定义就从shared_functions中载入
from shared_functions import read_from_files
import os
import pandas as pd
import numpy as np
import math
import sys
import time
import pickle
import json
import datetime
import random
import torch
import sklearn
from sklearn import *
from shared_functions import performance_assessment
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
get_ipython().run_line_magic('matplotlib', 'inline')
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

import graphviz
import xgboost
# For imbalanced learning
import imblearn
import warnings
warnings.filterwarnings('ignore')
from shared_functions import training_loop
import math
import torch.nn as nn

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
DIR_INPUT='D:/Credit Card Fraud Detection/simulated-data-transformed/data/' 

BEGIN_DATE = "2018-06-11"
END_DATE = "2018-09-14"

print("Load  files")
%time transactions_df=read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)
print("{0} transactions loaded, containing {1} fraudulent transactions".format(len(transactions_df),transactions_df.TX_FRAUD.sum()))

output_feature="TX_FRAUD"

input_features=['TX_AMOUNT','TX_DURING_WEEKEND', 'TX_DURING_NIGHT', 'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW', 'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW', 'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW', 'TERMINAL_ID_NB_TX_1DAY_WINDOW',
       'TERMINAL_ID_RISK_1DAY_WINDOW', 'TERMINAL_ID_NB_TX_7DAY_WINDOW',
       'TERMINAL_ID_RISK_7DAY_WINDOW', 'TERMINAL_ID_NB_TX_30DAY_WINDOW',
       'TERMINAL_ID_RISK_30DAY_WINDOW']

Load  files
CPU times: total: 516 ms
Wall time: 2.32 s
919767 transactions loaded, containing 8195 fraudulent transactions


In [3]:
from shared_functions import get_train_test_set, scaleData
# Set the starting day for the training period, and the deltas
start_date_training = datetime.datetime.strptime("2018-07-25", "%Y-%m-%d")
delta_train=7
delta_delay=7
delta_test=7
delta_valid = delta_test
start_date_training_with_valid = start_date_training+datetime.timedelta(days=-(delta_delay+delta_valid))
(train_df, valid_df)=get_train_test_set(transactions_df,start_date_training_with_valid,
                                       delta_train=delta_train,delta_delay=delta_delay,delta_test=delta_test)
# By default, scales input data
(train_df, valid_df)=scaleData(train_df, valid_df,input_features)

In [4]:
if torch.cuda.is_available():
    DEVICE = "cuda" 
else:
    DEVICE = "cpu"
print("Selected device is",DEVICE)

SEED = 42
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

Selected device is cuda


In [5]:
x_train = torch.FloatTensor(train_df[input_features].values)
x_valid = torch.FloatTensor(valid_df[input_features].values)
y_train = torch.FloatTensor(train_df[output_feature].values)
y_valid = torch.FloatTensor(valid_df[output_feature].values)

In [6]:
class FraudDatasetUnsupervised(torch.utils.data.Dataset):
    
    def __init__(self, x,output=True):
        'Initialization'
        self.x = x
        self.output = output

    def __len__(self):
        'Returns the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample index
        item = self.x[index].to(DEVICE)
        if self.output:
            return item, item
        else:
            return item

training_set = FraudDatasetUnsupervised(x_train)
valid_set = FraudDatasetUnsupervised(x_valid)

from shared_functions import prepare_generators

training_generator,valid_generator = prepare_generators(training_set, valid_set, batch_size = 64)

In [8]:
import torch.nn as nn
class GELU(nn.Module):
    def __init__(self, init_sigma=1.0):  # 初始值为1.0，但不进行训练
        super(GELU, self).__init__()
        self.sigma = init_sigma  # 直接将 sigma 定义为常数

    def forward(self, input):
        output = input * (1 + torch.erf(input / torch.sqrt(torch.tensor(2.0))) / self.sigma) / 2
        return output


class SimpleAutoencoderG(torch.nn.Module):
    
        def __init__(self, input_size, intermediate_size, code_size):
            super(SimpleAutoencoderG, self).__init__()
            # parameters
            self.input_size = input_size
            self.intermediate_size = intermediate_size           
            self.code_size  = code_size
            
            self.gelu1 = GELU(init_sigma=4.108071217704759)  
            self.gelu2 = GELU(init_sigma=3.05488181077701)
            self.gelu3 = GELU(init_sigma=2.3962617153103962)
            #encoder
            self.fc1 = torch.nn.Linear(self.input_size, self.intermediate_size)
            self.fc2 = torch.nn.Linear(self.intermediate_size, self.code_size)
            
            #decoder 
            self.fc3 = torch.nn.Linear(self.code_size, self.intermediate_size)            
            self.fc4 = torch.nn.Linear(self.intermediate_size, self.input_size)
            
            
        def forward(self, x):
            
            hidden = self.fc1(x)
            hidden = self.gelu1(hidden)
            
            code = self.fc2(hidden)
            code = self.gelu2(code)
 
            hidden = self.fc3(code)
            hidden = self.gelu3(hidden)
            
            output = self.fc4(hidden)
            #linear activation in final layer)            
            
            return output

In [9]:
training_generator,valid_generator = prepare_generators(training_set, valid_set, batch_size = 64)
criterion = torch.nn.MSELoss().to(DEVICE)
modelG = SimpleAutoencoderG(len(input_features), 100,20).to(DEVICE)
optimizer = torch.optim.Adam(modelG.parameters(), lr = 0.0001)
modelG,training_execution_time,train_lossesG,valid_lossesG = training_loop(modelG,
                                                                        training_generator,
                                                                        valid_generator,
                                                                        optimizer,
                                                                        criterion,
                                                                        max_epochs=50,
                                                                        apply_early_stopping=False,
                                                                        verbose=True)


Epoch 0: train loss: 0.5311346062339742
valid loss: 0.2001096609593089

Epoch 1: train loss: 0.15844570215698867
valid loss: 0.10578395783331225

Epoch 2: train loss: 0.08639832394977544
valid loss: 0.05426356135740306

Epoch 3: train loss: 0.041200875546818824
valid loss: 0.026784037613396434

Epoch 4: train loss: 0.02423864124025829
valid loss: 0.019205639920518047

Epoch 5: train loss: 0.01733798168717134
valid loss: 0.013850844587336798

Epoch 6: train loss: 0.013282489568888998
valid loss: 0.011539075843216291

Epoch 7: train loss: 0.01004715336719973
valid loss: 0.006518060319084939

Epoch 8: train loss: 0.005314225230782683
valid loss: 0.003877969170396364

Epoch 9: train loss: 0.004041736792700889
valid loss: 0.0031565247832598987

Epoch 10: train loss: 0.0031923795142077467
valid loss: 0.002498261734863611

Epoch 11: train loss: 0.0024094502683096775
valid loss: 0.0018686942336317456

Epoch 12: train loss: 0.0018774237762869516
valid loss: 0.0014959424366421305

Epoch 13: tra

In [10]:
def per_sample_mse(model, generator):
    
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []
    
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses

In [11]:
losses = per_sample_mse(modelG, valid_generator)
genuine_losses = np.array(losses)[y_valid == 0]
fraud_losses = np.array(losses)[y_valid == 1]
print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 0.0029228402
Average genuine reconstruction error: 0.00015812405


In [12]:
from shared_functions import performance_assessment
predictions_df=valid_df
predictions_df['predictions']=losses
    
performance_assessment(predictions_df, top_k_list=[100])

Unnamed: 0,AUC ROC,Average precision,Card Precision@100
0,0.848,0.227,0.221


In [19]:
from shared_functions import FraudDataset
class GELU(nn.Module):
    def __init__(self, init_sigma=0.8):# 将 sigma 定义为可学习参数给个初值3.0可以改，但是一定要写3.0，不能写3
        super(GELU, self).__init__()
        self.sigma = nn.Parameter(torch.tensor(init_sigma, dtype=torch.float32))
    
    def forward(self, input):
        output = input * (1 + torch.erf(input / torch.sqrt(torch.tensor(2.0)) / self.sigma)) / 2
        return output

class SimpleFraudMLPWithDropout(torch.nn.Module):
    
        def __init__(self, input_size, hidden_size,p):
            super(SimpleFraudMLPWithDropout, self).__init__()
            # parameters
            self.input_size = input_size
            self.hidden_size  = hidden_size
            self.p = p
            
            #input to hidden
            self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
            self.gelu = GELU()
            #hidden to output
            self.fc2 = torch.nn.Linear(self.hidden_size, 1)
            self.sigmoid = torch.nn.Sigmoid()
            
            self.dropout = torch.nn.Dropout(self.p)
            
        def forward(self, x):
            
            hidden = self.fc1(x)
            hidden = self.gelu(hidden)
            
            hidden = self.dropout(hidden)
            
            output = self.fc2(hidden)
            output = self.sigmoid(output)
            
            return output

In [20]:
seed_everything(SEED)

training_set_supervised = FraudDataset(x_train.to(DEVICE), y_train.to(DEVICE))
valid_set_supervised = FraudDataset(x_valid.to(DEVICE), y_valid.to(DEVICE))

training_generator_supervised,valid_generator_supervised = prepare_generators(training_set_supervised,
                                                                              valid_set_supervised,
                                                                              batch_size=64)

model_supervised = SimpleFraudMLPWithDropout(len(input_features), 1000, 0.2).to(DEVICE)
optimizer = torch.optim.Adam(model_supervised.parameters(), lr = 0.0001)
criterion = torch.nn.BCELoss().to(DEVICE)

model_supervised,training_execution_time,train_losses_dropout,valid_losses_dropout =\
    training_loop(model_supervised,
                  training_generator_supervised,
                  valid_generator_supervised,
                  optimizer,
                  criterion,
                  verbose=True)


Epoch 0: train loss: 0.12420119167827283
valid loss: 0.03719694207071281
New best score: 0.03719694207071281

Epoch 1: train loss: 0.03726520077022679
valid loss: 0.025553828705855406
New best score: 0.025553828705855406

Epoch 2: train loss: 0.029808029889024038
valid loss: 0.023274715673025758
New best score: 0.023274715673025758

Epoch 3: train loss: 0.027656423053072043
valid loss: 0.02198270818376354
New best score: 0.02198270818376354

Epoch 4: train loss: 0.026629672146243816
valid loss: 0.02147989976270725
New best score: 0.02147989976270725

Epoch 5: train loss: 0.02578301156535353
valid loss: 0.021331730773519787
New best score: 0.021331730773519787

Epoch 6: train loss: 0.025219283263238208
valid loss: 0.021081411899474597
New best score: 0.021081411899474597

Epoch 7: train loss: 0.024702781461887986
valid loss: 0.020371117653030027
New best score: 0.020371117653030027

Epoch 8: train loss: 0.024038382993083283
valid loss: 0.020385265087230448
1  iterations since best scor

In [21]:
predictions = []

for x_batch, y_batch in valid_generator_supervised: 
    predictions.append(model_supervised(x_batch.to(DEVICE)).detach().cpu().numpy())

predictions_df=valid_df
predictions_df['predictions']=np.vstack(predictions)
    
performance_assessment(predictions_df, top_k_list=[100])

Unnamed: 0,AUC ROC,Average precision,Card Precision@100
0,0.857,0.643,0.281


In [22]:
loader_params = {'batch_size': 64,
                 'num_workers': 0}
    
training_generator = torch.utils.data.DataLoader(training_set, **loader_params)
valid_generator = torch.utils.data.DataLoader(valid_set, **loader_params)

train_reconstruction = per_sample_mse(modelG, training_generator)
valid_reconstruction = per_sample_mse(modelG, valid_generator)

train_df["reconstruction_error"] = train_reconstruction
valid_df["reconstruction_error"] = valid_reconstruction

In [23]:
seed_everything(SEED)

input_features_new = input_features + ["reconstruction_error"]

# Rescale the reconstruction error
(train_df, valid_df)=scaleData(train_df, valid_df, ["reconstruction_error"])

x_train_new = torch.FloatTensor(train_df[input_features_new].values)
x_valid_new = torch.FloatTensor(valid_df[input_features_new].values)

training_set_supervised_new = FraudDataset(x_train_new.to(DEVICE), y_train.to(DEVICE))
valid_set_supervised_new = FraudDataset(x_valid_new.to(DEVICE), y_valid.to(DEVICE))

training_generator_supervised,valid_generator_supervised = prepare_generators(training_set_supervised_new,
                                                                              valid_set_supervised_new,
                                                                              batch_size=64)

model_supervised = SimpleFraudMLPWithDropout(len(input_features_new), 100, 0.2).to(DEVICE)
optimizer = torch.optim.Adam(model_supervised.parameters(), lr = 0.0001)
criterion = torch.nn.BCELoss().to(DEVICE)

model_supervised,training_execution_time,train_losses_dropout,valid_losses_dropout = \
    training_loop(model_supervised,
                  training_generator_supervised,
                  valid_generator_supervised,
                  optimizer,
                  criterion,
                  verbose=True)

predictions = []
for x_batch, y_batch in valid_generator_supervised: 
    predictions.append(model_supervised(x_batch).detach().cpu().numpy())


Epoch 0: train loss: 0.3585648208244011
valid loss: 0.13043654023623857
New best score: 0.13043654023623857

Epoch 1: train loss: 0.08938531038025484
valid loss: 0.0492938075255338
New best score: 0.0492938075255338

Epoch 2: train loss: 0.051753207919210995
valid loss: 0.03405540076293092
New best score: 0.03405540076293092

Epoch 3: train loss: 0.04113446804961941
valid loss: 0.02854783443245487
New best score: 0.02854783443245487

Epoch 4: train loss: 0.03545046698731187
valid loss: 0.025802986289481404
New best score: 0.025802986289481404

Epoch 5: train loss: 0.032592474709996815
valid loss: 0.02431558140813083
New best score: 0.02431558140813083

Epoch 6: train loss: 0.03130929681016613
valid loss: 0.02334051572137558
New best score: 0.02334051572137558

Epoch 7: train loss: 0.029966676039452064
valid loss: 0.022949993549566207
New best score: 0.022949993549566207

Epoch 8: train loss: 0.028874175880597566
valid loss: 0.022391229698465007
New best score: 0.022391229698465007

Ep

In [24]:
predictions = []
for x_batch, y_batch in valid_generator_supervised: 
    predictions.append(model_supervised(x_batch).detach().cpu().numpy())
predictions_df=valid_df
predictions_df['predictions']=np.vstack(predictions)
    
performance_assessment(predictions_df, top_k_list=[100])

Unnamed: 0,AUC ROC,Average precision,Card Precision@100
0,0.866,0.654,0.279
