In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek,SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns


from tqdm import tqdm
from typing import Optional


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.multiprocessing import set_start_method

import pytorch_lightning as pl

from torch.utils.data import DataLoader, Dataset
import os


In [6]:
df = pd.read_csv(r'D:\Uni Docs\DSC4996\Dynamic_fraud_detection_system\Data\pre_processed_df.csv')

In [7]:
fraud_trans = df[df['Class'] == 1]

In [8]:
fraud_trans

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
534,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
616,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
4886,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
6072,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00,1
6293,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278813,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00,1
279090,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76,1
279096,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89,1
280081,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00,1


### CREATE GAN MODEL

In [9]:
random_seed = 123
torch.manual_seed(random_seed)

BATCH_SIZE=1000
AVAIL_GPUS = min(1, torch.cuda.device_count())
NUM_WORKERS=0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
fraud_trans = fraud_trans.drop('Class',axis=1)

In [12]:
class CreditCardData(Dataset):
    def __init__(self, data: pd.DataFrame):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        row = torch.tensor(self.data.iloc[index]).float()
        return row

In [13]:
class DataModule(pl.LightningDataModule):
    def __init__(self, data: pd.DataFrame, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS):
        super().__init__()
        self.data = data
        self.batch_size = batch_size
        self.num_workers = num_workers
        
    def prepare_data(self):
        """
        Empty prepare_data method left in intentionally. 
        https://pytorch-lightning.readthedocs.io/en/latest/data/datamodule.html#prepare-data
        """
        pass
    
    def setup(self, stage: Optional[str] = None):
        train_df, test_df = train_test_split(self.data, random_state=123, test_size=0.2)
        self.train_df = train_df
        self.test_df = test_df
        data_mean = train_df.mean()
        data_std = train_df.std()
        train_norm = (train_df - data_mean)/data_std
        test_norm = (test_df - data_mean)/data_std
        self.train_df = train_norm
        self.test_df = test_norm
    
    def train_dataloader(self):
        return DataLoader(dataset=CreditCardData(self.train_df), batch_size=self.batch_size, num_workers=self.num_workers)
    
    def valid_dataloader(self):
        return DataLoader(CreditCardData(self.val_df), batch_size=self.batch_size, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(CreditCardData(self.test_df), batch_size=self.batch_size, num_workers=self.num_workers)

In [14]:
class Generator(nn.Module):
    def __init__(self, latent_dim):
        super().__init__()
        self.sequential = nn.Sequential(
            nn.Linear(latent_dim, 100),
            nn.LeakyReLU(0.2),
            nn.Linear(100, 80),
            nn.LeakyReLU(0.2),
            nn.Linear(80, 40),
            nn.LeakyReLU(0.2),
            nn.Linear(40, 30)
        )
        
    def forward(self, x):
        output = self.sequential(x)
        return output

In [15]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.sequential = nn.Sequential(
            nn.Linear(30, 150),
            nn.ReLU(),
            nn.Linear(150, 50),
            nn.ReLU(),
            nn.Linear(50, 25),
            nn.ReLU(),
            nn.Linear(25, 1)
        )
    
    def forward(self, x):
        tensor = torch.sigmoid(self.sequential(x))
        return tensor

In [16]:
class GAN(pl.LightningModule):
    def __init__(self, latent_dim=100, lr=0.002):
        super().__init__()
        self.automatic_optimization = False
        self.save_hyperparameters()
        
        self.generator = Generator(latent_dim = self.hparams.latent_dim)
        self.discriminator = Discriminator()
        
        self.validation_z = torch.randn(6, self.hparams.latent_dim)
        
        self.automatic_optimization = False

    def forward(self, z):
        return self.generator(z)
    
    
    def adversarial_loss(self, y_hat, y):
        return F.binary_cross_entropy(y_hat, y)
    
    
    def training_step(self, batch ): 
        real_data = batch
        
        opt_1, opt_2 = self.optimizers()
        z = torch.randn(real_data.size(0), self.hparams.latent_dim)

        
    def configure_optimizers(self):
        lr = self.hparams.lr
        opt_g = torch.optim.Adam(self.generator.parameters(), lr )
        opt_d = torch.optim.Adam(self.discriminator.parameters(), lr )
        return [opt_g, opt_d], []


In [17]:
gnn = GAN()

In [18]:
dm = DataModule(fraud_trans)
trainer = pl.Trainer(max_epochs=100)
trainer.fit(gnn, dm)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type          | Params
------------------------------------------------
0 | generator     | Generator     | 22.7 K
1 | discriminator | Discriminator | 13.5 K
------------------------------------------------
36.2 K    Trainable params
0         Non-trainable params
36.2 K    Total params
0.145     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [19]:
z = torch.randn(85000, 100)
output = gnn(z)
output

tensor([[-0.1255, -0.0655,  0.1160,  ..., -0.0322, -0.1999, -0.1380],
        [-0.1705, -0.0180,  0.0969,  ..., -0.0784, -0.1806, -0.0667],
        [-0.1942, -0.0403,  0.0915,  ..., -0.0915, -0.2061, -0.1551],
        ...,
        [-0.1367,  0.0287,  0.1145,  ...,  0.0078, -0.1930, -0.0721],
        [-0.0970,  0.0323,  0.0931,  ..., -0.0442, -0.2048, -0.1104],
        [-0.1208,  0.0729,  0.2215,  ..., -0.0819, -0.2145, -0.0130]],
       grad_fn=<AddmmBackward0>)

In [20]:
only_fraud_df =  pd.DataFrame(output.detach().numpy())

In [21]:
only_fraud_df['Class'] = 1

In [22]:
only_fraud_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,Class
0,-0.125472,-0.065518,0.116015,-0.078386,-0.070315,-0.122219,0.017378,-0.126982,0.037647,-0.064911,...,-0.093300,-0.030247,0.125862,-0.075998,-0.203211,-0.113421,-0.032202,-0.199925,-0.137983,1
1,-0.170451,-0.018026,0.096852,-0.065626,0.072934,-0.109999,0.105688,-0.139326,-0.024676,-0.038572,...,0.012400,-0.029055,0.068315,-0.035170,-0.226974,-0.050105,-0.078412,-0.180631,-0.066747,1
2,-0.194220,-0.040289,0.091481,-0.017461,-0.019851,-0.126800,0.100452,-0.153539,-0.021744,-0.032416,...,-0.144041,0.039883,0.146865,-0.086697,-0.240962,-0.144000,-0.091510,-0.206072,-0.155127,1
3,-0.093280,-0.046098,0.146738,-0.053210,0.109236,-0.054069,0.056616,-0.196522,-0.041214,-0.055622,...,-0.050068,0.019684,0.018286,0.022157,-0.161324,-0.015622,-0.043951,-0.208535,-0.073233,1
4,-0.150356,-0.012215,0.128281,-0.067569,0.016128,-0.088675,0.093983,-0.178635,0.011020,-0.070727,...,-0.137437,0.000491,0.095065,-0.012209,-0.176238,-0.080777,-0.105282,-0.187900,-0.072092,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84995,-0.150924,-0.038800,0.121220,-0.064548,0.079419,-0.093290,0.068619,-0.164228,-0.042417,-0.022502,...,0.025307,-0.020969,0.067957,-0.037829,-0.259302,-0.084166,-0.045353,-0.170909,-0.087426,1
84996,-0.106713,-0.055724,0.107255,-0.058044,0.025893,-0.121099,0.060768,-0.122340,-0.004241,-0.060150,...,-0.054211,0.055740,0.093475,-0.059595,-0.205103,-0.044279,-0.103343,-0.192009,-0.081781,1
84997,-0.136717,0.028664,0.114541,-0.098674,0.067010,-0.102722,0.023048,-0.141780,-0.035653,-0.068010,...,-0.035877,-0.030915,0.112982,-0.060992,-0.183590,-0.083636,0.007791,-0.193032,-0.072074,1
84998,-0.096986,0.032258,0.093093,-0.066559,0.066235,-0.098856,0.037679,-0.115086,-0.062757,-0.078121,...,-0.079448,0.002127,0.111139,-0.063026,-0.132691,-0.121015,-0.044246,-0.204784,-0.110402,1


In [23]:
scaler = StandardScaler()
df[['Time','Amount']] = scaler.fit_transform(df[['Time','Amount']])

In [24]:
column_names = df.columns
only_fraud_df.columns = column_names

In [25]:
df_final = pd.concat([df,only_fraud_df])

In [26]:
df_final = df_final.sample(frac=1, random_state=42)

In [27]:
df_final = df_final.reset_index(drop=True)

In [28]:
df_final

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.124115,-1.599393,-0.149914,2.109682,0.038543,0.595675,0.644721,0.317964,-0.369205,1.019132,...,-0.249379,0.298205,0.246104,-0.714229,-0.171008,0.274035,-0.646160,-0.378601,-0.306242,0
1,-0.140221,-0.077036,0.114725,-0.068618,0.045462,-0.106829,0.094702,-0.111777,-0.072793,-0.062311,...,-0.019231,-0.002653,0.107842,-0.052991,-0.230395,-0.069105,-0.124035,-0.172561,-0.072274,1
2,-0.947813,-3.226223,-3.263305,-1.845165,0.003520,-4.396491,1.738821,5.760154,-0.029454,-0.901879,...,0.983785,0.114422,3.552622,-0.124398,0.141898,0.793311,-0.359761,0.257221,5.835037,0
3,0.521871,1.874811,-0.451824,-1.223864,0.202895,0.597169,1.184379,-0.439232,0.323480,0.519073,...,0.433782,1.543418,-0.054433,-1.532265,-0.004131,0.728989,0.002979,-0.084599,-0.256801,0
4,-1.136310,-1.142740,1.468732,0.241103,0.625281,0.338581,0.776647,-0.039810,0.842324,-0.265041,...,0.089044,0.369480,-0.235014,-1.353952,0.013564,-0.087556,0.501524,0.251930,-0.319940,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368721,1.360712,0.149260,0.988698,-0.605170,-0.788264,1.238822,-0.203064,0.861209,0.067349,-0.279204,...,-0.326673,-0.816542,0.008957,-0.010456,-0.377837,0.128193,0.219217,0.068558,-0.349773,0
368722,-0.144409,-0.056656,0.148645,-0.104169,0.046285,-0.127849,-0.003682,-0.101806,-0.062445,-0.060322,...,0.030523,-0.052311,0.109520,-0.045674,-0.214116,-0.088019,-0.053956,-0.185627,-0.087897,1
368723,-0.312295,1.259310,-0.049484,-0.721776,0.071903,1.864771,3.635628,-0.821682,0.929256,0.128685,...,0.070189,0.104264,-0.107993,1.004823,0.674600,-0.275199,0.045933,0.025074,-0.335356,0
368724,-0.132160,1.982903,-0.134427,-1.161183,0.472515,-0.043755,-1.080473,0.306983,-0.373167,0.454594,...,-0.212230,-0.472417,0.248672,0.025584,-0.195097,0.272711,-0.071660,-0.055715,-0.195658,0


### Exported Generated Dataset

In [29]:
df_final.to_csv('D:/Uni Docs/DSC4996/Dynamic_fraud_detection_system/Data/generated_df.csv', index=False)