In [1]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
import torchmetrics
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, auc, classification_report, ConfusionMatrixDisplay
%matplotlib inline

In [2]:
# --- Run configuration: these module-level values are the defaults of `train` ---

# Data splitting
validation_split = 0      # fraction of rows held out for validation; 0 leaves the validation set empty
shuffle_dataset  = True   # shuffle row indices before splitting

# Optimisation
batch_size = 64           # samples per training batch
max_epochs = 100          # upper bound on the number of training epochs

# Model
input_size = 33           # number of input features (presumably the feature-column count - confirm)

In [3]:
def load_transaction(path="transaction_dataset.csv"):
    """Load and clean the transaction dataset.

    Parameters
    ----------
    path : str or path-like, default "transaction_dataset.csv"
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        A new frame in which:

        * column names are stripped, lower-cased and use ``_`` instead of spaces;
        * the ``unnamed:_0``, ``address``, ``index`` and the two ERC20
          token-name columns are removed;
        * only the first row of each duplicated ``address`` is kept;
        * numeric columns with zero variance are removed;
        * columns from position 3 onwards with fewer than 10 distinct values
          are removed;
        * every row containing a negative or a missing value is removed.

        The original row labels are preserved (the index is not reset).

    Raises
    ------
    KeyError
        If one of the columns that is dropped by name is absent from the file.
    """
    df = pd.read_csv(path)

    # Rename columns for easier access
    df.columns = df.columns.str.strip().str.replace(' ', '_', regex=False).str.lower()

    # Remove the stray unnamed column written alongside the data
    df = df.drop(columns=['unnamed:_0'])

    # Remove duplicate accounts (first occurrence wins) before the address
    # column itself is discarded
    df = df.drop_duplicates(subset=['address'])

    # Remove identifier columns and the token-name columns
    df = df.drop(columns=['address',
                          'index',
                          'erc20_most_sent_token_type',
                          'erc20_most_rec_token_type'])

    # Remove var=0 columns (variance is computed once and reused)
    variances = df.var(numeric_only=True)
    df = df.drop(columns=variances[variances == 0].index)

    # Remove small distribution columns. The first three columns are never
    # considered - presumably so the low-cardinality label column survives;
    # TODO confirm why the offset is 3 rather than 1.
    small_distr_col = [col for col in df.columns[3:] if df[col].nunique() < 10]
    df = df.drop(columns=small_distr_col)

    # Remove rows with negative values: negatives become NaN, then every row
    # holding a NaN (including values that were already missing) is dropped.
    df = df.mask(df < 0).dropna()

    return df


def train_test_split_indices(length, validation_split, shuffle_dataset = True, random_seed = 42):
    """Split the positions ``0 .. length-1`` into training and validation sets.

    Parameters
    ----------
    length : int
        Number of rows to split.
    validation_split : float
        Fraction of rows assigned to validation; the count is
        ``int(validation_split * length)``.
    shuffle_dataset : bool
        When true, the positions are shuffled before being split.
    random_seed : int
        Seed applied to NumPy's *global* random state before shuffling, so
        the global generator is re-seeded as a side effect.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train_indices, val_indices)``; validation takes the leading
        positions of the (possibly shuffled) order, training the remainder.
    """
    n_val = int(validation_split * length)
    order = np.arange(length)

    if shuffle_dataset:
        # Seeds the process-wide NumPy generator, not a private one.
        np.random.seed(random_seed)
        np.random.shuffle(order)

    return order[n_val:], order[:n_val]

class MyDataset(Dataset):
    """Standardised features and labels for a subset of the rows of ``df``.

    Column 0 of ``df`` is used as the label and all remaining columns as
    features. Features are NaN-filled with per-column medians and then
    standardised with per-column means and standard deviations.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified (a copy of the selected rows is used).
    indices : array-like of int
        Positional row indices (``iloc``) selecting the rows of this dataset.
    augment : bool, default False
        When true, the selected rows are oversampled with ``SMOTE`` before the
        statistics are computed and applied.
    cols_median, cols_means, cols_std : optional
        Per-column statistics to reuse (e.g. those of the training set). If
        *any* of the three is ``None``, all three are recomputed from this
        dataset's own features and the supplied ones are ignored.

    Attributes
    ----------
    X : numpy.ndarray
        Standardised feature matrix.
    y : array-like
        Labels, one per row of ``X``.
    """
    def __init__(self, df, indices, augment = False,
                 cols_median = None, cols_means = None, cols_std = None):
        super().__init__()
        self.augment = augment

        df = df.iloc[indices].copy()

        if self.augment:
            # Oversample on the raw (unscaled) features; labels come from column 0.
            oversample = SMOTE()
            df, y = oversample.fit_resample(df.iloc[:, 1:], df.values[:, 0])
        else:
            y, df = df.values[:, 0], df.iloc[:, 1:]

        if any(param is None for param in [cols_median, cols_means, cols_std]):
            self.cols_median = df.median(numeric_only=True)
            self.cols_means  = df.mean  (numeric_only=True)
            self.cols_std    = df.std   (numeric_only=True)
        else:
            self.cols_median = cols_median
            self.cols_means  = cols_means
            self.cols_std    = cols_std

        df = df.fillna(self.cols_median)

        # A constant column has std == 0 and would turn into NaN/inf when
        # divided. Adding the boolean mask bumps exactly those zeros to 1, so
        # such columns standardise to 0. The stored ``cols_std`` is untouched.
        safe_std = self.cols_std + (self.cols_std == 0)
        df = (df - self.cols_means) / safe_std

        self.y = y
        self.X = df.values

    def get_cols_stats(self):
        """Return ``(cols_median, cols_means, cols_std)`` used by this dataset."""
        return self.cols_median, self.cols_means, self.cols_std

    def __len__(self):
        """Number of samples (after oversampling, if it was applied)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]
class MyDataModule(pl.LightningDataModule):
    """Lightning data module that builds train/validation ``MyDataset`` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame whose first column is the label.
    batch_size : int
        Batch size of the training ``DataLoader``.
    train_indices, val_indices : array-like of int
        Positional row indices for the training and validation subsets.
    augment : bool
        Whether the *training* set is oversampled (forwarded to ``MyDataset``).
        The validation set is never augmented.

    Notes
    -----
    This class defines only ``train_dataloader``; ``val_set`` is built in
    ``setup`` but no ``val_dataloader`` is defined here to serve it.
    """
    def __init__(self, df, batch_size, train_indices, val_indices,augment):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.train_indices = train_indices
        self.val_indices = val_indices
        self.augment = augment

    def setup(self, stage = None):
        """Create ``train_set`` and ``val_set``; ``stage`` is not used."""
        # Honour the caller's choice instead of always oversampling.
        self.train_set = MyDataset(self.df, self.train_indices, augment = self.augment)

        # Validation data is scaled with the training statistics so that no
        # information from the validation rows enters the preprocessing.
        train_cols_median, train_cols_means, train_cols_std = self.train_set.get_cols_stats()

        self.val_set = MyDataset(self.df, self.val_indices,
                                 augment = False,
                                 cols_median = train_cols_median,
                                 cols_means  = train_cols_means,
                                 cols_std    = train_cols_std)

    def train_dataloader(self):
        """Return a shuffling ``DataLoader`` over ``train_set``."""
        return DataLoader(self.train_set,
                          batch_size = self.batch_size,
                          shuffle = True,
                          num_workers = 8,
                          pin_memory = True)


In [4]:
def train(model,file_name,augment=False,path="./models",validation_split=validation_split,
          shuffle_dataset=shuffle_dataset,batch_size=batch_size,
         max_epochs=max_epochs,input_size=input_size):
    """Load the transaction data and fit ``model`` with a Lightning ``Trainer``.

    Parameters
    ----------
    model : pl.LightningModule
        Model to fit.
    file_name : str
        Checkpoint file name handed to ``ModelCheckpoint``.
    augment : bool, default False
        Forwarded to ``MyDataModule`` to control training-set augmentation.
    path : str, default "./models"
        Directory in which checkpoints are written.
    validation_split : float
        Fraction of rows reserved for validation.
    shuffle_dataset : bool
        Whether row indices are shuffled before splitting.
    batch_size : int
        Training batch size.
    max_epochs : int
        Maximum number of epochs.
    input_size : int
        Accepted for interface compatibility; not used inside this function.

    Returns
    -------
    (pl.Trainer, pl.LightningModule)
        The trainer after fitting and the same ``model`` object.
    """
    df = load_transaction()

    # Forward `shuffle_dataset` so the argument actually takes effect.
    train_indices, val_indices = train_test_split_indices(length = len(df),
                                                          validation_split = validation_split,
                                                          shuffle_dataset = shuffle_dataset)

    data = MyDataModule(df,
                        train_indices = train_indices,
                        val_indices   = val_indices,
                        batch_size    = batch_size,
                        augment       = augment)

    # NOTE(review): the checkpoint monitors "val_acc"; the GAN defined in this
    # notebook logs only g_loss / d_loss, so confirm this metric is available
    # for the model being trained.
    checkpoint_callback = ModelCheckpoint(
                            monitor  = "val_acc",
                            mode     = 'max',
                            dirpath  = path,
                            filename = file_name)

    trainer = pl.Trainer(log_every_n_steps       = 10,
                         accelerator             = 'cpu',
                         check_val_every_n_epoch = 1,
                         enable_checkpointing    = True,
                         max_epochs              = max_epochs,
                         precision               = 64,
                         callbacks               = [checkpoint_callback],
                         num_sanity_val_steps    = 0,
                         fast_dev_run            = False)

    trainer.fit(model, data)
    return trainer, model

In [7]:
class Generator(nn.Module):
    """Fully connected generator mapping latent vectors to synthetic rows.

    Architecture: ``latent_dim -> 128 -> 64 -> out_dim``. Each hidden layer is
    followed by ReLU and dropout (p=0.2); the output passes through ``tanh``,
    so every generated value lies in ``[-1, 1]``.
    """

    def __init__(self, latent_dim, out_dim):
        super().__init__()
        self.out_dim = out_dim

        def hidden(n_in, n_out):
            # One hidden stage: affine map, non-linearity, regularisation.
            return [nn.Linear(n_in, n_out), nn.ReLU(), nn.Dropout(0.2)]

        stack = hidden(latent_dim, 128) + hidden(128, 64)
        stack += [nn.Linear(64, out_dim), nn.Tanh()]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten ``x`` to ``(batch, -1)`` and run it through the network."""
        batch = x.size(0)
        flat = x.view(batch, -1)
        return self.layers(flat)
class Discriminator(nn.Module):
    """Fully connected binary classifier scoring rows as real (1) or fake (0).

    Architecture: ``input_size -> 128 -> 64 -> 32 -> 1``. Each hidden layer is
    followed by ReLU and dropout (p=0.2); the output passes through a sigmoid,
    so every score lies in ``[0, 1]``.
    """
    def __init__(self, input_size):
        super().__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return one score per sample as a 1-D tensor of length ``batch``.

        ``x`` is flattened to ``(batch, -1)`` before entering the network.
        """
        return self.layers(x.view(x.size(0), -1)).flatten()
class GAN(pl.LightningModule):
    """Generator/discriminator pair trained adversarially with two optimizers.

    Parameters
    ----------
    lr : float
        Learning rate shared by both Adam optimizers.
    latent_dim : int
        Size of the noise vector fed to the generator.
    input_size : int
        Number of features per sample (generator output / discriminator input).
    """

    def __init__(
        self,
        lr: float  = 0.0001,
        latent_dim = 50,
        input_size = 33
    ):
        super().__init__()
        self.lr         = lr
        self.latent_dim = latent_dim
        self.input_size = input_size

        # Generator is registered before the discriminator.
        self.generator = Generator(latent_dim, input_size)
        self.discriminator = Discriminator(input_size)

    def forward(self, z):
        """Generate samples from the noise batch ``z``."""
        return self.generator(z)

    def adversarial_loss(self, y_hat, y):
        """Binary cross-entropy between predicted scores and target labels."""
        return F.binary_cross_entropy(y_hat, y)

    def training_step(self, batch, batch_idx, optimizer_idx):
        """Run one optimisation step for the optimizer selected by ``optimizer_idx``.

        ``optimizer_idx == 0`` updates the generator and logs ``g_loss``;
        ``optimizer_idx == 1`` updates the discriminator and logs ``d_loss``.
        Any other value yields ``None``. The labels in ``batch`` are ignored.
        """
        real_rows, _ = batch
        n_rows = real_rows.size(0)

        # Noise is drawn on every call, then cast to the batch's tensor type.
        noise = torch.randn(real_rows.shape[0], self.latent_dim).type_as(real_rows)

        def target(make):
            # Column of constant labels with the same tensor type as the batch.
            return make(n_rows, 1).type_as(real_rows)

        def score(samples):
            # Discriminator output reshaped to a (batch, 1) column.
            return self.discriminator(samples).view(n_rows, 1)

        if optimizer_idx == 0:
            # Generator step: generated rows are labelled "real" (all ones),
            # so the loss falls as the discriminator is fooled.
            real_label = target(torch.ones)
            g_loss = self.adversarial_loss(score(self(noise)), real_label)
            self.log("g_loss", g_loss, prog_bar=True, on_step = False, on_epoch = True)
            return g_loss

        if optimizer_idx == 1:
            # Discriminator step, part 1: real rows should score as real.
            real_label = target(torch.ones)
            real_loss = self.adversarial_loss(score(real_rows), real_label)

            # Part 2: generated rows should score as fake. `detach` keeps the
            # gradient from flowing back into the generator.
            fake_label = target(torch.zeros)
            fake_loss = self.adversarial_loss(score(self(noise).detach()), fake_label)

            # The discriminator loss is the mean of the two parts.
            d_loss = (real_loss + fake_loss) / 2
            self.log("d_loss", d_loss, prog_bar=True, on_step = False, on_epoch = True)
            return d_loss

    def configure_optimizers(self):
        """Return ``[generator_optimizer, discriminator_optimizer]`` and no schedulers."""
        optimizers = [
            torch.optim.Adam(self.generator.parameters(), lr=self.lr),
            torch.optim.Adam(self.discriminator.parameters(), lr=self.lr),
        ]
        return optimizers, []


In [8]:
# `train` requires a checkpoint file name as its second argument; without it
# the call raises TypeError on a fresh kernel.
# Note: `train` returns a (trainer, model) tuple, so `gan` is that pair.
gan = train(GAN(), file_name="gan", augment=False)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type          | Params
------------------------------------------------
0 | generator     | Generator     | 16.9 K
1 | discriminator | Discriminator | 14.7 K
------------------------------------------------
31.7 K    Trainable params
0         Non-trainable params
31.7 K    Total params
0.253     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.
