In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torchvision.utils import save_image

import numpy as np
import pandas as pd 
import datetime

import os, sys

from matplotlib.pyplot import imshow, imsave
%matplotlib inline

In [2]:
MODEL_NAME = 'ConditionalGAN'
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv("transaction_dataset.csv")

#Rename columns for easier access
df.columns = df.columns.str.strip().str.replace(' ','_').str.lower()

#Remove weird stuff 
df.drop(columns=['unnamed:_0'], inplace=True)

#Remove duplicate accounts
df.drop_duplicates(subset=['address'], inplace=True)

#Remove accounts 
df.drop(columns=['address'], inplace=True)

#Remove index
df.drop(columns=['index'], inplace=True)

#Remove token names 
df.drop(columns=['erc20_most_sent_token_type','erc20_most_rec_token_type'], inplace = True)

#Remove var=0 columns
df.drop(df.var(numeric_only=True)[df.var(numeric_only=True) == 0].index, axis = 1, inplace = True)

#Remove small distribution columns
small_distr_col = []
for col in df.columns[3:] :
    if len(df[col].value_counts()) < 10:
        small_distr_col.append(col)
df.drop(columns=small_distr_col,inplace = True)


#Replace nan values by median 
df.fillna(df.median(numeric_only=True), inplace=True)
# Remove negative values 
df[df<0] = None 
df.dropna(inplace=True)

df_n = df.copy()

#Normalization 
for col in df_n.columns[1:]:
    df_n[col] = (df_n[col]-df_n[col].mean())/df_n[col].std()
means = [df[col].mean() for col in df.columns[1:]]
stds = [df[col].std() for col in df.columns[1:]]

for i in range(len(df.values)):
    if(not np.allclose(df_n.values[i][1:] * stds + means, df.values[i][1:])):
        print("ay caramba")

class Accounts(torch.utils.data.Dataset):
    def __init__(self,df):

        x=df.iloc[:,1:].values
        y=df.iloc[:,0].values

        self.x_train=torch.tensor(x,dtype=torch.float32)
        self.y_train=torch.tensor(y,dtype=torch.long)

    def __len__(self):
        return len(self.y_train)
    
    def __getitem__(self,idx):
        return self.x_train[idx],self.y_train[idx]

batch_size = 64
features_size = 33 
cond_size = 2 # Fraud, No Fraud
n_noise = 50
data_loader = DataLoader(dataset=Accounts(df_n), batch_size=batch_size, shuffle=True, drop_last=True)
sample_data = None 
for i, (data, labels) in enumerate(data_loader):
    print(data.shape, labels.shape)
    sample_data = data[0], labels[0]
    break;

torch.Size([64, 33]) torch.Size([64])


In [4]:
def to_onehot(x, num_classes=cond_size):
    assert isinstance(x, int) or isinstance(x, (torch.LongTensor, torch.cuda.LongTensor))
    if isinstance(x, int):
        c = torch.zeros(1, num_classes).long()
        c[0][x] = 1
    else:
        x = x.cpu()
        c = torch.LongTensor(x.size(0), num_classes)
        c.zero_()
        c.scatter_(1, x, 1) # dim, index, src value
    return c


In [5]:
class Discriminator(nn.Module):
    """
        Simple Discriminator w/ MLP
    """
    def __init__(self, input_size=features_size, condition_size=cond_size, num_classes=1):
        super(Discriminator, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(input_size+condition_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes),
            nn.Sigmoid(),
        )
    
    def forward(self, x, c):        
        x, c = x.view(x.size(0), -1), c.view(c.size(0), -1).float()
        v = torch.cat((x, c), 1) # v: [input, label] concatenated vector
        y_ = self.layer(v)
        return y_


In [6]:


class Generator(nn.Module):
    """
        Simple Generator w/ MLP
    """
    def __init__(self, input_size=n_noise, condition_size=cond_size, num_classes=features_size):
        super(Generator, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(input_size+condition_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
            nn.Tanh()
        )
        
    def forward(self, x, c):
        x, c = x.view(x.size(0), -1), c.view(c.size(0), -1).float()
        v = torch.cat((x, c), 1) # v: [input, label] concatenated vector
        y_ = self.layer(v)
        y_ = y_.view(x.size(0), 1, 33)
        return y_

In [7]:
D = Discriminator().to(DEVICE)
G = Generator().to(DEVICE)

In [8]:
criterion = nn.BCELoss()
D_opt = torch.optim.Adam(D.parameters(), lr=0.0001)
G_opt = torch.optim.Adam(G.parameters(), lr=0.0001)


In [9]:
max_epoch = 1000 # need more than 100 epochs for training generator
step = 0
n_critic = 1 # for training more k steps about Discriminator


D_labels = torch.ones([batch_size, 1]).to(DEVICE) # Discriminator Label to real
D_fakes = torch.zeros([batch_size, 1]).to(DEVICE) # Discriminator Label to fake

In [10]:
#for epoch in range(max_epoch):
    for idx, (address, labels) in enumerate(data_loader):
        # Training Discriminator
        x = address.to(DEVICE)
        y = labels.view(batch_size, 1)
        y = to_onehot(y).to(DEVICE)
        x_outputs = D(x, y)
        D_x_loss = criterion(x_outputs, D_labels)

        z = torch.randn(batch_size, n_noise).to(DEVICE)
        z_outputs = D(G(z, y), y)
        D_z_loss = criterion(z_outputs, D_fakes)
        D_loss = D_x_loss + D_z_loss
        
        D.zero_grad()
        D_loss.backward()
        D_opt.step()
        
        if step % n_critic == 0:
            # Training Generator
            z = torch.randn(batch_size, n_noise).to(DEVICE)
            z_outputs = D(G(z, y), y)
            G_loss = criterion(z_outputs, D_labels)

            G.zero_grad()
            G_loss.backward()
            G_opt.step()
        
        if step % 3000 == 0:
            print('Epoch: {}/{}, Step: {}, D Loss: {}, G Loss: {}'.format(epoch, max_epoch, step, D_loss.item(), G_loss.item()))
        
        step += 1

IndentationError: unexpected indent (576971929.py, line 2)

In [None]:
#torch.save(D.state_dict(), "./models/gan_discriminator")
#torch.save(G.state_dict(), "./models/gan_generator")

In [11]:
D = Discriminator()
G = Generator()
D.load_state_dict(torch.load("./models/gan_discriminator"))
G.load_state_dict(torch.load("./models/gan_generator"))

<All keys matched successfully>

In [15]:
G.eval()
z = torch.randn(batch_size, n_noise).to(DEVICE)
y = torch.ones(batch_size,dtype=torch.long)
y = to_onehot(y.view(batch_size, 1)).to(DEVICE)
acc = G(z, y)
acc.size()

torch.Size([64, 1, 33])

In [23]:
df.loc[df['flag'] ==0].shape[0] - df.loc[df['flag'] ==1].shape[0]

4778