In [1]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data import and cleaning

In [2]:
df = pd.read_csv("transaction_dataset.csv")

#Rename columns for easier access
df.columns = df.columns.str.strip().str.replace(' ','_').str.lower()

#Remove weird stuff 
df.drop(columns=['unnamed:_0'], inplace=True)

#Remove duplicate accounts
df.drop_duplicates(subset=['address'], inplace=True)

#Remove accounts 
df.drop(columns=['address'], inplace=True)

#Remove index
df.drop(columns=['index'], inplace=True)

#Remove token names 
df.drop(columns=['erc20_most_sent_token_type','erc20_most_rec_token_type'], inplace = True)

#Remove var=0 columns
df.drop(df.var(numeric_only=True)[df.var(numeric_only=True) == 0].index, axis = 1, inplace = True)

#Remove small distribution columns
small_distr_col = []
for col in df.columns[3:] :
    if len(df[col].value_counts()) < 10:
        small_distr_col.append(col)
df.drop(columns=small_distr_col,inplace = True)


#Replace nan values by median 
df.fillna(df.median(numeric_only=True), inplace=True)
# Remove negative values 
df[df<0] = None 
df.dropna(inplace=True)

df_n = df.copy()

#Normalization 
for col in df_n.columns:
    df_n[col] = (df_n[col]-df_n[col].mean())/df_n[col].std()


In [3]:
class Accounts(torch.utils.data.Dataset):
    def __init__(self,df):

        x=df.iloc[:,1:].values
        y=df.iloc[:,0].values

        self.x_train=torch.tensor(x,dtype=torch.float32)
        self.y_train=torch.tensor(y,dtype=torch.float32)

    def __len__(self):
        return len(self.y_train)
    
    def __getitem__(self,idx):
        return self.x_train[idx],self.y_train[idx]

full_data = Accounts(df_n)
train_size = int(0.8 * len(full_data))
test_size = len(full_data) - train_size
train_data, test_data = torch.utils.data.random_split(full_data, 
                        [train_size, test_size],torch.Generator().manual_seed(42))

# Pytorch Autoencoder


In [4]:
INPUT_DIM = 33          # size of each input
HIDDEN_DIM = 30         # hidden dimension
LATENT_DIM = 10         # latent vector dimension
N_CLASSES = 1           # number of classes in the data
lr = 0.001              # learning rate


In [5]:
class Encoder(nn.Module):
    ''' This the encoder part of VAE

    '''
    def __init__(self, input_dim, hidden_dim, latent_dim, n_classes):
        '''
        Args:
            input_dim: A integer indicating the size of input (in case of MNIST 28 * 28).
            hidden_dim: A integer indicating the size of hidden dimension.
            latent_dim: A integer indicating the latent size.
            n_classes: A integer indicating the number of classes. (dimension of one-hot representation of labels)
        '''
        super().__init__()

        self.linear = nn.Linear(input_dim + n_classes, input_dim)
        self.linear1 = nn.Linear(input_dim , hidden_dim)
        self.latent = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x):
        # x is of shape [batch_size, input_dim + n_classes]

        x = torch.tanh(self.linear(x))
        # hidden is of shape [batch_size, hidden_dim]
        x = torch.tanh(self.linear1(x))
        # latent parameters
        encode = self.latent(x)

        return encode

class Decoder(nn.Module):
    ''' This the decoder part of VAE

    '''
    def __init__(self, latent_dim, hidden_dim, output_dim, n_classes):
        '''
        Args:
            latent_dim: A integer indicating the latent size.
            hidden_dim: A integer indicating the size of hidden dimension.
            output_dim: A integer indicating the size of output (in case of MNIST 28 * 28).
            n_classes: A integer indicating the number of classes. (dimension of one-hot representation of labels)
        '''
        super().__init__()

        self.latent_to_hidden = nn.Linear(latent_dim + n_classes, hidden_dim)
        self.hidden_to_hidden = nn.Linear(hidden_dim, hidden_dim)
        self.hidden_to_out = nn.Linear(hidden_dim, output_dim)
        

    def forward(self, x):
        # x is of shape [batch_size, latent_dim + num_classes]
        x = torch.tanh(self.latent_to_hidden(x))
        # x is of shape [batch_size, hidden_dim]
        x = torch.tanh(self.hidden_to_hidden(x))
        generated_x = self.hidden_to_out(x)
        # x is of shape [batch_size, output_dim]

        return generated_x

class CVAE(nn.Module):
    ''' This the VAE, which takes a encoder and decoder.

    '''
    def __init__(self, input_dim, hidden_dim, latent_dim, n_classes):
        '''
        Args:
            input_dim: A integer indicating the size of input (in case of MNIST 28 * 28).
            hidden_dim: A integer indicating the size of hidden dimension.
            latent_dim: A integer indicating the latent size.
            n_classes: A integer indicating the number of classes. (dimension of one-hot representation of labels)
        '''
        super().__init__()

        self.encoder = Encoder(input_dim, hidden_dim, latent_dim, n_classes)
        self.decoder = Decoder(latent_dim, hidden_dim, input_dim, n_classes)

    def forward(self, x, C1):

        x = torch.cat((x, C1), dim=1)

        # encode
        encoded = self.encoder(x)

        z = torch.cat((encoded, C1), dim=1)


        # decode
        generated_x = self.decoder(z)

        return generated_x

In [6]:
model = CVAE(INPUT_DIM, HIDDEN_DIM, LATENT_DIM, N_CLASSES)

#optimizer
optimizer = optim.Adam(model.parameters(), lr=lr)

In [7]:
def calculate_loss(x, reconstructed_x):
    # reconstruction loss
    mse = nn.MSELoss()
    RCL = mse(reconstructed_x, x)

    return RCL 
def train():
    # set the train mode
    model.train()

    # loss of the epoch
    train_loss = 0

    for i, (x, y) in enumerate(train_data):
        # reshape the data into [batch_size, 33]
        x = x.view(-1, INPUT_DIM)
        x = x.to(device)

        # reshape the label
        y = y.view(-1, 1)
        y = y.to(device)

        # update the gradients to zero
        optimizer.zero_grad()

        # forward pass
        reconstructed_x= model(x, y)

        # loss
        loss = calculate_loss(x, reconstructed_x)

        # backward pass
        loss.backward()
        train_loss += loss.item()

        # update the weights
        optimizer.step()

    return train_loss
def test():
    # set the evaluation mode
    model.eval()

    # test loss for the data
    test_loss = 0

    # we don't need to track the gradients, since we are not updating the parameters during evaluation / testing
    with torch.no_grad():
        for i, (x, y) in enumerate(test_data):
            # reshape the data
            x = x.view(-1, INPUT_DIM)
            x = x.to(device)

            # reshape the label
            y = y.view(-1, 1)
            y = y.to(device)

            # forward pass
            reconstructed_x= model(x, y)

            # loss
            loss = calculate_loss(x, reconstructed_x)
            test_loss += loss.item()

    return test_loss

In [8]:
patience_counter = 0
for e in range(100):

    train_loss = train()
    test_loss = test()

    train_loss /= len(train_data)
    test_loss /= len(test_data)

    print(f'Epoch {e}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')
    if e==0 :
        best_test_loss = test_loss +1

    if best_test_loss > test_loss:
        best_test_loss = test_loss
        patience_counter = 1
    else:
        patience_counter += 1

    if patience_counter > 3:
        break

Epoch 0, Train Loss: 0.8446, Test Loss: 0.6129
Epoch 1, Train Loss: 0.7701, Test Loss: 0.5640
Epoch 2, Train Loss: 0.7364, Test Loss: 0.5209
Epoch 3, Train Loss: 0.7099, Test Loss: 0.4886
Epoch 4, Train Loss: 0.6833, Test Loss: 0.4867
Epoch 5, Train Loss: 0.6680, Test Loss: 0.4486
Epoch 6, Train Loss: 0.6461, Test Loss: 0.4379
Epoch 7, Train Loss: 0.6339, Test Loss: 0.4323
Epoch 8, Train Loss: 0.6263, Test Loss: 0.4585
Epoch 9, Train Loss: 0.6111, Test Loss: 0.4233
Epoch 10, Train Loss: 0.6145, Test Loss: 0.4230
Epoch 11, Train Loss: 0.5967, Test Loss: 0.4037
Epoch 12, Train Loss: 0.5904, Test Loss: 0.3976
Epoch 13, Train Loss: 0.5779, Test Loss: 0.3964
Epoch 14, Train Loss: 0.5703, Test Loss: 0.3851
Epoch 15, Train Loss: 0.5602, Test Loss: 0.3847
Epoch 16, Train Loss: 0.5538, Test Loss: 0.3859
Epoch 17, Train Loss: 0.5444, Test Loss: 0.3797
Epoch 18, Train Loss: 0.5396, Test Loss: 0.3806
Epoch 19, Train Loss: 0.5422, Test Loss: 0.3631
Epoch 20, Train Loss: 0.5308, Test Loss: 0.3557
Ep

# Testing the autoencoder

In [9]:
model.eval()
for i, (x, y) in enumerate(test_data):
    x = x.view(-1, INPUT_DIM)
    x = x.to(device)

    # reshape the label
    y = y.view(-1, 1)
    y = y.to(device)

    # forward pass
    x_r= model(x, y)
    x_r = x_r.detach().numpy()[0]
    print(x_r)
    print(x.numpy())
    break;

[ 0.09454811 -0.16814876  3.2567217   7.2087145   3.5418043   0.5459452
  0.13669589  1.0874665   0.9293145   0.06411216  0.94818056 -0.2539101
 -0.07918754 -0.40806633  6.3791194  -0.11438954 -0.12140414  0.1091944
  0.12534976  0.51199734  0.28003109 -0.17113006  0.30476052  0.13019621
  0.75335777  0.9649465   0.45672736  0.28642964  0.2719013   0.2505895
  0.27604955 -0.03742063  0.8336555 ]
[[-2.1663022e-01 -3.1607282e-01  5.1154518e+00  5.3325658e+00
   3.5575812e+00 -2.4795942e-02 -4.7033362e-02 -6.6713274e-02
  -1.4117777e-01 -3.4495737e-02 -1.8146957e-01 -3.1942267e-02
  -3.6905438e-02 -1.8245962e-01  5.3095746e+00  4.0037413e-03
  -8.0520296e-03 -2.3171857e-02 -4.1231211e-02 -4.5108795e-02
  -1.1236985e-02 -1.8277075e-02 -4.0498100e-02  6.5580852e-02
   5.0790775e-01 -2.5317864e-02 -4.0298436e-02 -2.0982658e-02
  -1.1128101e-02 -1.0868262e-02 -1.0761774e-02 -1.2688458e-01
   5.3006876e-01]]
