## Deep learning approach with price volatility and Gas fees

In [1]:
#Deep learning library of choice PyTorch

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import train_test_split

# for number-crunching

import numpy as np
import scipy.stats as stats
import pandas as pd

# Time to check that the gpu optimization is actually helping

import time 

# Some graphing

import matplotlib.pyplot as plt

In [2]:
# use GPU

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Count the number of CPU cores available, this returns the number of threads, because most cpus have a thread count equal to twice their core count I have halved the count when multithreading
# Set this number to one if you dont want to multi thread the process

cpuCount = os.cpu_count()
print(cpuCount)

cuda:0
24


In [3]:
#Pull the data from the CSV

#This value will allow you to change how many periods back the data will consider. for example if you select 1 it will only look at the previous period, Maximum value is currently set to 8.
#if you want to try over more data periods a greater number of datalet length will need to be prepped in the data prep folders. By default a maximum of 8

dataperiods = 8

df = pd.read_csv('../dataletswgv/datalet{}.csv'.format(dataperiods+1),index_col=False)


In [4]:
display(df)

Unnamed: 0,Mev_period0,Mev_period1,Mev_period2,Mev_period3,Mev_period4,Mev_period5,Mev_period6,Mev_period7,Mev_period8,gas_fees_period0,...,gas_fees_period8,price_volitility_period0,price_volitility_period1,price_volitility_period2,price_volitility_period3,price_volitility_period4,price_volitility_period5,price_volitility_period6,price_volitility_period7,price_volitility_period8
0,0.0,0.0,2.0,3.0,0.0,1.0,0.0,2.0,2.0,0.198154,...,0.207764,0.946974,0.960688,0.600441,0.977247,0.044735,0.165926,0.219165,0.114442,0.038319
1,0.0,2.0,3.0,0.0,1.0,0.0,2.0,2.0,3.0,0.683639,...,0.174731,0.960688,0.600441,0.977247,0.044735,0.165926,0.219165,0.114442,0.038319,0.644419
2,2.0,3.0,0.0,1.0,0.0,2.0,2.0,3.0,2.0,0.770326,...,0.701296,0.600441,0.977247,0.044735,0.165926,0.219165,0.114442,0.038319,0.644419,0.231879
3,3.0,0.0,1.0,0.0,2.0,2.0,3.0,2.0,2.0,0.277586,...,0.507868,0.977247,0.044735,0.165926,0.219165,0.114442,0.038319,0.644419,0.231879,0.273197
4,0.0,1.0,0.0,2.0,2.0,3.0,2.0,2.0,3.0,0.636498,...,0.666308,0.044735,0.165926,0.219165,0.114442,0.038319,0.644419,0.231879,0.273197,0.410002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9986,0.0,3.0,2.0,1.0,2.0,1.0,0.0,1.0,1.0,0.364444,...,0.062605,0.948023,0.580629,0.583344,0.139111,0.338002,0.788882,0.632953,0.093129,0.942736
9987,3.0,2.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,0.931640,...,0.023214,0.580629,0.583344,0.139111,0.338002,0.788882,0.632953,0.093129,0.942736,0.686403
9988,2.0,1.0,2.0,1.0,0.0,1.0,1.0,0.0,3.0,0.217472,...,0.710180,0.583344,0.139111,0.338002,0.788882,0.632953,0.093129,0.942736,0.686403,0.716951
9989,1.0,2.0,1.0,0.0,1.0,1.0,0.0,3.0,0.0,0.530968,...,0.539771,0.139111,0.338002,0.788882,0.632953,0.093129,0.942736,0.686403,0.716951,0.864340


In [5]:
dataheaders = list(df.columns)
dataheaders.remove('Mev_period{}'.format(int((df.shape[1]/3-1))))
dataheaders.remove('gas_fees_period{}'.format(int((df.shape[1]/3-1))))
dataheaders.remove('price_volitility_period{}'.format(int((df.shape[1]/3-1))))

data = torch.Tensor(df[dataheaders].values).type(torch.float)
labels = torch.Tensor(df['Mev_period{}'.format(int((df.shape[1]/3)-1))].values).type(torch.LongTensor)

In [6]:
print(data)
print(labels)

tensor([[0.0000, 0.0000, 2.0000,  ..., 0.1659, 0.2192, 0.1144],
        [0.0000, 2.0000, 3.0000,  ..., 0.2192, 0.1144, 0.0383],
        [2.0000, 3.0000, 0.0000,  ..., 0.1144, 0.0383, 0.6444],
        ...,
        [2.0000, 1.0000, 2.0000,  ..., 0.0931, 0.9427, 0.6864],
        [1.0000, 2.0000, 1.0000,  ..., 0.9427, 0.6864, 0.7170],
        [2.0000, 1.0000, 0.0000,  ..., 0.6864, 0.7170, 0.8643]])
tensor([2, 3, 2,  ..., 3, 0, 1])


In [7]:
# First we are going to split the data into three parts, evaluation, test and training data. I do this with two instances of train_test_split, just for the sake of convinience.

traintemp_data,eval_data, traintemp_labels,eval_labels = train_test_split(data, labels, test_size=.01)

train_data,test_data, train_labels,test_labels = train_test_split(traintemp_data, traintemp_labels, test_size=.011)

#then we are going to pass the data to the Pytorch data loader, this is going to allow us to split it into mini batches that will be run through the model.

train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

train_data = train_data
test_data = test_data

#Best to keep batches to powers of two for speed reasons adjust as needed for your own memory constraints 
x = 11
batches   = 2**x
train_loader = DataLoader(train_data,batch_size=batches,shuffle=True,drop_last=True, num_workers=(int(cpuCount/2)))
test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0]) 

In [8]:
# create a class for the model

class ANN(nn.Module):
	def __init__(self, Input_dim, Output_dim):
		super().__init__()

		### input layer
		self.input = nn.Linear(Input_dim,64)
		
		### hidden layers
		self.bnorm1 	= nn.BatchNorm1d(64) 
		self.hidden1    = nn.Linear(64,256)
		self.bnorm2 	= nn.BatchNorm1d(256) 
		self.hidden2    = nn.Linear(256,256)
		self.bnorm3 	= nn.BatchNorm1d(256)
		self.hidden3    = nn.Linear(256,256)
		self.bnorm4 	= nn.BatchNorm1d(256)
		self.hidden4    = nn.Linear(256,256)

		### output layer
		self.output = nn.Linear(256,Output_dim)
	
	# forward pass
	def forward(self,x):

		# input (x starts off normalized)
		x = F.relu( self.input(x) )


		# hidden layer 1
		x = self.bnorm1(x) # batchnorm
		x = F.relu( self.hidden1(x) )      # linear function and activation function

		# hidden layer 2
		x = self.bnorm2(x) # batchnorm
		x = F.relu( self.hidden2(x) )      # linear function and activation function
		
		# hidden layer 3
		x = self.bnorm3(x)
		x = F.relu( self.hidden3(x) )      # linear function and activation function
  
		# hidden layer 4
		x = self.bnorm4(x)
		x = F.relu( self.hidden4(x) )

		# output layer
		return self.output(x)

In [9]:
def trainthemodel(learning):
	
	# Loss function and optimizer, I chose cross entropy loss as it is best for classification problems. 
	lossfun = nn.CrossEntropyLoss()
	optimizer = torch.optim.SGD(model.parameters(),lr=learning)
	
	#initialize losses
	losses = torch.zeros(numofepochs)
	trainAcc = []
	testAcc = []

	model.to(device)
	
	#now lets actually loop over the training epochs to train the model
	for epoch in range(numofepochs):
		
		# switch on training mode
		model.train()

		# loop over training data batches
		batchAcc  = []
		batchLoss = []
		for X,y in train_loader:

			X = X.to(device)
			y = y.to(device)
			
			# forward pass and loss
			yHat = model(X)
			loss = lossfun(yHat,y)

			# backprop
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()

			# loss from this batch
			batchLoss.append(loss.item())

			yHat = yHat.cpu()
			y = y.cpu()

			# compute training accuracy for this batch
			batchAcc.append( 100*torch.mean((torch.argmax(yHat,axis=1) == y).float()).item() )
			
		# now that we've trained through the batches, get their average training accuracy
		trainAcc.append( np.mean(batchAcc)) 

		# and get average losses across the batches
		losses[epoch] = np.mean(batchLoss)
		
		### test accuracy

		# Lets turn eval back on so we dont overfit with the test data 
		model.eval()
		X,y = next(iter(test_loader)) # extract X,y from test dataloader

		X = X.to(device)
		y = y.to(device)  

		with torch.no_grad(): # deactivates autograd
			yHat = model(X)
   
		yHat = yHat.cpu()
		y = y.cpu()   

		testAcc.append( 100*torch.mean((torch.argmax(yHat,axis=1) == y).float()) )

		print('epoch {} done at time {} '.format(epoch,time.perf_counter()))


	# function output
	return trainAcc,testAcc,losses,model

In [10]:
# Time to run the model, first we need to input parameters, you might want to change the number of epochs if it isn't reaching the level of accuracy desired.

input_dim = df.shape[1]-3
output_dim = 4
numofepochs = 1000
learningrate = 0.01


model = ANN(Input_dim = input_dim,Output_dim = output_dim)
trainAcc,testAcc,losses,model = trainthemodel(learningrate)

epoch 0 done at time 8.0775743 
epoch 1 done at time 9.7449032 
epoch 2 done at time 11.449819 
epoch 3 done at time 13.1056497 
epoch 4 done at time 14.7984394 
epoch 5 done at time 16.4784807 
epoch 6 done at time 18.2133128 
epoch 7 done at time 19.9077097 
epoch 8 done at time 21.5874912 
epoch 9 done at time 23.3958784 
epoch 10 done at time 25.0984109 
epoch 11 done at time 26.7645336 
epoch 12 done at time 28.4416072 
epoch 13 done at time 30.0969608 
epoch 14 done at time 31.8324483 
epoch 15 done at time 33.5245616 
epoch 16 done at time 35.2881419 
epoch 17 done at time 37.0970845 
epoch 18 done at time 38.7994355 
epoch 19 done at time 40.5970835 
epoch 20 done at time 42.2700982 
epoch 21 done at time 43.9920321 
epoch 22 done at time 45.7432904 
epoch 23 done at time 47.3916017 
epoch 24 done at time 49.1352093 
epoch 25 done at time 50.8665165 
epoch 26 done at time 52.5830344 
epoch 27 done at time 54.3320535 
epoch 28 done at time 55.9964854 
epoch 29 done at time 57.73

In [None]:
fig,ax = plt.subplots(1,2,figsize=(15,5))

ax[0].plot(losses,'k^-')
ax[0].set_ylabel('Loss')
ax[0].set_xlabel('Epochs')
ax[0].set_title('Losses over epoch')

ax[1].plot(trainAcc,)
ax[1].plot(testAcc,)
ax[1].set_title('Accuracy epochs')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy (%)')
ax[1].legend(['Train','Test'])
ax[1].set_ylim([0,103])

plt.show()



In [None]:
# Computing the final accuracy, if using sample data, this should be around 25% as there is no structure to the data. 

# because its a small batch we can just run this on the CPU

model.to('cpu')

100*torch.mean((torch.argmax(model(eval_data),axis=1) == eval_labels).float()).item()

In [None]:
torch.save(model.state_dict(),'../Deeplearningmodels/deeplearningapproachwgv.pt')