## Deep learning approach

In [1]:
#Deep learning library of choice PyTorch

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import train_test_split

# for number-crunching

import numpy as np
import scipy.stats as stats
import pandas as pd

# Time to check that the gpu optimization is actually helping

import time 

# Some graphing

import matplotlib.pyplot as plt

In [2]:
# Pick the compute device: the first CUDA GPU when one is available, the CPU otherwise.
if torch.cuda.is_available():
	device = torch.device('cuda:0')
else:
	device = torch.device('cpu')
print(device)

cuda:0


In [3]:
# Read the dataset (without using any CSV column as the index) and cast every column to int.
df = pd.read_csv('../datelets/datalet2.csv', index_col=False).astype('int')


In [4]:
# Show the loaded DataFrame as this cell's output.
display(df)

Unnamed: 0,period0,period1
0,3,2
1,2,0
2,0,2
3,2,1
4,1,1
...,...,...
9999993,3,3
9999994,3,2
9999995,2,0
9999996,0,3


In [5]:
# Every column except the last one is a feature; the last column is the class label.
# Columns are looked up by name as period0, period1, ... in order.
n_feature_cols = df.shape[1] - 1
dataheaders = ['period{}'.format(i) for i in range(n_feature_cols)]
labelheader = 'period{}'.format(n_feature_cols)

# Build each tensor directly in its final dtype. The legacy torch.Tensor(...) constructor
# always produces float32 first, which sent the integer labels through a lossy
# int -> float32 -> int64 round trip and allocated an extra full-size temporary.
data = torch.tensor(df[dataheaders].to_numpy(), dtype=torch.float)
labels = torch.tensor(df[labelheader].to_numpy(), dtype=torch.long)

In [6]:
# Quick look at the feature tensor followed by the label tensor, one per line.
print(data, labels, sep='\n')

tensor([[3.],
        [2.],
        [0.],
        ...,
        [2.],
        [0.],
        [3.]])
tensor([2, 0, 2,  ..., 0, 3, 2])


In [7]:
# Split the data into three parts: evaluation, test and training. Two chained calls to
# train_test_split are used for convenience: the first holds out 1% of all samples for
# evaluation, the second holds out 1% of what remains for testing.

traintemp_data, eval_data, traintemp_labels, eval_labels = train_test_split(data, labels, test_size=.01)

train_data, test_data, train_labels, test_labels = train_test_split(traintemp_data, traintemp_labels, test_size=.01)

# Wrap the tensors in datasets and hand them to PyTorch DataLoaders so the training set
# can be fed to the model in mini-batches. With roughly 10 million data points this is
# essential, otherwise we would simply run out of memory on the device.

train_data = TensorDataset(train_data, train_labels)
test_data = TensorDataset(test_data, test_labels)

# Best to keep batch sizes to powers of two for speed reasons; lower x as needed for your
# own memory constraints. x = 15 gives 2**15 = 32768 samples per batch.
x = 15
batchsize = 2**x
batches = batchsize  # earlier name for the same value, kept in case other cells refer to it

train_loader = DataLoader(train_data, batch_size=batchsize, shuffle=True, drop_last=True, num_workers=12)
# The test loader serves the whole test set as one single batch.
test_loader = DataLoader(test_data, batch_size=test_data.tensors[0].shape[0])

In [8]:
# Model definition: a small fully connected network

class ANN(nn.Module):
	"""Feed-forward network built from 4-unit linear layers with batch normalisation.

	Layout: input layer -> three (batchnorm -> linear) hidden stages -> a fourth
	linear hidden layer without batchnorm -> output layer. Every layer except the
	output layer is followed by a ReLU.

	Args:
		Input_dim: number of features in each input sample.
		Output_dim: number of values the output layer produces per sample.
	"""

	def __init__(self, Input_dim, Output_dim):
		super().__init__()
		width = 4  # units in every hidden layer

		# Layers are created in the same order they are used in forward().
		self.input = nn.Linear(Input_dim, width)

		self.hidden1 = nn.Linear(width, width)
		self.bnorm1 = nn.BatchNorm1d(width)
		self.hidden2 = nn.Linear(width, width)
		self.bnorm2 = nn.BatchNorm1d(width)
		self.hidden3 = nn.Linear(width, width)
		self.bnorm3 = nn.BatchNorm1d(width)
		self.hidden4 = nn.Linear(width, width)

		self.output = nn.Linear(width, Output_dim)

	def forward(self, x):
		"""Run a batch `x` through the network and return the raw output-layer values."""
		h = F.relu(self.input(x))

		# Hidden stages 1-3: batchnorm first, then the linear layer and its ReLU.
		normalised_stages = (
			(self.bnorm1, self.hidden1),
			(self.bnorm2, self.hidden2),
			(self.bnorm3, self.hidden3),
		)
		for norm, linear in normalised_stages:
			h = F.relu(linear(norm(h)))

		# Hidden stage 4 has no batchnorm in front of it.
		h = F.relu(self.hidden4(h))

		# No activation on the output layer.
		return self.output(h)

In [9]:
def trainthemodel():
	"""Train the notebook-level `model` with mini-batch SGD and record per-epoch metrics.

	Takes no arguments; it reads these notebook-level names, which must be defined
	before the call: `model`, `numofepochs`, `train_loader`, `test_loader`, `device`.

	Returns:
		trainAcc: list of floats, the mean training accuracy (%) across the batches of each epoch.
		testAcc: list of floats, the accuracy (%) on the first batch of `test_loader` after each epoch.
		losses: 1-D tensor of length `numofepochs` holding the mean training loss of each epoch.
		model: the trained model (the same object as the notebook-level `model`, moved to `device`).
	"""

	# Move the model first so the optimizer is built from the parameters it will actually update.
	model.to(device)

	# Loss function and optimizer, I chose cross entropy loss as it is best for classification problems.
	lossfun = nn.CrossEntropyLoss()
	optimizer = torch.optim.SGD(model.parameters(),lr=0.01)

	# initialize the per-epoch records
	losses = torch.zeros(numofepochs)
	trainAcc = []
	testAcc = []

	# perf_counter() has an undefined reference point, so only differences are meaningful;
	# progress is reported as seconds elapsed since training started.
	start_time = time.perf_counter()

	# now lets actually loop over the training epochs to train the model
	for epoch in range(numofepochs):

		# switch on training mode
		model.train()

		# loop over training data batches
		batchAcc  = []
		batchLoss = []
		for X,y in train_loader:

			X = X.to(device)
			y = y.to(device)

			# forward pass and loss
			yHat = model(X)
			loss = lossfun(yHat,y)

			# backprop
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()

			# loss from this batch
			batchLoss.append(loss.item())

			# training accuracy for this batch, computed where the tensors already live so
			# only a single scalar is transferred back instead of the full batch of outputs
			batchAcc.append( 100*torch.mean((torch.argmax(yHat,dim=1) == y).float()).item() )

		# now that we've trained through the batches, get their average training accuracy
		trainAcc.append( np.mean(batchAcc))

		# and get average losses across the batches
		losses[epoch] = np.mean(batchLoss)

		### test accuracy

		# evaluation mode: batchnorm layers use their running statistics instead of batch statistics
		model.eval()
		X,y = next(iter(test_loader)) # extract X,y from test dataloader

		X = X.to(device)
		y = y.to(device)

		with torch.no_grad(): # deactivates autograd
			yHat = model(X)

		# store a plain Python float, matching the entries of trainAcc
		testAcc.append( 100*torch.mean((torch.argmax(yHat,dim=1) == y).float()).item() )

		print('epoch {} done at time {} '.format(epoch,time.perf_counter() - start_time))


	# function output
	return trainAcc,testAcc,losses,model

In [10]:
# Training configuration: the network gets one input per DataFrame column except the last,
# produces four outputs, and is trained for 1000 epochs.
numofepochs = 1000
output_dim = 4
input_dim = df.shape[1] - 1

# Build the network, then train it. trainthemodel() takes no arguments and picks up
# `model` and `numofepochs` from the notebook namespace.
model = ANN(input_dim, output_dim)
trainAcc, testAcc, losses, model = trainthemodel()

epoch 0 done at time 24.5580901 
epoch 1 done at time 39.7459773 
epoch 2 done at time 54.9671812 
epoch 3 done at time 70.0421831 
epoch 4 done at time 85.189441 
epoch 5 done at time 99.7885461 
epoch 6 done at time 114.9850714 
epoch 7 done at time 129.9903293 
epoch 8 done at time 145.0399409 
epoch 9 done at time 160.2119839 
epoch 10 done at time 174.9092214 
epoch 11 done at time 190.1204395 
epoch 12 done at time 205.1167541 
epoch 13 done at time 220.2692144 
epoch 14 done at time 235.42516 
epoch 15 done at time 250.5907506 
epoch 16 done at time 265.6096892 
epoch 17 done at time 280.4527107 
epoch 18 done at time 295.5477572 
epoch 19 done at time 310.5697066 
epoch 20 done at time 325.6049222 
epoch 21 done at time 340.678276 
epoch 22 done at time 355.8538422 
epoch 23 done at time 370.5794382 
epoch 24 done at time 385.2178291 
epoch 25 done at time 400.0014445 
epoch 26 done at time 415.5617141 
epoch 27 done at time 430.8275271 
epoch 28 done at time 446.3468934 
epoch

In [None]:
# Two side-by-side panels: training loss on the left, accuracy curves on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: mean training loss per epoch.
ax[0].plot(losses, 'k^-')
ax[0].set(ylabel='Loss', xlabel='Epochs', title='Losses over epoch')

# Right panel: train and test accuracy per epoch, on a fixed 0-103 % axis.
ax[1].plot(trainAcc)
ax[1].plot(testAcc)
ax[1].set(title='Accuracy epochs', xlabel='Epochs', ylabel='Accuracy (%)')
ax[1].legend(['Train', 'Test'])
ax[1].set_ylim([0, 103])

plt.show()

