## Deep learning approach

In [114]:
#Deep learning library of choice PyTorch

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset
from sklearn.model_selection import train_test_split

# for number-crunching

import numpy as np
import scipy.stats as stats
import pandas as pd

In [115]:
df = pd.read_csv('../datelets/datalet9.csv',index_col=False)


In [116]:
display(df)

Unnamed: 0,period0,period1,period2,period3,period4,period5,period6,period7,period8
0,3.0,2.0,0.0,2.0,1.0,1.0,0.0,1.0,3.0
1,2.0,0.0,2.0,1.0,1.0,0.0,1.0,3.0,2.0
2,0.0,2.0,1.0,1.0,0.0,1.0,3.0,2.0,3.0
3,2.0,1.0,1.0,0.0,1.0,3.0,2.0,3.0,1.0
4,1.0,1.0,0.0,1.0,3.0,2.0,3.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
9999995,2.0,0.0,3.0,2.0,2.0,0.0,1.0,2.0,3.0
9999996,0.0,3.0,2.0,2.0,0.0,1.0,2.0,3.0,4.0
9999997,3.0,2.0,2.0,0.0,1.0,2.0,3.0,4.0,5.0
9999998,2.0,2.0,0.0,1.0,2.0,3.0,4.0,5.0,6.0


In [117]:
dataheaders = []

for i in range(df.shape[1]):
	dataheaders.append('period{}'.format(i))
	

data = torch.Tensor(df[dataheaders].values)
labels = torch.Tensor(df['period{}'.format(df.shape[1]-1)].values)

In [118]:
print(data)
print(labels)

tensor([[3., 2., 0.,  ..., 0., 1., 3.],
        [2., 0., 2.,  ..., 1., 3., 2.],
        [0., 2., 1.,  ..., 3., 2., 3.],
        ...,
        [3., 2., 2.,  ..., 3., 4., 5.],
        [2., 2., 0.,  ..., 4., 5., 6.],
        [2., 0., 1.,  ..., 5., 6., 7.]])
tensor([3., 2., 3.,  ..., 5., 6., 7.])


In [119]:
# First we are going to split the data into three parts, evaluation, test and training data. I do this with two instances of train_test_split, just for the sake of convinience.

traintemp_data,eval_data, traintemp_labels,eval_labels = train_test_split(data, labels, test_size=.01)

train_data,test_data, train_labels,test_labels = train_test_split(traintemp_data, traintemp_labels, test_size=.01)

#then we are going to pass the data to the Pytorch data loader, this is going to allow us to split it into mini batches that will be run through the model.
#given that we are working with 10mil data points this is essential or we would simply run out of memory on the devices. Im using 2048 in the hope there are some gains to be made with that matrix size.

train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

batchsize    = 2048
train_loader = DataLoader(train_data,batch_size=batchsize,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0]) 

In [120]:
# create a class for the model WITH BATCH NORM

class ANN(nn.Module):
	def __init__(self, Input_dim, Output_dim):
		super().__init__()

		### input layer
		self.input = nn.Linear(Input_dim,16)
		
		### hidden layers
		self.fc1    = nn.Linear(16,32)
#    self.bnorm1 = nn.BatchNorm1d(16) 
		self.fc2    = nn.Linear(32,20)
#    self.bnorm2 = nn.BatchNorm1d(32) 
		self.fc3    = nn.Linear(32,20)
		
		### output layer
		self.output = nn.Linear(20,Output_dim)
	
	# forward pass
	def forward(self,x):

		# input (x starts off normalized)
		x = F.relu( self.input(x) )


		# hidden layer 1
#    x = self.bnorm1(x) # batchnorm
		x = F.relu( self.fc1(x) )      # linear function and activation function

		# hidden layer 2
#    x = self.bnorm2(x) # batchnorm
		x = F.relu( self.fc2(x) )      # linear function and activation function
		
		# hidden layer 3
		
		x = F.relu( self.fc3(x) )      # linear function and activation function

		# output layer
		return self.output(x)

In [121]:
input_dim = df.shape[1]
output_dim = 4
numofepochs = 1000


model = ANN(Input_dim = input_dim,Output_dim = output_dim)

In [122]:
def trainthemodel():
	
	# Loss function and optimizer, I chose the BCE loss function as it is best for classification problems. 
	lossfun = nn.BCEWithLogitsLoss()
	optimizer = torch.optim.SGD(model.parameters(),lr=0.01)
	
	#initialize losses
	losses = torch.zeros(numofepochs)
	trainAcc = []
	testAcc = []
	
	#now lets actually loop over the training epochs to train the model
	for epoch in range(numofepochs):
		
		# switch on training mode
		model.train()

		# loop over training data batches
		batchAcc  = []
		batchLoss = []
		for X,y in train_loader:
			
			# forward pass and loss
			yHat = model(X)
			loss = lossfun(yHat,y)

			# backprop
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()

			# loss from this batch
			batchLoss.append(loss.item())

			# compute training accuracy for this batch
			batchAcc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
			
		# now that we've trained through the batches, get their average training accuracy
		trainAcc.append( np.mean(batchAcc) )

		# and get average losses across the batches
		losses[epoch] = np.mean(batchLoss)
		
		### test accuracy

		# NOTE: batch normalization should be turned off during testing, so we definitely need to switch modes here
		model.eval()
		X,y = next(iter(test_loader)) # extract X,y from test dataloader
		with torch.no_grad(): # deactivates autograd
		yHat = model(X)
		testAcc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
		
	# function output
	return trainAcc,testAcc,losses,model

			

IndentationError: expected an indented block (Temp/ipykernel_49068/2467401633.py, line 50)