In [1]:
import time
import torch 
import torch.nn as nn
import numpy as np
import pandas as pd 
from torch.nn.utils.rnn import pad_sequence

from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
  

#### Read the Data from the CSV Source File

In [2]:
# Load the source CSV and replace every missing value with 0.
raw_data = pd.read_csv('data/targetfirm_prediction_dataset_small.csv').fillna(0)

# Work on a plain NumPy array from here on, without the first column of the frame.
data = np.array(raw_data.values)[:, 1:]

#### Get the indices of the rows with the target for each company

In [8]:
# Column 0 of `data` holds the GVKEY label of every row.
labels = data[:, 0]

# Per distinct GVKEY: index of its first row, the inverse mapping, and its row count.
_, ind1, inv1, cou1 = np.unique(
    labels,
    return_index=True,
    return_inverse=True,
    return_counts=True,
)

# first index + count - 1 is the last row of each GVKEY,
# provided all rows of a GVKEY are stored contiguously.
target_indices = ind1 + cou1 - 1


Since we can use at most the previous 5 years of data to predict the next year, 
we split and prepare the data accordingly. 

As per the example: For the year 2001, there is only one historical data point that can be used.

But for the year 2010, there are five data points that can be used for prediction.


We use a window size of 5 (for the maximum number of years that can be traced back). 

In [9]:
def prepare_data_and_split(source_data, window_size, target_indices, test_fraction=0.3):
    """Build (history, target) samples and split them into train and test parts.

    For every row index ``i`` in ``target_indices`` the function walks backwards
    from ``i`` and collects up to ``window_size`` preceding rows as the history.
    The walk stops as soon as one of the following holds:

    * the first row of ``source_data`` has been reached (no wrap-around to the
      end of the array through a negative index),
    * the preceding row has a different value in column 0 (the company id),
    * column 1 (the year) is no longer strictly increasing,
    * ``window_size`` rows have been collected.

    Targets without any usable history row are skipped.

    Parameters
    ----------
    source_data : 2-D array or tensor
        Column 0 is read as the company id, column 1 as the ordering key,
        column 2 as the target and columns 3..16 as the features.
    window_size : int
        Maximum number of history rows per sample.
    target_indices : iterable of int
        Row indices whose column 2 value is to be predicted.
    test_fraction : float, optional
        Fraction of the samples (taken from the end) used for testing.

    Returns
    -------
    tuple
        ``(train, test, x_train, y_train, x_test, y_test)`` where ``train`` and
        ``test`` are lists of ``(features, target)`` pairs and the remaining
        entries are the matching feature / target lists.
    """
    # Slice the columns once instead of on every loop iteration.
    company_ids = source_data[:, 0]
    years = source_data[:, 1]

    returndata = []
    x_data = []
    y_data = []

    for i in target_indices:
        start = i
        while (start > 0                                        # never index row -1
               and (i - start) < window_size                    # history length cap
               and company_ids[start - 1] == company_ids[start]  # same company only
               and years[start] > years[start - 1]):            # strictly increasing
            start -= 1

        if start == i:
            # No history row is available for this target.
            continue

        features = source_data[start:i, 3:17]
        target = source_data[i, 2]
        x_data.append(features)
        y_data.append(target)
        returndata.append((features, target))

    test_size = int(np.round(test_fraction * len(returndata)))
    # Split by a positive index: slicing with ``-test_size`` breaks when
    # test_size is 0 (``[:-0]`` is empty and ``[-0:]`` is the whole list).
    split = len(returndata) - test_size

    train, test = returndata[:split], returndata[split:]
    x_train, x_test = x_data[:split], x_data[split:]
    y_train, y_test = y_data[:split], y_data[split:]

    return train, test, x_train, y_train, x_test, y_test


In [10]:
year_window = 5  # Years
data_tensor = torch.FloatTensor(data)

# Build the per-company samples and split them into train / test parts.
(train, test,
 x_train, y_train,
 x_test, y_test) = prepare_data_and_split(
    source_data=data_tensor,
    window_size=year_window,
    target_indices=target_indices,
)


We use padding (PyTorch's `pad_sequence`) to pad the sequences for which fewer than 5 years of data are available. 

In [None]:
# Stack the variable-length histories into one (batch, time, features) tensor.
# NOTE(review): pad_sequence appends the zero padding at the END of short
# sequences, so a model that reads the last time step sees padding for them
# - confirm this is intended.
x_train = pad_sequence(sequences=x_train, batch_first=True)


In [None]:
# Network dimensions: 14 input features per time step, 100 hidden units,
# 2 stacked recurrent layers and a single regression output.
input_size, hidden_size, num_layers, output_size = 14, 100, 2, 1

# Number of passes over the training data.
num_epochs = 100

# Change here for the below models
learning_rate = 0.05

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Parameters
    ----------
    input_size : int
        Number of features per time step.
    hidden_size : int
        Number of hidden units in every LSTM layer.
    num_layers : int
        Number of stacked LSTM layers.
    output_size : int
        Number of values predicted per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = num_layers

        # batch_first keeps its default (False): input is (seq_len, batch, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Predict from the last time step of ``x``.

        ``x`` is either a single sequence of shape ``(seq_len, input_size)`` or
        a batch of shape ``(seq_len, batch, input_size)``.

        Returns a tensor of shape ``(output_size,)`` when the batch holds one
        sequence (including the unbatched case), otherwise
        ``(batch, output_size)``.
        """
        if x.dim() == 2:
            # A single sequence: add the batch axis the LSTM layout expects.
            x = x.unsqueeze(1)

        # Omitting (h_0, c_0) lets nn.LSTM start from zeros that match the
        # batch size, device and dtype of the input.
        out, _ = self.lstm(x)

        # Only the last time step feeds the prediction head.
        prediction = self.fc(out[-1])
        if prediction.size(0) == 1:
            return prediction[0]
        return prediction
       

In [None]:
# Instantiate the LSTM regressor; arguments follow the constructor order
# (input_size, hidden_size, num_layers, output_size).
lstm_model = LSTM(input_size, hidden_size, num_layers, output_size)

In [None]:

# hist[i] holds the mean training MSE of epoch i.
hist = np.zeros(num_epochs)

start_time = time.time()

criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(lstm_model.parameters(), learning_rate)
for i in range(num_epochs):
    epoch_loss = 0.0
    # One optimisation step per (history, target) sample.
    for training_data, y_lstm_targets in train:
        y_train_pred = lstm_model(training_data)
        # Give the scalar target the shape of the prediction so that MSELoss
        # compares like with like instead of relying on broadcasting.
        loss = criterion(y_train_pred, y_lstm_targets.reshape(y_train_pred.shape))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Record and report once per epoch rather than once per sample.
    hist[i] = epoch_loss / max(len(train), 1)
    print("Epoch ", i, "MSE: ", hist[i])

end_time = time.time()
training_time = end_time - start_time
print("Time taken for the training is ", training_time)


In [12]:
class GRU(nn.Module):
    """Stacked GRU followed by a linear layer applied to the last time step.

    Parameters
    ----------
    input_dim : int
        Number of features per time step.
    hidden_dim : int
        Number of hidden units in every GRU layer.
    num_layers : int
        Number of stacked GRU layers.
    output_dim : int
        Number of values predicted per sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(GRU, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # batch_first=True: input is (batch, seq_len, input_dim).
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map a batch ``(batch, seq_len, input_dim)`` to ``(batch, output_dim)``."""
        # Omitting h_0 lets nn.GRU start from zeros that match the batch size,
        # device and dtype of the input (a hand-built torch.zeros tensor would
        # always live on the CPU in the default dtype).
        out, _ = self.gru(x)
        # Only the last time step feeds the prediction head.
        return self.fc(out[:, -1, :])

In [None]:
# The hyperparameters are defined in this notebook as input_size / hidden_size /
# output_size; map them onto the GRU constructor's *_dim keyword names.
model = GRU(input_dim=input_size, hidden_dim=hidden_size, output_dim=output_size, num_layers=num_layers)
criterion = torch.nn.MSELoss(reduction='mean')
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
# hist[t] holds the full-batch training MSE of epoch t.
hist = np.zeros(num_epochs)
start_time = time.time()
gru = []  # not used in this cell; kept in case later cells rely on it

# Targets as a (num_samples, 1) tensor so they line up with the model output.
y_train_gru = torch.stack(list(y_train)).reshape(-1, 1)

for t in range(num_epochs):
    # Full-batch step on the padded training tensor.
    y_train_pred = model(x_train)

    loss = criterion(y_train_pred, y_train_gru)
    print("Epoch ", t, "MSE: ", loss.item())
    hist[t] = loss.item()

    optimiser.zero_grad()
    loss.backward()
    optimiser.step()

training_time = time.time()-start_time    
print("Training time: {}".format(training_time))