In [None]:
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# pip install pyarrow==1.0.0
# Install this if pandas cannot read the feather files.

In [None]:
import pandas as pd
import glob
import torch
from tqdm import tqdm
from numpy import array
import numpy as np
import random

In [None]:
# Detect whether a CUDA-capable GPU is visible to PyTorch.
is_cuda = torch.cuda.is_available()

# Pick the compute device once; later cells move the model and every batch onto it.
device = torch.device("cuda" if is_cuda else "cpu")

In [None]:
import torch.nn as nn

class GenModel(nn.Module):
    """LSTM encoder followed by an optional stack of dense layers and a scalar output.

    The LSTM output for every time step is flattened into a single vector per
    sample, passed through ``hidden_layers`` Linear -> Dropout -> ReLU stages and
    finally projected to one value.
    """
    def __init__(self, hidden_dim, seq_length, n_layers, hidden_layers,
                 bidirectional, dropout=0.5, input_size=856):
        """Build the network.

        Args:
            hidden_dim (list[int]): ``hidden_dim[0]`` is the LSTM hidden size;
                ``hidden_dim[1:]`` are the output widths of the dense layers.
                Must contain exactly ``1 + hidden_layers`` entries.
            seq_length (int): number of time steps in one input window.
            n_layers (int): number of stacked LSTM layers.
            hidden_layers (int): number of dense layers between the LSTM and
                the output projection.
            bidirectional (bool): whether the LSTM runs in both directions.
            dropout (float, optional): dropout probability used between LSTM
                layers and after every dense layer. Defaults to 0.5.
            input_size (int, optional): number of features per time step.
                Defaults to 856.

        Raises:
            AssertionError: if ``hidden_dim`` is empty or its length does not
                equal ``1 + hidden_layers``.
        """
        super().__init__()
        assert len(hidden_dim) > 0, "hidden_dim must contain at least the LSTM hidden size"
        assert len(hidden_dim) == 1 + hidden_layers, \
            "hidden_dim must have exactly 1 + hidden_layers entries"

        self.rnn = nn.LSTM(input_size,
                           hidden_dim[0],
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # nn.LSTM only applies dropout between stacked layers;
                           # passing 0 for a single layer avoids its warning.
                           dropout=dropout if n_layers > 1 else 0.0,
                           batch_first=True)

        # Number of directions: doubles the width of the LSTM output.
        self.D = 2 if bidirectional else 1
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim[0]
        self.nonlinearity = nn.ReLU()
        self.hidden_layers = nn.ModuleList([])
        self.seq_length = seq_length
        self.dropout = nn.Dropout(dropout)

        # The dense head consumes the LSTM outputs of all time steps at once.
        in_features = hidden_dim[0] * self.D * self.seq_length
        for out_features in hidden_dim[1:]:
            self.hidden_layers.append(nn.Linear(in_features, out_features))
            in_features = out_features
        # Always project from the width of the last layer actually built
        # (the previous code used a stale loop index for >= 2 dense layers).
        self.output_projection = nn.Linear(in_features, 1)

    def forward(self, x, hidden=None):
        """Run a batch through the LSTM and the dense head.

        Args:
            x (Tensor): input of shape ``(batch, seq_length, input_size)``
                (the LSTM is built with ``batch_first=True``).
            hidden (tuple[Tensor, Tensor], optional): initial ``(h_0, c_0)``
                for the LSTM, e.g. from ``init_hidden``. ``None`` lets the
                LSTM start from zeros.

        Returns:
            tuple: ``(out, hidden)`` where ``out`` has shape ``(batch, 1)`` and
            ``hidden`` is the LSTM's final ``(h_n, c_n)``.
        """
        batch_size = x.size(0)

        val, hidden = self.rnn(x, hidden)

        # Flatten all time steps of each sample into one feature vector.
        val = val.contiguous().view(batch_size, -1)
        for hidden_layer in self.hidden_layers:
            val = hidden_layer(val)
            val = self.dropout(val)
            val = self.nonlinearity(val)
        out = self.output_projection(val)

        return out, hidden

    def init_hidden(self, batch_size):
        """Create a zero-filled initial LSTM state.

        The tensors are allocated with the dtype and device of the model's
        own parameters, so they always match the model.

        Args:
            batch_size (int): number of samples in the batch the state is for.

        Returns:
            tuple[Tensor, Tensor]: ``(h_0, c_0)``, each of shape
            ``(n_layers * num_directions, batch_size, hidden_dim)``.
        """
        weight = next(self.parameters()).detach()
        shape = (self.n_layers * self.D, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [None]:
# Bidirectional 2-layer LSTM with 512 hidden units over 30-step windows and no
# extra dense layers; parameters are cast to float64 via .double().
newmodel = GenModel(
    hidden_dim=[512],
    seq_length=30,
    n_layers=2,
    hidden_layers=0,
    bidirectional=True,
    dropout=0.5,
).double()
newmodel.to(device)

In [None]:
def count_parameters(model):
    """Count the scalar values in a model that will receive gradients.

    Args:
        model (nn.Module): network whose parameters are inspected.

    Returns:
        int: total number of elements across all parameters that have
        ``requires_grad`` set.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue  # frozen parameters are not trained
        total += param.numel()
    return total

# Report how many trainable parameters `newmodel` has, formatted with thousands separators.
print(f'The model has {count_parameters(newmodel):,} trainable parameters')

In [None]:
# Load the file listing from JSON into a DataFrame; later cells take the
# file names from its column 0 (`files[0]`).
files=pd.read_json('train_files_801010.json')

In [None]:
def split_sequences(sequences, n_steps):
    """Cut a 2-D array into overlapping windows for a sequence model.

    Every window covers ``n_steps`` consecutive rows. The features of a window
    are all columns except the last one; its target is the last column of the
    window's final row.

    Args:
        sequences (np.ndarray): 2-D array of shape ``(rows, features + 1)``
            whose last column holds the target.
        n_steps (int): window length in rows; must be at least 1.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(X, y)`` where ``X`` has shape
        ``(windows, n_steps, features)`` and ``y`` has shape ``(windows,)``,
        with ``windows = rows - n_steps + 1``. When the input has fewer than
        ``n_steps`` rows both arrays are empty but keep those ranks, so they
        can still be concatenated with results from other inputs.

    Raises:
        ValueError: if ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be a positive integer")

    sequences = np.asarray(sequences)
    n_windows = len(sequences) - n_steps + 1

    if n_windows <= 0:
        # Too few rows for even a single window: return correctly shaped empties.
        n_features = max(sequences.shape[1] - 1, 0) if sequences.ndim == 2 else 0
        return (np.empty((0, n_steps, n_features), dtype=sequences.dtype),
                np.empty((0,), dtype=sequences.dtype))

    X = [sequences[start:start + n_steps, :-1] for start in range(n_windows)]
    y = [sequences[start + n_steps - 1, -1] for start in range(n_windows)]
    return array(X), array(y)

In [None]:
random.seed(42) # seed the process so everyone gets the same file split

# 80/20 split of the file list. random.sample draws WITHOUT replacement, so the
# training set really holds 80% distinct files (random.choices would repeat
# files and leave far more than 20% for validation).
train_set=random.sample(list(files[0]),k=int(len(files)*0.8))
# sorted() gives a deterministic order; plain set iteration order for strings
# changes between interpreter runs and would defeat the seed above.
val_set=sorted(set(files[0])-set(train_set))
numberepochs=10 # number of epochs to train for

stepsize=75 # number of files processed each step
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(newmodel.parameters(), lr=0.001)
n_timesteps=30 # window size
batch_size = 100-n_timesteps+1 # windows per batch; equals one file per batch when a file has 100 rows
counter = 0 # number of training batches seen so far; drives periodic reporting
print_every = 50 # validate and print whenever counter reaches a multiple of this
clip = 5 # maximum gradient norm, to prevent exploding gradients
valid_loss_min = np.inf # best (lowest) mean validation loss seen so far

# Histories filled in by the training loop.
trainloss=[]
valloss=[]
step=[]
num=[]
valbackup=[]

In [None]:

def _load_windows(file_names, n_steps):
    """Read feather files and stack their sliding windows into two arrays.

    Args:
        file_names (list[str]): file names relative to ``processed3-edited/``.
        n_steps (int): window length passed to ``split_sequences``.

    Returns:
        tuple[np.ndarray, np.ndarray]: float64 ``(X, y)`` with all windows of
        all files concatenated along axis 0. Files with fewer than ``n_steps``
        rows contribute nothing. Both arrays are empty if no window was found.
    """
    xs, ys = [], []
    for name in file_names:
        frame = pd.read_feather('processed3-edited/' + name).fillna(0)
        # Move the target column to the end so split_sequences uses it as y.
        ordered = [c for c in frame if c != 'Retweets'] + ['Retweets']
        rows = frame[ordered].to_numpy()
        if len(rows) < n_steps:
            continue  # not enough rows for a single window
        win_x, win_y = split_sequences(rows, n_steps)
        xs.append(win_x)
        ys.append(win_y)
    if not xs:
        return np.empty((0, n_steps, 0)), np.empty((0,))
    # Concatenate once (np.append in a loop copies everything each time) and
    # cast to float64 to match the model, which is created with .double().
    return (np.concatenate(xs, axis=0).astype(np.float64),
            np.concatenate(ys, axis=0).astype(np.float64))


files_per_epoch_train = 750*8  # 80-20 split of the files drawn per epoch
files_per_epoch_val = 750*2
steps_per_epoch = files_per_epoch_train // stepsize

for j in range(numberepochs):
    epoch_train=random.choices(train_set,k=files_per_epoch_train)
    epoch_val=random.choices(val_set,k=files_per_epoch_val)
    print(j)

    for number in tqdm(range(steps_per_epoch)):
        # Advance by the real chunk size so chunks neither overlap nor skip files.
        train_start = number*stepsize
        # The validation list is shorter than the training list: wrap around
        # instead of slicing past its end (which produced empty validation sets).
        val_start = train_start % len(epoch_val)

        train_x, train_y = _load_windows(epoch_train[train_start:train_start+stepsize], n_timesteps)
        val_x, val_y = _load_windows(epoch_val[val_start:val_start+stepsize], n_timesteps)

        train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
        val_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))

        train_loader = DataLoader(train_data, shuffle=False, batch_size=batch_size)
        val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

        newmodel.train()
        h = newmodel.init_hidden(batch_size)
        for inputs, labels in train_loader:
            counter += 1
            # A short final batch cannot reuse a hidden state of another size.
            if inputs.size(0) != h[0].size(1):
                h = newmodel.init_hidden(inputs.size(0))
            # Detach so gradients do not flow back into earlier batches.
            h = tuple(state.detach() for state in h)
            inputs, labels = inputs.to(device), labels.to(device)
            newmodel.zero_grad()
            output, h = newmodel(inputs, h)
            # squeeze(-1) keeps the batch dimension even for a batch of one.
            loss = criterion(output.squeeze(-1), labels)
            loss.backward()
            nn.utils.clip_grad_norm_(newmodel.parameters(), clip) # clips gradient if too large
            optimizer.step()

            if counter%print_every == 0:
                val_losses = []
                newmodel.eval()
                val_h = newmodel.init_hidden(batch_size)
                # No gradients are needed for evaluation; this saves memory and time.
                with torch.no_grad():
                    for inp, lab in val_loader:
                        if inp.size(0) != val_h[0].size(1):
                            val_h = newmodel.init_hidden(inp.size(0))
                        inp, lab = inp.to(device), lab.to(device)
                        out, val_h = newmodel(inp, val_h)
                        val_losses.append(criterion(out.squeeze(-1), lab).item())
                newmodel.train()

                mean_val_loss = float(np.mean(val_losses)) if val_losses else float('nan')
                print("Epoch: {}/{}...".format(j+1, numberepochs),
                        "Step: {}...".format(counter),
                        "Loss: {:.6f}...".format(loss.item()),
                        "Val Loss: {:.6f}".format(mean_val_loss))
                step.append(counter)
                num.append(j)
                valloss.append(val_losses)
                trainloss.append(loss.item())  # store the value, not the bound method
                valbackup.append(mean_val_loss)
                if mean_val_loss <= valid_loss_min:
                    torch.save(newmodel.state_dict(), './state_dict_12.pt')
                    print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,mean_val_loss))
                    valid_loss_min = mean_val_loss
                # Persist the histories after every report so a crash loses nothing.
                pd.DataFrame(valloss).to_csv('valloss.csv')
                pd.DataFrame(num).to_csv('num.csv')
                pd.DataFrame(step).to_csv('step.csv')
                # Previously this line overwrote step.csv with the validation losses.
                pd.DataFrame(trainloss).to_csv('trainloss.csv')
                pd.DataFrame(valbackup).to_csv('valbackup.csv')


In [None]:
torch.save(newmodel.state_dict(), './state_dict_13.pt')
