In [4]:
import pandas as pd
from sklearn.metrics import mean_squared_log_error
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch
import pandas as pd
from numpy import array
from tqdm import tqdm
import numpy as np

In [13]:
files=pd.read_json('test_files_801010.json')
# change path to reflect where the data is 
#and which data you are testing

In [14]:
len(files)

19967

In [5]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def modelload(param,path_to_model):
    """[loads model for testing]

    Args:
        param ([list]): [list of parameters to fit into model for the models dimensions]
        path_to_model ([string]): [path to model that is being tested]

    Returns:
        [nn.Module]: [loaded model with pretrained weights]
    """

    class GenModel(nn.Module):
        """[LSTM Model Generator]

        """
        def __init__(self, hidden_dim,seq_length, n_layers,hidden_layers,
                    bidirectional, dropout=0.5):
            """[summary]

            Args:
                hidden_dim ([List]): [list of integers for dimensions of hidden layers]
                seq_length ([int]): [window size of 1 reading]
                n_layers ([int]): [description]
                hidden_layers ([int]): [description]
                bidirectional ([boolean]): [boolean of whether the bidirectional ]
                dropout (float, optional): [description]. Defaults to 0.5.
            """
            super().__init__()
            self.rnn = nn.LSTM(856, 
                            hidden_dim[0], 
                            num_layers=n_layers, #set to two: makes our LSTM 'deep'
                            bidirectional=bidirectional, #bidirectional or not
                            dropout=dropout,batch_first=True) #we add dropout for regularization
            
            if bidirectional:
                self.D=2
            else:
                self.D=1
            self.n_layers=n_layers
            self.hidden_dim=hidden_dim[0]
            self.nonlinearity = nn.ReLU() 
            self.hidden_layers = nn.ModuleList([])
            self.seq_length=seq_length
            self.dropout=nn.Dropout(dropout)
            assert(len(hidden_dim)>0)
            assert(len(hidden_dim)==1+hidden_layers)

            i=0
            if hidden_layers>0:
                self.hidden_layers.append(nn.Linear(hidden_dim[i]*self.D*self.seq_length, hidden_dim[i+1]))
                for i in range(hidden_layers-1):
                    self.hidden_layers.append(nn.Linear(hidden_dim[i+1], hidden_dim[i+2]))
                self.output_projection = nn.Linear(hidden_dim[i+1], 1)
            else:
                self.output_projection = nn.Linear(hidden_dim[i]*self.D*self.seq_length, 1)
        
            
            
        def forward(self, x,hidden):
            """[Forward for Neural network]

            Args:
                x ([Tensor]): [input tensor for raw values]
                hidden ([Tensor]): [hidden state values for lstm model]

            Returns:
                [Tensor]: [output results from model]
            """
            
            batch_size= x.size(0)

            val, hidden = self.rnn(x,hidden) #feed to rnn
            
            #unpack sequence
            val = val.contiguous().view( batch_size,-1)
            for hidden_layer in self.hidden_layers:
                val = hidden_layer(val)
                val = self.dropout(val)
                val = self.nonlinearity(val) 
            out = self.output_projection(val)

            return out,hidden
        
        
        def init_hidden(self, batch_size):
            """[summary]

            Args:
                batch_size ([int]): [size of batch that you are inputting into the model]

            Returns:
                [Tensor]: [Returns a tensor with the dimensions equals to the dimensions of the model's
                hidden state with values 0]
            """
            weight = next(self.parameters()).data
            hidden = (weight.new(self.n_layers*self.D, batch_size, self.hidden_dim).zero_().to(device),
                        weight.new(self.n_layers*self.D, batch_size, self.hidden_dim).zero_().to(device))
            
            return hidden


    newmodel = GenModel(param[0],param[1],param[2],param[3],param[4]).double()
    newmodel.to(device)
    newmodel.load_state_dict(torch.load(path_to_model))
    return newmodel


 
   

In [16]:
def split_sequences(sequences, n_steps):
    """[inputs a numpy array and outputs a windowed sequence to put into lstm model]

    Args:
        sequences ([np.array]): [numpy array of data to be sequenced into windows]
        n_steps ([int]): [window size for the model]

    Returns:
        [np.array]: [returns a numpy array with data sequenced into windows of size n_steps]
    """
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

In [18]:
newmodel= modelload(([256], 30,2, 0, True,0.5),'./state_dict_11.pt')
newmodel.eval()
stepsize=40

n_timesteps=30
batch_size = 100-n_timesteps+1
epoch_val=files[0]
epoch_size=len(files[0])
listmean=[]

In [19]:

for number in tqdm(range(int(epoch_size/stepsize))):
    val_x= np.empty((0,n_timesteps,856), int)
    val_y= np.empty((0,), int)
    startno=number*stepsize
    for i in (epoch_val[startno:startno+stepsize]):
        joineddf=pd.read_feather('processed3-edited/'+i)
        joineddf=joineddf.fillna(0)
        tnp=joineddf[[c for c in joineddf if c not in ['Retweets']] 
               + ['Retweets']].to_numpy()
        valnpx,valnpy=split_sequences(tnp, n_timesteps)

        val_x = np.append(val_x, valnpx, axis=0)
        val_y = np.append(val_y, valnpy, axis=0)
    val_x=torch.Tensor(val_x).double().to(device)
    h = newmodel.init_hidden(val_x.size()[0])
    hcon = tuple([e.data for e in h])
    predictions = newmodel(val_x,hcon)
    listmean.append(mean_squared_log_error(val_y, predictions[0].cpu().detach().numpy().clip(min=0)))
    predictions=[]
    #only used to store intermediate predictions
    # pd.DataFrame(listmean).to_csv('./test/mean'+str(number)+'.csv')    
           

100%|██████████| 499/499 [1:45:02<00:00, 12.63s/it]


In [21]:
np.mean(listmean) #MSLE score

1.798414006329342