## Inference Test on Test Data Set: LSTM

### Imports and Data Loading

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from collections import Counter

In [13]:
import pickle

In [14]:
#nltk.download('punkt')

In [15]:
batch_size = 50
batch_size_train = 50
max_len = 28
vocab_size = 5001

**Selected hyperparameters**  
Learning rate: 0.001  
Hidden dim: 32  
Number of LSTM layers: 2  
Epoch number: 12

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

### Import the preprocessed data

In [17]:
df_X_test = pd.read_csv('tweets_5000/test/X_test.csv')
df_y_test = pd.read_csv('tweets_5000/test/y_test.csv')

In [18]:
# convert dataframe  to numpy array
X_test = df_X_test.to_numpy().squeeze()
y_test = df_y_test.to_numpy().squeeze()

In [19]:
df_word_list = pd.read_csv('tweets_5000/vocab/word_list.csv')

In [20]:
word_list = df_word_list.iloc[:,0].to_list()

In [21]:
vocab_count = Counter(word_list)

In [49]:
vocab = sorted(vocab_count,key=vocab_count.get,reverse=True)[:5000]

In [50]:
count_words = sorted(vocab_count.values(), reverse=True)
len(count_words)

12495

In [51]:
# words with more than one occurrence
words_more_than_1 = list(filter(lambda x: x > 1, count_words))
len(words_more_than_1)

5469

In [52]:
words_less_than_50 = list(filter(lambda x: x < 50, count_words))

In [53]:
vocab_all = sorted(vocab_count,key=vocab_count.get,reverse=True)

In [54]:
vocab_dict = {w:i+1 for i,w in enumerate(vocab)}

In [55]:
df_x_train_padded = pd.read_csv('tweets_5000/train/x_train_padded.csv')
df_x_valid_padded = pd.read_csv('tweets_5000/valid/x_valid_padded.csv')
df_X_test = pd.read_csv('tweets_5000/test/X_test.csv')
df_y_test = pd.read_csv('tweets_5000/test/y_test.csv')
df_y_train = pd.read_csv('tweets_5000/train/y_train.csv')
df_y_valid = pd.read_csv('tweets_5000/valid/y_valid.csv')

In [56]:
# convert dataframe  to numpy array
x_train_padded = df_x_train_padded.to_numpy()
x_valid_padded = df_x_valid_padded.to_numpy()
X_test = df_X_test.to_numpy().squeeze()
y_test = df_y_test.to_numpy().squeeze()
y_train = df_y_train.to_numpy().squeeze()
y_valid = df_y_valid.to_numpy().squeeze()

### Create final dataset for training

In [57]:
x_train_padded = np.array(x_train_padded)
x_valid_padded = np.array(x_valid_padded)

In [58]:
# Create tensor datasets
train_data = TensorDataset(torch.from_numpy(x_train_padded), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_valid_padded), torch.from_numpy(y_valid))

In [59]:
# Set up dataloader with shuffle on
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=True, drop_last=True)

In [60]:
# Display tweet and label.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([50, 28])
Labels batch shape: torch.Size([50])


### Define LSTM model

In [61]:
class TweetLSTM(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout, num_layers) :
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        # The embedding layer takes the vocab size and the embeddings size as input
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM layer takes in the the embedding size and the hidden vector size.
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                    num_layers=num_layers, batch_first=True)

        # Use dropout before the final layer to improve with regularization
        self.dropout = nn.Dropout(dropout)

        # The fully-connected layer takes in the hidden dim of the LSTM and
        #  outputs a 2x1 vector of the class scores.
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, x, hidden):
        """
        The forward method takes in the input and the previous hidden state 
        """

        # The input is transformed to embeddings by passing it to the embedding layer
        embs = self.embedding(x)

        # The embedded inputs are fed to the LSTM alongside the previous hidden state
        out, hidden = self.lstm(embs, hidden)

        # Dropout is applied to the output and fed to the FC layer
        out = self.dropout(out)
        out = self.fc(out)

        # We extract the scores for the final hidden state since it is the one that matters.
        out = out[:, -1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim), 
                torch.zeros(self.num_layers, batch_size, self.hidden_dim))

### Initialize TweetLSTM class 

In [62]:
embedding_dim = 64
hidden_dim = 32
dropout = 0.2
num_layers = 2

#model = SentimentRNN(no_layers,vocab_size,hidden_dim,embedding_dim,drop_prob=0.5)
model = TweetLSTM(vocab_size, embedding_dim, hidden_dim, dropout, num_layers)

#moving to gpu
model.to(device)
print(model)

TweetLSTM(
  (embedding): Embedding(5001, 64, padding_idx=0)
  (lstm): LSTM(64, 32, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)


In [63]:
# loss and optimization functions
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Model training

In [64]:
# function to predict accuracy
def acc(out,label):
    _, pred = torch.max(out, 1)
    return torch.sum(pred == label)

In [65]:
model = TweetLSTM(vocab_size, embedding_dim, hidden_dim, dropout, num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
clip = 5
epochs = 12

for epoch in range(epochs):
    epoch_preds = []
    epoch_labels = []
    train_losses = []
    train_acc = 0.0
    model.train()
    # initialize hidden state
    h = model.init_hidden(batch_size)
    for inputs, labels in train_dataloader:
       
        inputs, labels = inputs.to(device), labels.to(device)  
        # Get a new copy of initialized h, leaving the original h unchanged.
        h = tuple([x.data for x in h])
       
        model.zero_grad()
        output, h = model(inputs, h)
        # calculate the loss and perform backpropagation
        loss = criterion(output, labels)
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy
        accuracy = acc(output,labels)
        train_acc += accuracy
        # clip helps prevent the exploding gradient problem in LSTM.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()               
           
    epoch_train_loss = np.mean(train_losses)
    epoch_train_acc = train_acc/len(train_dataloader.dataset)

    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss}')
    print(f'train_accuracy : {epoch_train_acc*100}')
    print(30 * '==')


Epoch 1
train_loss : 0.6899784299043509
train_accuracy : 52.41843032836914
Epoch 2
train_loss : 0.6305122570349619
train_accuracy : 65.64299774169922
Epoch 3
train_loss : 0.5491094804153993
train_accuracy : 72.4568099975586
Epoch 4
train_loss : 0.4506462557384601
train_accuracy : 79.9040298461914
Epoch 5
train_loss : 0.38860702901505506
train_accuracy : 83.01343536376953
Epoch 6
train_loss : 0.3380555599354781
train_accuracy : 85.8541259765625
Epoch 7
train_loss : 0.2925582710080422
train_accuracy : 89.0403060913086
Epoch 8
train_loss : 0.24769506904368216
train_accuracy : 91.0556640625
Epoch 9
train_loss : 0.22181315863361725
train_accuracy : 92.51439666748047
Epoch 10
train_loss : 0.19558816087933686
train_accuracy : 93.7427978515625
Epoch 11
train_loss : 0.18479679739819124
train_accuracy : 94.22264862060547
Epoch 12
train_loss : 0.16016311422348595
train_accuracy : 95.16314697265625


### Save and reload the trained params

In [66]:
torch.save(model.state_dict(), 'state_dict.pt')

In [67]:
state_dict = torch.load('state_dict.pt') 

  state_dict = torch.load('state_dict.pt')


In [68]:
model = TweetLSTM(vocab_size, embedding_dim, hidden_dim, dropout, num_layers)

In [69]:
# Load the saved parameters into the model
model.load_state_dict(state_dict) 

<All keys matched successfully>

In [70]:
model.eval()

TweetLSTM(
  (embedding): Embedding(5001, 64, padding_idx=0)
  (lstm): LSTM(64, 32, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=32, out_features=2, bias=True)
)

### Inference / Prediction

In [71]:
# Here the argument "tokenized_tweet" is a tokenized tweet without stop words.
def padded_vector(tokenized_tweet, vocab_dict, max_len):
    int_tweet = []
    for word in tokenized_tweet:
        if word in vocab_dict:
            int_tweet.append(vocab_dict[word])  

    padded_vector = int_tweet + [0] * max(0, max_len - len(int_tweet))
    return padded_vector

In [72]:
# Here the argument tweet is the raw text data.
def create_padded_tokens(tweet):
    tweet = re.sub(r'[,!?;-]', '.', tweet) #  Punctuations are replaced by "."
    #tweet_lowered = tweet.lower()
    tokenized_tweet = nltk.word_tokenize(tweet) 
    #  Lower case and drop non-alphabetical tokens
    tokenized_tweet = [ch.lower() for ch in tokenized_tweet if ch.isalpha() or ch == '.']  
    english_stopwords = stopwords.words('english')
    tokens_wo_stopwords = [t for t in tokenized_tweet if t not in english_stopwords]
    tokens_padded = np.array(padded_vector(tokens_wo_stopwords, vocab_dict, max_len))
    tokens_padded = np.expand_dims(tokens_padded, axis=0) # To change 2-d array to 3-d to use in model(inputs, h).

    return tokens_padded

In [73]:
# predict one tweet at a time. (batch_size is set at 1.)
def predict_tweet(tokens_padded):
    pad =  torch.from_numpy(tokens_padded) 
    inputs = pad.to(device)
    batch_size = 1
    h = model.init_hidden(batch_size)
    h = tuple([each.data for each in h])
    output, h = model(inputs, h)
    #print(output[0])
    out = nn.Sigmoid()(output[0][1])
    return(out.item())   

In [74]:
index = 525
print(X_test[index])
print('='*70)
print(f'Actual target is  : {y_test[index]}')
print('='*70)
tokens_padded = create_padded_tokens(X_test[index])
prob = predict_tweet(tokens_padded)
status = "Disaster" if prob > 0.5 else "Non-disaster"
prob = (1 - prob) if status == "Non-disaster" else prob
print(f'Predicted sentiment is {status} with a probability of {prob}')

seismic NA #Sismo DETECTADO #JapÌ_n [Report 3] 01:02:17 Okinawa Island region M3.8 Depth 10km Maximum seismic intensity 3 JST #??
Actual target is  : 1
Predicted sentiment is Disaster with a probability of 0.9275115132331848


### Test accuracy 

In [75]:
len(X_test)

1737

In [76]:
#batch_size_test = 1
all_batches_acc = []
for idx in range(len(X_test)):
    batch_acc = []

    tweet = X_test[idx]
    tokens_padded = create_padded_tokens(tweet)
    prob = torch.tensor(predict_tweet(tokens_padded))
    pred = torch.round(prob)
    target = y_test[idx]
    batch_acc.append(pred == target)
        
    all_batches_acc.extend(batch_acc) 

accuracy = sum(all_batches_acc)/len(X_test)
print(accuracy)   

tensor(0.7697)


### Precision, Recall, and F1 Score

In [77]:
def calc_metrics(y_pred, y_true):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()    
    precision = round(tp / (tp + fp), 3)
    recall = round(tp / (tp + fn), 3)
    f1_score = round(2 * precision * recall / (precision + recall), 3)

    return precision, recall, f1_score

In [81]:
preds = []
targets = []
for idx in range(len(X_test)):
    batch_acc = []

    tweet = X_test[idx]
    tokens_padded = create_padded_tokens(tweet)
    prob = torch.tensor(predict_tweet(tokens_padded))
    pred = torch.round(prob)
    target = y_test[idx]

    preds.append(int(pred.item()))
    targets.append(target.item())

precision, recall, f1_score = calc_metrics(preds, targets)

In [82]:
print(f"Accuracy: {accuracy*100: .1f}%\nPrecision: {precision*100: .1f}%\nRecall: {recall*100: .1f}%\nF1 score: {f1_score*100: .1f}%")

Accuracy:  77.0%
Precision:  75.6%
Recall:  79.9%
F1 score:  77.7%
