In [127]:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!unzip ./glove.6B.zip

In [128]:
from typing import Callable, Iterable, List, Dict, Tuple
from numbers import Number
import re
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from torchtext.legacy.data import Field, TabularDataset, BucketIterator

from sklearn.metrics import f1_score, recall_score, precision_score, \
                            roc_curve, auc, ConfusionMatrixDisplay

In [129]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
_ = torch.manual_seed(SEED)

In [130]:
TRAIN_SET_PATH = "./vaccine_train_set.csv"
VALIDATION_SET_PATH = "./vaccine_validation_set.csv"
EMBEDDINGS_PATH = './glove.6B.50d.txt'

In [131]:
NUM_CLASSES = 3
CLASS_NAMES = ['Neutral', 'Anti-Vaccine', 'Pro-Vaccine']
EPOCHS = 20
LEARNING_RATE = 0.002
BATCH_SIZE = 128

In [132]:
trainDF = pd.read_csv(TRAIN_SET_PATH)
trainDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [133]:
validDF = pd.read_csv(VALIDATION_SET_PATH)
validDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [134]:
validLabels = validDF['label'].values

In [135]:
def createWordVectors(embeddingsPath: str):
    wordVectors: Dict[str, np.ndarray] = {}
    lineElements: List[str] = []
    with open(embeddingsPath) as file:
        # Read file line by line
        for line in file:
            # Remove new line and split
            lineElements = line.replace('\n', '').split()
            # The first element in the line is the target word
            word = lineElements.pop(0)
            # The other elements represent the vector of the word
            wordVector = np.array([float(w) for w in lineElements])
            # Store the vector for this word
            wordVectors[word] = wordVector
    dimensions = len(lineElements)

    return wordVectors, dimensions

wordVectors, dimensions = createWordVectors(EMBEDDINGS_PATH)

In [136]:
def customPreprocessor(text: str):    
    # remove url's
    trimmedText = re.sub(r'https?://\S+|www\.\S+|#', '', text).lower()

    # remove @ mentions and numbers
    res = list()
    wait_whitespace = False
    for c in trimmedText:
        if wait_whitespace:
            if c == " ":
                wait_whitespace = False
            continue
        elif re.match("[0-9]", c) or c == "@":
            wait_whitespace = True
            continue            
        res.append(c)
    
    return ''.join(res)

tokenizer = TweetTokenizer()
def customTokenizer(text: str):
    return tokenizer.tokenize(customPreprocessor(text))


In [137]:
# nltk.TweetTokenizer is used for tweet tokenization

def vectorizeTweet(tweet: str, preprocessor: Callable[[str], str], wordVectors: Dict[str, np.ndarray], dimensions: int) -> np.ndarray:
    # Split the tweet into words/tokens
    words = tokenizer.tokenize(preprocessor(tweet))
    # words = tokenizer.tokenize(tweet)

    # The sum of the vectors of the tweet words is stored here
    vector: np.ndarray = np.zeros(dimensions)
    for word in words:
        # Get the word/token pre-trained vector
        wordVector = wordVectors.get(word)
        if wordVector is not None:
            # If found, add to the tweet vector
            vector += wordVector
    
    # return the mean vector
    return vector / len(words)

def vectorizeDataSet(dataSet: Iterable[str], dimensions: int, wordVectors: Dict[str, np.ndarray]):
    matrix: np.ndarray = np.zeros(shape=(len(dataSet), dimensions))
    for i, sample in enumerate(dataSet):
        matrix[i] = vectorizeTweet(sample, customPreprocessor, wordVectors, dimensions)
    
    return matrix

In [138]:
labelField = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
tweetField = Field(tokenize=customTokenizer, include_lengths=True, batch_first=True)
fields = [('', None), ('tweet', tweetField), ('label', labelField)]

trainDataset = TabularDataset(path=TRAIN_SET_PATH, format='CSV', fields=fields, skip_header=True,)
validDataset = TabularDataset(path=VALIDATION_SET_PATH, format='CSV', fields=fields, skip_header=True)

trainIterator = BucketIterator(trainDataset, batch_size=BATCH_SIZE)
validIterator = BucketIterator(validDataset, batch_size=len(validLabels))


tweetField.build_vocab(trainDataset)
paddingIndex = tweetField.vocab.stoi['<pad>']

In [139]:
matrixNumWords = len(tweetField.vocab)
embeddingsMatrix = torch.zeros(matrixNumWords, dimensions, dtype=torch.float)

for i, word in enumerate(tweetField.vocab.itos):
    wordVector = wordVectors.get(word)

    if wordVector is not None:
        embeddingsMatrix[i] = torch.from_numpy(wordVector)

In [140]:
def calculateAccuracy(predictedLabels: Iterable[Number], trueLabels: Iterable[Number]) -> float:    
    correct = 0
    for pred, true in zip(predictedLabels, trueLabels):
        correct += int(pred == true)
    
    return correct/len(trueLabels)*100

In [141]:
def getPredictedLabels(predictions: torch.Tensor) -> np.ndarray:
    softmaxLayerOut = torch.log_softmax(predictions, dim = 1)
    _, labels = torch.max(softmaxLayerOut, dim = 1)
    return labels.detach().numpy()

In [142]:
class SkipRNN(nn.Module):
    def __init__(self, embeddingsMatrix, vectorDimension: int, numLayers: int, hiddenSize: int, skipConnections: bool = True) -> None:
        super().__init__()

        self.USE_SKIP = skipConnections
        self.vectorDimension = vectorDimension
        self.embedding = nn.Embedding.from_pretrained(embeddingsMatrix)
        
        self.startingLayer = nn.LSTM(   input_size=vectorDimension,
                                        hidden_size=hiddenSize,
                                        num_layers=1,
                                        batch_first=True,
                                        bidirectional=False )
        self.lstmLayers = nn.ModuleList()
        for _ in range(numLayers - 1):
            self.lstmLayers.append(nn.LSTM( input_size=hiddenSize,
                                            hidden_size=hiddenSize,
                                            num_layers=1,
                                            batch_first=True,
                                            bidirectional=False))

        self.linear = nn.Linear(hiddenSize, NUM_CLASSES)
        self.activation = nn.Tanh()

        if (skipConnections):
            self.skip = nn.Identity()

    def forward(self, input, inputLengths):
        embout = self.embedding(input)
        #packedInput = pack_padded_sequence(embout, inputLengths, batch_first=True, enforce_sorted=False)

        #nextInput, _ = pad_packed_sequence(self.startingLayer(packedInput))
        nextInput, _ = self.startingLayer(embout)
        nextSkipInput = nextInput
        for layer in self.lstmLayers:
            output, _ = layer(nextInput)
            if self.USE_SKIP and (nextSkipInput is not None):
                nextInput = torch.add(output, self.skip(nextSkipInput))
                nextInput = output
                nextSkipInput = None
            else:
                nextSkipInput = output
                nextInput = output
        
        #lstmOutput, _ = pad_packed_sequence(nextInput)
        #lstmOutput = output
        #return self.linear(lstmOutput)
        lstmOutput = output[:, -1, :]
        classOutput = self.linear(lstmOutput)
        #return self.activation(classOutput)
        return classOutput


In [143]:
model = SkipRNN(embeddingsMatrix, dimensions, numLayers=4, hiddenSize=16, skipConnections=True)

In [144]:
lossFunction = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [145]:
# Loss, F1 Score & Total Predictions after every epoch are stored here
validPredictions: np.ndarray
validF1: np.ndarray = np.empty(EPOCHS, dtype=float)
validErrors: np.ndarray = np.empty(EPOCHS, dtype=float)
validOutput: torch.Tensor

trainF1: np.ndarray = np.empty(EPOCHS, dtype=float)
trainErrors: np.ndarray = np.empty(EPOCHS, dtype=float)
epochTrainPredictions: List[int]

# Epochs loop
for epoch in range(EPOCHS):
    # Set model to train mode
    epochLabels = []
    epochTrainPredictions = []
    batchLosses = []
    batchAccs = []
    model.train()

    # Batch loop
    for ((tweets, tweetsLen), labels), _ in trainIterator:
        
        # Make predictions for batch samples
        predictions = model(tweets, tweetsLen)

        # Extract & store predicted labels and calculate accuracy
        predictedLabels = getPredictedLabels(predictions)
        epochTrainPredictions.extend(predictedLabels)
        batchAccs.append(calculateAccuracy(predictedLabels, labels))

        # Run loss function, store loss & backpropagate
        batchLoss = lossFunction(predictions, labels.long())
        batchLosses.append(batchLoss.item())

        optimizer.zero_grad()

        batchLoss.backward()

        optimizer.step()
        epochLabels.extend(labels)
        # Set model to evaluation mode

    model.eval()
    with torch.no_grad():
        
        for ((tweets, tweetsLen), labels), _ in validIterator:
          # Make predictions on the Validation set
          validOutput = model(tweets, tweetsLen)

          # Run loss function & store loss
          validLoss = lossFunction(validOutput, labels.long())
          validErrors[epoch] = validLoss.item()
          
          # Extract & store predicted labels, calculate accuracy and F1 Score
          validPredictions = getPredictedLabels(validOutput)
          acc = calculateAccuracy(validPredictions, labels)
          validF1[epoch] = f1_score(labels, validPredictions, average="micro")

    # Find the total epoch loss & F1 Score for the Train set
    trainErrors[epoch] = sum(batchLosses)/len(batchLosses)
    trainF1[epoch] = f1_score(epochLabels, epochTrainPredictions, average="micro")
    
    print(f"Epoch {epoch:3} Train Acc = {sum(batchAccs)/len(batchAccs):.5f} Valid Acc = {acc:.5f}\r")

Epoch   0 Train Acc = 46.76731 Valid Acc = 47.72130
Epoch   1 Train Acc = 51.40962 Valid Acc = 59.37774
Epoch   2 Train Acc = 60.53702 Valid Acc = 63.84750
Epoch   3 Train Acc = 63.67212 Valid Acc = 55.69676
Epoch   4 Train Acc = 63.97115 Valid Acc = 65.16214
Epoch   5 Train Acc = 65.24183 Valid Acc = 64.59246
Epoch   6 Train Acc = 66.16971 Valid Acc = 65.38124
Epoch   7 Train Acc = 66.75769 Valid Acc = 65.99474
Epoch   8 Train Acc = 66.96442 Valid Acc = 66.65206
Epoch   9 Train Acc = 67.77356 Valid Acc = 66.91499
Epoch  10 Train Acc = 67.88942 Valid Acc = 66.21385
Epoch  11 Train Acc = 67.00433 Valid Acc = 66.17003
Epoch  12 Train Acc = 67.40288 Valid Acc = 64.06661
Epoch  13 Train Acc = 67.53365 Valid Acc = 67.70377
Epoch  14 Train Acc = 68.05913 Valid Acc = 67.04645
Epoch  15 Train Acc = 67.83990 Valid Acc = 67.61613
Epoch  16 Train Acc = 68.39471 Valid Acc = 68.22962
Epoch  17 Train Acc = 68.38990 Valid Acc = 67.87905
Epoch  18 Train Acc = 67.47404 Valid Acc = 66.65206
Epoch  19 Tr