In [396]:
import math
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import f1_score

from typing import Callable, Iterable

In [397]:
# Training hyperparameters shared by the cells below.
LEARNING_RATE = 1e-3  # step size handed to the SGD optimizer
BATCH_SIZE = 32  # number of samples per mini-batch in the DataLoaders
EPOCHS = 200  # number of passes of the training loop over the training loader
NUM_CLASSES = 3  # output width of the network and number of class-weight slots

In [398]:
# Input files, given relative to the notebook's working directory.
TRAIN_SET_PATH = "./vaccine_train_set.csv"
VALIDATION_SET_PATH = "./vaccine_validation_set.csv"
# Text file of word embeddings; it is parsed as one `word v1 v2 ...` entry per line.
EMBEDDINGS_PATH = './glove.6B.50d.txt'

In [399]:
# Load the pre-trained word embeddings into a {word: vector} lookup table.
# Every non-empty line of the file is parsed as `word v1 v2 ... vN`.
wordVectors: "dict[str, np.ndarray]" = {}
lineElements: "list[str]" = []
dimensions = 0  # embedding size, taken from the first parsed vector

# Being explicit about UTF-8 avoids decode errors on platforms whose default
# text encoding is something else.
with open(EMBEDDINGS_PATH, encoding="utf-8") as file:
    for lineNumber, line in enumerate(file, start=1):
        lineElements = line.split()
        if not lineElements:
            # Blank line (e.g. a trailing newline at the end of the file).
            continue
        word = lineElements.pop(0)
        wordVector = np.array([float(w) for w in lineElements])
        if dimensions == 0:
            dimensions = len(wordVector)
        elif len(wordVector) != dimensions:
            # Fail here with a clear message instead of later, when vectors of
            # different sizes would be summed together.
            raise ValueError(
                f"{EMBEDDINGS_PATH}:{lineNumber}: expected {dimensions} values, "
                f"got {len(wordVector)}"
            )
        wordVectors[word] = wordVector

if not wordVectors:
    raise ValueError(f"No word vectors could be read from {EMBEDDINGS_PATH}")

In [400]:
# Load the training split and display the number of missing values per column.
trainDF = pd.read_csv(TRAIN_SET_PATH)
trainDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [401]:
# Load the validation split and display the number of missing values per column.
validDF = pd.read_csv(VALIDATION_SET_PATH)
validDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [402]:
# Training split: everything except the label and the saved index column is
# treated as input; the first remaining column of each row is the raw text.
features = trainDF.drop(columns=['label', 'Unnamed: 0'])
X_train = np.array([row[0] for row in features.to_numpy()])
trainLabels = trainDF['label'].to_numpy()

# Validation split, prepared the same way.
features = validDF.drop(columns=['label', 'Unnamed: 0'])
X_valid = np.array([row[0] for row in features.to_numpy()])
validLabels = validDF['label'].to_numpy()

In [403]:
def customPreprocessor(text: str) -> str:
    """Normalise a raw tweet before it is split into words.

    Steps, in order:
      1. remove URLs (``http(s)://...`` and ``www. ...``) and bare ``#`` characters;
      2. lowercase the text;
      3. remove everything from the first ASCII digit or ``@`` of a token up to
         the end of that token (this drops @mentions, standalone numbers and the
         numeric tail of tokens such as ``covid19``).

    The whitespace that terminates a removed token is kept, so neighbouring
    words are never glued together, and any whitespace character (space, tab,
    newline) ends a token.

    Args:
        text: the raw tweet.

    Returns:
        The cleaned, lowercased text. It may contain runs of whitespace where
        tokens were removed; callers are expected to ``split()`` it.
    """
    # remove url's and '#' signs (the hashtag word itself is kept)
    trimmedText = re.sub(r'https?://\S+|www\.\S+|#', '', text).lower()

    # remove @ mentions and numbers: cut from the trigger character to the end
    # of the token, leaving the separating whitespace in place
    return re.sub(r'[0-9@]\S*', '', trimmedText)

In [404]:
def vectorizeTweet(tweet: str, preprocessor: Callable[[str], str], wordVectors: "dict[str, np.ndarray]", dimensions: int) -> np.ndarray:
    """Turn a tweet into one fixed-size vector by averaging its word vectors.

    The tweet is passed through ``preprocessor`` and split on whitespace. The
    vectors of the words found in ``wordVectors`` are summed and the sum is
    divided by the total number of words (words without a vector therefore
    contribute zeros to the average).

    Args:
        tweet: the raw tweet text.
        preprocessor: function applied to the tweet before splitting.
        wordVectors: mapping from word to its embedding vector.
        dimensions: length of every embedding vector.

    Returns:
        An array of length ``dimensions``. It is all zeros when the
        preprocessed tweet contains no words at all.
    """
    words = preprocessor(tweet).split()
    vector: np.ndarray = np.zeros(dimensions)
    if not words:
        # Nothing survived preprocessing: dividing by zero here would produce
        # a NaN vector that silently corrupts training.
        return vector
    for word in words:
        wordVector = wordVectors.get(word)
        if wordVector is not None:
            vector += wordVector
    return vector / len(words)

In [405]:
# Embed every tweet of each split: row i of the matrix is the averaged word
# vector of tweet i. The reshape keeps the (n_samples, dimensions) layout even
# when a split is empty.
trainSet_X: np.ndarray = np.array(
    [vectorizeTweet(tweet, customPreprocessor, wordVectors, dimensions) for tweet in X_train],
    dtype=float,
).reshape(len(X_train), dimensions)

validSet_X: np.ndarray = np.array(
    [vectorizeTweet(tweet, customPreprocessor, wordVectors, dimensions) for tweet in X_valid],
    dtype=float,
).reshape(len(X_valid), dimensions)


In [406]:
class Network(nn.Module):
    """Fully connected feed-forward classifier that returns raw class scores.

    The model is a chain of ``nn.Linear`` layers: one per entry of
    ``hidden_sizes`` followed by an output layer with ``n_classes`` units.
    ``activation`` is applied after every hidden layer; without it the stacked
    linear layers are mathematically equivalent to a single linear layer.
    The output layer is left un-activated, so ``forward`` returns logits.
    """

    def __init__(
        self,
        n_features: int,
        hidden_sizes: Iterable[int],
        n_classes: int,
        activation: "Callable[[torch.Tensor], torch.Tensor] | None" = torch.relu,
    ) -> None:
        """Build the layer stack.

        Args:
            n_features: size of each input vector.
            hidden_sizes: width of each hidden layer, in order.
            n_classes: number of output scores per sample.
            activation: element-wise function applied after each hidden layer.
                Pass ``None`` for a purely linear stack.
        """
        super(Network, self).__init__()

        self.layers = nn.ModuleList()
        n_in = n_features
        for size in hidden_sizes:
            self.layers.append(nn.Linear(n_in, size))
            n_in = size
        self.layers.append(nn.Linear(n_in, n_classes))
        self.activation = activation

    def forward(self, input):
        """Run ``input`` through all layers and return the output layer's scores."""
        last_out = input
        last_index = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            last_out = layer(last_out)
            # The final layer produces logits and must stay linear.
            if self.activation is not None and index < last_index:
                last_out = self.activation(last_out)
        return last_out


In [407]:
# Build the classifier: `dimensions` inputs, hidden layers of 32 and 16 units, NUM_CLASSES outputs.
network = Network(dimensions, [32, 16], NUM_CLASSES)
# Show the registered layers as the cell output.
network.layers

ModuleList(
  (0): Linear(in_features=50, out_features=32, bias=True)
  (1): Linear(in_features=32, out_features=16, bias=True)
  (2): Linear(in_features=16, out_features=3, bias=True)
)

In [408]:
def createClassWeights(sampleLabels, numClasses):
    """Compute one inverse-frequency weight per class.

    The weight of class ``c`` is ``len(sampleLabels) / count(c)``, so rarer
    classes receive larger weights.

    Args:
        sampleLabels: sized iterable of class indices in ``[0, numClasses)``.
            Integer-valued floats are accepted.
        numClasses: total number of classes.

    Returns:
        A list of ``numClasses`` floats, indexed by class. A class that never
        occurs in ``sampleLabels`` gets weight ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    classCounts = [0] * numClasses
    for label in sampleLabels:
        classCounts[int(label)] += 1

    total = len(sampleLabels)
    return [total / count if count else 0.0 for count in classCounts]

In [409]:
# --- Training data ---------------------------------------------------------
# Labels are stored as float tensors; the training loop casts them with .long()
# before they reach the loss function.
trainDataset = TensorDataset(
    torch.tensor(trainSet_X, dtype=torch.float),
    torch.tensor(trainLabels, dtype=torch.float),
)

# WeightedRandomSampler needs ONE weight per sample (not per class), and class
# re-balancing belongs on the training loader. Each training sample gets the
# inverse-frequency weight of its class.
weights = createClassWeights(trainLabels, NUM_CLASSES)
sampleWeights = torch.tensor([weights[int(label)] for label in trainLabels], dtype=torch.double)
sampler = WeightedRandomSampler(sampleWeights, num_samples=len(sampleWeights), replacement=True)
# The sampler already randomises the order, so `shuffle` must not be set.
trainSetLoader = DataLoader(trainDataset, batch_size=BATCH_SIZE, sampler=sampler)

# --- Validation data -------------------------------------------------------
# `x` and `y` keep the full validation tensors; the training loop evaluates on
# them directly. Validation is read in order, without re-sampling.
y = torch.tensor(validLabels, dtype=torch.float)
x = torch.tensor(validSet_X, dtype=torch.float)
dataset = TensorDataset(x, y)
validSetLoader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- Objective and optimizer -----------------------------------------------
lossFunction = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(network.parameters(), lr=LEARNING_RATE)

In [410]:
def calculateAccuracy(y_pred, y_true):
    """Return the percentage of samples whose highest-scoring class equals the label.

    Args:
        y_pred: tensor of per-class scores with one row per sample; the
            predicted class is the arg-max along dimension 1.
        y_true: the true class index of each sample (integer or
            integer-valued float tensor).

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when there are no samples.
    """
    labels = torch.as_tensor(y_true).reshape(-1)
    if labels.numel() == 0:
        return 0.0

    # Softmax is monotonic, so the arg-max of the raw scores is already the
    # predicted class; one vectorised comparison replaces the per-sample loop.
    predictions = torch.argmax(y_pred, dim=1)
    correct = (predictions == labels.to(predictions.device)).sum().item()

    return correct / labels.numel() * 100

In [411]:
# Train for EPOCHS passes over the training loader; after each pass evaluate
# on the full validation tensors (x, y) and overwrite one status line.
for epoch in range(EPOCHS):
    network.train()
    batchLosses = []
    batchAccs = []

    for batchSamples, batchLabels in trainSetLoader:
        predictions = network(batchSamples)

        # CrossEntropyLoss expects integer class indices as targets.
        batchLoss = lossFunction(predictions, batchLabels.long())
        # Accuracy is only a metric: detach so it never touches the autograd graph.
        batchAccs.append(calculateAccuracy(predictions.detach(), batchLabels))
        batchLosses.append(batchLoss.item())

        optimizer.zero_grad()
        batchLoss.backward()
        optimizer.step()

    network.eval()
    # No gradients are needed for evaluation; without no_grad() every epoch
    # would build an autograd graph over the whole validation set.
    with torch.no_grad():
        pred = network(x)
    acc = calculateAccuracy(pred, y)
    print(f"Epoch {epoch:3} Train Acc = {sum(batchAccs)/len(batchAccs):.5f} Valid Acc = {acc:.5f}\r", end='')
print()



KeyboardInterrupt: 

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])