In [15]:
import math
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from typing import Callable, Iterable

In [16]:
LEARNING_RATE = 1e-4
BATCH_SIZE = 25
EPOCHS = 50

In [17]:
TRAIN_SET_PATH = "./vaccine_train_set.csv"
VALIDATION_SET_PATH = "./vaccine_validation_set.csv"
EMBEDDINGS_PATH = './glove.6B.50d.txt'

In [18]:
wordVectors: "dict[str, np.ndarray]" = {}
lineElements: "list[str]" = []
with open(EMBEDDINGS_PATH) as file:
    for line in file:
        lineElements = line.replace('\n', '').split()
        word = lineElements.pop(0)
        wordVector = np.array([float(w) for w in lineElements])
        wordVectors[word] = wordVector
dimensions = len(lineElements)

In [19]:
trainDF = pd.read_csv(TRAIN_SET_PATH)
trainDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [20]:
validDF = pd.read_csv(VALIDATION_SET_PATH)
validDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [21]:
features = trainDF.drop(['label', 'Unnamed: 0'], axis=1)
X_train = np.array([ar[0] for ar in features.values])
trainLabels = trainDF['label'].values

features = validDF.drop(['label', 'Unnamed: 0'], axis=1)
X_valid = np.array([ar[0] for ar in features.values])
validLabels = validDF['label'].values

In [22]:
def customPreprocessor(text: str):    
    # remove url's
    trimmedText = re.sub(r'https?://\S+|www\.\S+|#', '', text).lower()

    # remove @ mentions and numbers
    res = list()
    wait_whitespace = False
    for c in trimmedText:
        if wait_whitespace:
            if c == " ":
                wait_whitespace = False
            continue
        elif re.match("[0-9]", c) or c == "@":
            wait_whitespace = True
            continue            
        res.append(c)
    
    return ''.join(res)

In [23]:
def vectorizeTweet(tweet: str, preprocessor: Callable[[str], str], wordVectors: "dict[str, np.ndarray]", dimensions: int) -> np.ndarray:
    words = preprocessor(tweet).split()
    vector: np.ndarray = np.zeros(dimensions)
    for word in words:
        wordVector = wordVectors.get(word)
        if wordVector is not None:
            vector += wordVector        
    return vector / len(words)

In [24]:
trainSet_X: np.ndarray = np.zeros(shape=(len(X_train), dimensions))
for i, sample in enumerate(X_train):
    trainSet_X[i] = vectorizeTweet(sample, customPreprocessor, wordVectors, dimensions)

validSet_X: np.ndarray = np.zeros(shape=(len(X_valid), dimensions))
for i, sample in enumerate(X_valid):
    trainSet_X[i] = vectorizeTweet(sample, customPreprocessor, wordVectors, dimensions)


In [25]:
class Network(nn.Module):
    def __init__(self, n_features: int, hidden_sizes: Iterable[int], n_classes: int) -> None:
        super(Network, self).__init__()
        
        self.layers = nn.ModuleList()
        n_in = n_features
        for size in hidden_sizes:
            self.layers.append(nn.Linear(n_in, size))
            n_in = size
        self.layers.append(nn.Linear(n_in, n_classes))
    
    def forward(self, input):
        last_out = input
        for layer in self.layers:
            last_out = layer(last_out)
        return last_out


In [26]:
network = Network(dimensions, [64, 32, 16], 3)
network.layers

ModuleList(
  (0): Linear(in_features=50, out_features=64, bias=True)
  (1): Linear(in_features=64, out_features=32, bias=True)
  (2): Linear(in_features=32, out_features=16, bias=True)
  (3): Linear(in_features=16, out_features=3, bias=True)
)

In [27]:
y = torch.tensor(trainLabels, dtype=torch.float)
x = torch.tensor(trainSet_X, dtype=torch.float)

dataset = TensorDataset(x, y)
trainSetLoader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

y = torch.tensor(validLabels, dtype=torch.float)
x = torch.tensor(validSet_X, dtype=torch.float)

dataset = TensorDataset(x, y)
validSetLoader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

lossFunction = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(network.parameters(), lr=LEARNING_RATE)

In [28]:
for epoch in range(EPOCHS):
    batchLosses = []

    for batchSamples, batchLabels in trainSetLoader:
        predictions = network(batchSamples)

        batchLoss = lossFunction(predictions, batchLabels.long())
        batchLosses.append(batchLoss.item())

        optimizer.zero_grad()

        batchLoss.backward()

        optimizer.step()
    
    print(f"Epoch {epoch:3}: Loss = {sum(batchLosses)/len(trainSetLoader):.5f}\r", end='')
print()



Epoch  49: Loss = 0.98904


In [49]:
pred = network(x)

multi_acc(pred, y.long())

RuntimeError: The size of tensor a (3) must match the size of tensor b (2282) at non-singleton dimension 0