In [50]:
import math
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from typing import Callable, Iterable

In [51]:
TRAIN_SET_PATH = "./vaccine_train_set.csv"
VALIDATION_SET_PATH = "./vaccine_validation_set.csv"
EMBEDDINGS_PATH = './glove.6B.50d.txt'

In [52]:
wordVectors: "dict[str, np.ndarray]" = {}
lineElements: "list[str]" = []
with open(EMBEDDINGS_PATH) as file:
    for line in file:
        lineElements = line.replace('\n', '').split()
        word = lineElements.pop(0)
        wordVector = np.array([float(w) for w in lineElements])
        wordVectors[word] = wordVector
dimensions = len(lineElements)

In [53]:
trainDF = pd.read_csv('./vaccine_train_set.csv')
trainDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [54]:
validDF = pd.read_csv(VALIDATION_SET_PATH)
validDF.isnull().sum()

Unnamed: 0    0
tweet         0
label         0
dtype: int64

In [55]:
features = trainDF.drop(['label', 'Unnamed: 0'], axis=1)
X_train = [ar[0] for ar in features.values]
trainLabels = trainDF['label'].values

features = validDF.drop(['label', 'Unnamed: 0'], axis=1)
X_valid = [ar[0] for ar in features.values]
validLabels = validDF['label'].values

In [56]:
def customPreprocessor(text: str):    
    # remove url's
    trimmedText = re.sub(r'https?://\S+|www\.\S+|#', '', text).lower()

    # remove @ mentions and numbers
    res = list()
    wait_whitespace = False
    for c in trimmedText:
        if wait_whitespace:
            if c == " ":
                wait_whitespace = False
            continue
        elif re.match("[0-9]", c) or c == "@":
            wait_whitespace = True
            continue            
        res.append(c)
    
    return ''.join(res)

In [57]:
def vectorizeTweet(tweet: str, preprocessor: Callable[[str], str], wordVectors: "dict[str, np.ndarray]", dimensions: int) -> np.ndarray:
    words = preprocessor(tweet).split()
    vector: np.ndarray = np.zeros(dimensions)
    for word in words:
        vector += wordVectors[word]
    return vector / len(words)

In [64]:
class Network(nn.Module):
    def __init__(self, n_features: int, n_classes: int, hidden_sizes: Iterable[int]) -> None:
        super(Network, self).__init__()
        
        self.layers: "list[nn.Linear]" = []
        n_in = n_features
        for size in hidden_sizes:
            self.layers.append(nn.Linear(n_in, size))
            n_in = size
        self.layers.append(nn.Linear(n_in, n_classes))
    
    def forward(self, input):
        last_out = input
        for layer in self.layers:
            last_out = layer(last_out)
        return last_out


In [65]:
network = Network(50, 3, [])
network.layers

[Linear(in_features=50, out_features=3, bias=True)]

In [None]:
vectorizeTweet('the 68 world', customPreprocessor, wordVectors, dimensions)

In [61]:
# df = pd.DataFrame([[1, 2, 3, 4], [0, 0, 4, 8]])
# Min = df.min()
# df = (df-Min)/(df.max() - Min)
# df

In [62]:
# l = [1, 2, 3, 4]
# Max = max(l)
# Min = min(l)
# for i, element in enumerate(l):
#     l[i] = (element - Min)/(Max - Min)
# l