In [1]:
from typing import Callable, Iterable, List, Dict, Tuple
from numbers import Number
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import TweetTokenizer

import torch
from torch import FloatTensor
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.data import Field, TabularDataset, BucketIterator


In [2]:
TRAIN_SET_PATH = "./vaccine_train_set.csv"
VALIDATION_SET_PATH = "./vaccine_validation_set.csv"
EMBEDDINGS_PATH = '/mnt/c/Users/pavlo/Downloads/glove.6B.50d.txt'

In [3]:
def createWordVectors(embeddingsPath: str):
    wordVectors: Dict[str, np.ndarray] = {}
    lineElements: List[str] = []
    with open(embeddingsPath) as file:
        # Read file line by line
        for line in file:
            # Remove new line and split
            lineElements = line.replace('\n', '').split()
            # The first element in the line is the target word
            word = lineElements.pop(0)
            # The other elements represent the vector of the word
            wordVector = np.array([float(w) for w in lineElements])
            # Store the vector for this word
            wordVectors[word] = wordVector
    dimensions = len(lineElements)

    return wordVectors, dimensions

wordVectors, dimensions = createWordVectors(EMBEDDINGS_PATH)

In [4]:
def customPreprocessor(text: str):    
    # remove url's
    trimmedText = re.sub(r'https?://\S+|www\.\S+|#', '', text).lower()

    # remove @ mentions and numbers
    res = list()
    wait_whitespace = False
    for c in trimmedText:
        if wait_whitespace:
            if c == " ":
                wait_whitespace = False
            continue
        elif re.match("[0-9]", c) or c == "@":
            wait_whitespace = True
            continue            
        res.append(c)
    
    return ''.join(res)

tokenizer = TweetTokenizer()
def customTokenizer(text: str):
    return tokenizer.tokenize(customPreprocessor(text))


In [5]:
# nltk.TweetTokenizer is used for tweet tokenization

def vectorizeTweet(tweet: str, preprocessor: Callable[[str], str], wordVectors: Dict[str, np.ndarray], dimensions: int) -> np.ndarray:
    # Split the tweet into words/tokens
    words = tokenizer.tokenize(preprocessor(tweet))
    # words = tokenizer.tokenize(tweet)

    # The sum of the vectors of the tweet words is stored here
    vector: np.ndarray = np.zeros(dimensions)
    for word in words:
        # Get the word/token pre-trained vector
        wordVector = wordVectors.get(word)
        if wordVector is not None:
            # If found, add to the tweet vector
            vector += wordVector
    
    # return the mean vector
    return vector / len(words)

def vectorizeDataSet(dataSet: Iterable[str], dimensions: int, wordVectors: Dict[str, np.ndarray]):
    matrix: np.ndarray = np.zeros(shape=(len(dataSet), dimensions))
    for i, sample in enumerate(dataSet):
        matrix[i] = vectorizeTweet(sample, customPreprocessor, wordVectors, dimensions)
    
    return matrix

In [6]:
labelField = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
tweetField = Field(tokenize=customTokenizer, include_lengths=True, batch_first=True)
fields = [('', None), ('tweet', tweetField), ('label', labelField)]

trainDataset = TabularDataset(path=TRAIN_SET_PATH, format='CSV', fields=fields, skip_header=True,)
validDataset = TabularDataset(path=VALIDATION_SET_PATH, format='CSV', fields=fields, skip_header=True)

trainIterator = BucketIterator(trainDataset, batch_size=32)
validIterator = BucketIterator(validDataset, batch_size=32)

In [7]:
tweetField.build_vocab(trainDataset)
len(tweetField.vocab)

20296

In [8]:
matrixNumWords = len(tweetField.vocab)
embeddingsMatrix = torch.zeros(matrixNumWords, dimensions, dtype=torch.float)

for i, word in enumerate(tweetField.vocab.itos):
    wordVector = wordVectors.get(word)

    if wordVector is not None:
        embeddingsMatrix[i] = torch.from_numpy(wordVector)

In [9]:
tweetField.vocab.stoi['the']

3

In [10]:
emb = nn.Embedding.from_pretrained(embeddingsMatrix)
# emb(torch.LongTensor([tweetField.vocab.stoi]))

In [None]:
for ((tweets, tweetsLen), labels), _ in trainIterator:
    # tweets: Tensor with the tweets
    # each tweet is a Tensor with the corresponding word indexes as values

    # tweetsLen: Tensor, contains the length of the corresponding index tweet in `tweets`

    embout = emb(tweets)
    # embout: Tensor.
    # Each element is a 2-D Tensor of a tweet. Each Tensor in it is the vector of the corresponding word.
    pass

In [10]:
class StackedRNN(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, input):
        pass