In [9]:
import os
import re
import time
import pandas as pd
import numpy as np
from collections import Counter

import spacy
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
# Build the CSV path with os.path.join so the separator matches the host OS;
# a hard-coded backslash only resolves on Windows. On Windows the resulting
# string is identical to the previous literal.
dataPath = os.path.join("Data", "track-a.csv")

In [6]:
# Read the CSV located at dataPath into the DataFrame used by the later cells.
dataFrame = pd.read_csv(dataPath)

In [10]:
# Load spaCy's "en_core_web_sm" pipeline with the "parser" and "ner" components
# disabled; cleanerFunction only reads lemma_, is_stop and is_punct from tokens.
nlpModel = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [7]:
# Name of the DataFrame column that holds the input text.
textColumn = "text"
# Candidate label columns; a later cell keeps only those present in dataFrame
# and casts their values to float32.
labelColumns = ["anger", "fear", "joy", "sadness", "surprise"]

In [12]:
def cleanerFunction(text: str) -> str:
    """Lemmatize ``text`` with the module-level ``nlpModel`` and join the kept lemmas.

    Tokens flagged as stop words or punctuation, and tokens whose lemma is the
    literal string ``"-PRON-"``, are skipped. Each remaining lemma is
    lower-cased and the results are joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The space-separated, lower-cased lemmas of the kept tokens.
    """
    keptLemmas = []
    for token in nlpModel(text):
        # Guard clause: drop stop words, punctuation and "-PRON-" lemmas.
        if token.is_stop or token.is_punct or token.lemma_ == "-PRON-":
            continue
        keptLemmas.append(token.lemma_.lower())
    return " ".join(keptLemmas)

In [13]:
# Missing texts are replaced by "" before the string cast; otherwise
# astype(str) would turn NaN into the literal word "nan" and feed it to the
# lemmatizer as if it were real content. The column is addressed through
# textColumn so the name is defined in one place only.
dataFrame["Spacy_text"] = dataFrame[textColumn].fillna("").astype(str).apply(cleanerFunction)

In [14]:
# Keep only the label columns that are actually present in the frame, then
# materialise them as a float32 label matrix.
labelColumns = [
    name
    for name in ("anger", "fear", "joy", "sadness", "surprise")
    if name in dataFrame.columns
]
yData = dataFrame[labelColumns].to_numpy().astype(np.float32)

In [19]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the split repeatable.
xTrain, xVal, yTrain, yVal = train_test_split(dataFrame["Spacy_text"], yData, test_size = 0.10, random_state = 69)

In [20]:
# Convert the split texts to plain Python lists. list(...) accepts both the
# Series returned by train_test_split and an already-converted list, so this
# cell can be re-run without raising AttributeError on list.reset_index.
# Element order is the same as before: iteration follows positional order, so
# resetting the index first is unnecessary.
xTrain = list(xTrain)
xVal = list(xVal)

print(f"Number of train examples: {len(xTrain)}, number of val examples: {len(xVal)}")

Number of train examples: 2491, number of val examples: 277


In [21]:
# Compiled once and reused by simpleTokenize: matches each run of word
# characters (\w+) delimited by word boundaries (\b).
TOKEN_PATTERN = re.compile(r"\b\w+\b")

In [22]:
def simpleTokenize(text: str) -> list:
    """Lower-case ``text`` and return every ``TOKEN_PATTERN`` match, in order, as a list of strings."""
    lowered = text.lower()
    return TOKEN_PATTERN.findall(lowered)

In [23]:
def vocabularyBuilder(texts: list, vocabsize: int = 20000):
    """Build token/index lookup tables from ``texts``.

    Each text is tokenized with ``simpleTokenize`` and token frequencies are
    counted across all texts. Index 0 is reserved for ``"<pad>"`` and index 1
    for ``"<unk>"``; the remaining slots hold the most frequent tokens, most
    common first.

    Args:
        texts: Iterable of strings to count tokens from.
        vocabsize: Maximum vocabulary size, including the two special tokens.

    Returns:
        A ``(wordtoindex, indextoword)`` tuple: a dict mapping each token to
        its integer index, and the list of tokens ordered by index.
    """
    specialTokens = ["<pad>", "<unk>"]

    # Count every token of every text in a single pass.
    frequencies = Counter(tok for doc in texts for tok in simpleTokenize(doc))

    # Two slots are taken by the special tokens.
    frequentTokens = [tok for tok, _count in frequencies.most_common(vocabsize - len(specialTokens))]

    indextoword = specialTokens + frequentTokens
    wordtoindex = {word: position for position, word in enumerate(indextoword)}
    return wordtoindex, indextoword

In [24]:
# Upper bound on the vocabulary size, including the two special tokens.
MAX_VOCAB_SIZE = 20000

# The vocabulary is built from the training texts only (xVal is not passed in).
wordtoindex, indextoword = vocabularyBuilder(xTrain, vocabsize = MAX_VOCAB_SIZE)
# Actual vocabulary size: the number of entries in indextoword.
vocabSize = len(indextoword)

# Integer ids of the padding and unknown-word tokens.
padIndex = wordtoindex["<pad>"]
unkIndex = wordtoindex["<unk>"]

In [None]:
def encodePadFunction(texts: list, wordtoindex: dict, sequenceLength: int = 100) -> np.ndarray:
    """Encode each text as a fixed-length row of vocabulary ids.

    Every text is tokenized with ``simpleTokenize``. Tokens missing from
    ``wordtoindex`` map to the ``"<unk>"`` id. Rows longer than
    ``sequenceLength`` are truncated and shorter rows are right-padded with
    the ``"<pad>"`` id.

    The pad/unk ids are read from ``wordtoindex`` itself rather than from
    notebook-level globals, so the result always matches the vocabulary that
    was passed in and does not depend on which cells ran beforehand.

    Args:
        texts: Sequence of strings to encode.
        wordtoindex: Mapping from token to integer id; must contain the
            ``"<pad>"`` and ``"<unk>"`` keys.
        sequenceLength: Number of ids in every output row.

    Returns:
        An ``int64`` array of shape ``(len(texts), sequenceLength)``. Empty
        ``texts`` yields shape ``(0, sequenceLength)`` so the result is always
        two-dimensional.

    Raises:
        KeyError: If ``wordtoindex`` lacks ``"<pad>"`` or ``"<unk>"``.
    """
    padId = wordtoindex["<pad>"]
    unkId = wordtoindex["<unk>"]

    # Start from an all-padding matrix; each row's prefix is then overwritten
    # with the (possibly truncated) token ids.
    encodings = np.full((len(texts), sequenceLength), padId, dtype=np.int64)

    for row, t in enumerate(texts):
        tokens = simpleTokenize(t)[:sequenceLength]
        tokenIDs = [wordtoindex.get(tok, unkId) for tok in tokens]
        if tokenIDs:
            encodings[row, :len(tokenIDs)] = tokenIDs

    return encodings