In [1]:
import spacy
import numpy
import pandas as pd
import string

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
# A forward slash is used because it is accepted as a path separator on
# Windows, Linux and macOS, whereas a backslash only works on Windows.
dataPath = "Data/track-a.csv"

In [3]:
# Read the CSV file referenced by `dataPath` into a pandas DataFrame.
dataFrame = pd.read_csv(
    filepath_or_buffer=dataPath,
)

In [5]:
# Load the "en_core_web_sm" spaCy pipeline with the "parser" and "ner"
# components disabled; the cleaning step only reads token-level attributes.
nlpModel = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [6]:
def cleanerFunction(text):
    """Normalise one piece of text into a space-separated string of lemmas.

    The text is run through the module-level ``nlpModel`` pipeline. Stop
    words, punctuation, whitespace-only tokens and tokens whose lemma is the
    ``"-PRON-"`` placeholder are dropped; every remaining lemma is lower-cased.

    Args:
        text: The raw string to clean.

    Returns:
        str: The surviving lemmas joined by single spaces, or ``""`` when no
        token survives the filters.
    """
    doc = nlpModel(text)
    lemmas = []
    for tok in doc:
        # Whitespace runs (double spaces, newlines, tabs) are emitted as their
        # own tokens; they are neither stop words nor punctuation, so without
        # the is_space check they would leak stray whitespace into the output.
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        # "-PRON-" is the placeholder lemma older (v2) spaCy models assign to
        # pronouns; the check is kept for compatibility with such models.
        if tok.lemma_ == "-PRON-":
            continue
        lemmas.append(tok.lemma_.lower())
    return " ".join(lemmas)

In [7]:
# Cast the "text" column to str, pass every entry through cleanerFunction and
# store the cleaned strings in a new "Spacy_text" column.
dataFrame["Spacy_text"] = (
    dataFrame["text"]
    .astype(str)
    .apply(cleanerFunction)
)

In [8]:
# Keep only the emotion label columns that are actually present in the frame;
# any of the five names missing from the CSV is silently skipped.
labelColumns = [
    name
    for name in ("anger", "fear", "joy", "sadness", "surprise")
    if name in dataFrame.columns
]
# Label matrix as a NumPy array, one column per entry in labelColumns.
y = dataFrame[labelColumns].values

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
# Hold out 10% of the cleaned texts (and their label rows) for evaluation.
# random_state is fixed so the split is the same on every run.
xTrain, xTest, yTrain, yTest = train_test_split(
    dataFrame["Spacy_text"],
    y,
    test_size=0.1,
    random_state=42,
)

In [19]:
# TF-IDF features over unigrams and bigrams, capped at 5000 features.
tfidfModel = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)
# Learn the vocabulary/IDF weights on the training texts only, then reuse the
# fitted vectorizer to transform the held-out texts.
Xtr = tfidfModel.fit_transform(xTrain)
Xv = tfidfModel.transform(xTest)

In [20]:
# One-vs-rest wrapper around logistic regression (iteration cap raised to
# 1500), fitted on the TF-IDF training matrix and the label matrix.
orClassifier = OneVsRestClassifier(
    LogisticRegression(max_iter=1500)
)
orClassifier.fit(Xtr, yTrain)
# Predicted labels for the held-out TF-IDF matrix.
yPred = orClassifier.predict(Xv)

In [None]:
# Per-label precision/recall/F1 on the held-out split.
# NOTE(review): zero_division=1.0 makes undefined metrics (no predicted or no
# true positives for a label/sample) count as 1.0 — confirm this is intended,
# since it can raise the reported averages.
print(
    classification_report(
        yTest,
        yPred,
        target_names=labelColumns,
        zero_division=1.0,
    )
)

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        34
        fear       0.67      0.85      0.75       168
         joy       1.00      0.12      0.22        48
     sadness       0.74      0.17      0.27        84
    surprise       0.79      0.13      0.23        83

   micro avg       0.69      0.41      0.52       417
   macro avg       0.64      0.25      0.29       417
weighted avg       0.69      0.41      0.43       417
 samples avg       0.55      0.40      0.44       417



In [16]:
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamWeightDecay, get_linear_schedule_with_warmup

In [17]:
# Second 90/10 split for the transformer model, this time on a plain Python
# list of strings; same test_size and random_state as the TF-IDF split.
# NOTE(review): this feeds the lemmatised, stop-word-stripped "Spacy_text" to
# a BERT tokenizer — confirm that is intended rather than the raw "text".
textTrain, textValue, labelTrain, labelValue = train_test_split(
    dataFrame["Spacy_text"].tolist(),
    y,
    test_size=0.1,
    random_state=42,
)

In [18]:
# Fetch (or load from the local cache) the tokenizer that matches the
# "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
