In [1]:
import spacy
import numpy
import pandas as pd
import string

In [2]:
# Relative location of the input CSV. Forward slashes are understood by
# Python on Windows as well as on POSIX systems, whereas a hard-coded "\\"
# separator only resolves on Windows.
dataPath = "Data/track-a.csv"

In [3]:
# Read the CSV found at dataPath into a pandas DataFrame.
dataFrame = pd.read_csv(filepath_or_buffer=dataPath)

In [4]:
# Load the small English spaCy pipeline with its "parser" and "ner"
# components disabled.
nlpModel = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [5]:
def cleanerFunction(text):
    """Reduce ``text`` to a space-separated string of lowercased lemmas.

    The text is run through ``nlpModel``; tokens flagged as stop words or
    punctuation are dropped, as is any token whose lemma is the literal
    "-PRON-" (presumably the pronoun placeholder emitted by older spaCy
    releases -- confirm against the installed version).

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving lemmas, lowercased and joined with single spaces.
    """
    keptLemmas = []
    for tok in nlpModel(text):
        # Skip anything that should not contribute to the cleaned text.
        if tok.is_stop or tok.is_punct or tok.lemma_ == "-PRON-":
            continue
        keptLemmas.append(tok.lemma_.lower())
    return " ".join(keptLemmas)

In [6]:
# Lemmatise every row of the "text" column into a new "Spacy_text" column.
# Missing values are replaced by "" first: astype(str) on its own would turn
# NaN/None into the literal words "nan"/"None", which would then be cleaned
# and vectorised as if they were real text.
dataFrame["Spacy_text"] = (
    dataFrame["text"]
    .fillna("")
    .astype(str)
    .apply(cleanerFunction)
)

In [7]:
# Keep only those candidate label columns that are actually present in
# dataFrame, preserving the order listed here.
labelColumns = [
    col
    for col in ("anger", "fear", "joy", "sadness", "surprise")
    if col in dataFrame.columns
]
# Target matrix: one row per sample, one column per retained label.
y = dataFrame[labelColumns].to_numpy()

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [9]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(
    dataFrame["Spacy_text"],
    y,
    random_state=42,
    test_size=0.1,
)

In [10]:
# TF-IDF over unigrams and bigrams, capped at 5000 features.
tfidfModel = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
# The vectoriser is fitted on the training texts only; the held-out texts are
# merely transformed with it.
Xtr = tfidfModel.fit_transform(xTrain)
Xv = tfidfModel.transform(xTest)

In [11]:
# One-vs-rest wrapper around logistic regression, fitted on the TF-IDF
# training matrix (fit() returns the estimator itself, so it can be chained).
orClassifier = OneVsRestClassifier(
    LogisticRegression(max_iter=1500)
).fit(Xtr, yTrain)
# Predictions for the held-out TF-IDF matrix.
yPred = orClassifier.predict(Xv)

In [12]:
# Per-label precision/recall/F1 on the held-out split; zero_division=0.0
# reports 0 for labels that were never predicted.
print(
    classification_report(
        yTest,
        yPred,
        zero_division=0.0,
        target_names=labelColumns,
    )
)

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        34
        fear       0.67      0.85      0.75       168
         joy       1.00      0.12      0.22        48
     sadness       0.74      0.17      0.27        84
    surprise       0.79      0.13      0.23        83

   micro avg       0.69      0.41      0.52       417
   macro avg       0.64      0.25      0.29       417
weighted avg       0.69      0.41      0.43       417
 samples avg       0.55      0.40      0.44       417



In [13]:
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Split the cleaned texts again, this time as a plain Python list, using the
# same test_size and random_state as the Series-based split above.
textTrain, textValue, labelTrain, labelValue = train_test_split(
    dataFrame["Spacy_text"].tolist(),
    y,
    random_state=42,
    test_size=0.1,
)

In [15]:
# Tokenizer matching the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

In [16]:
class sentDetect(Dataset):
    """Map-style dataset pairing each text with its multi-label target row.

    Parameters
    ----------
    text : iterable of str
        Input texts. The iterable is copied into a plain list so that items
        are always looked up by position.
    label : array-like
        Targets aligned with ``text``; ``label[i]`` must return the label
        vector of the i-th text.
    token : callable
        Tokenizer invoked as ``token(text, max_length=..., truncation=True,
        padding="max_length", return_tensors="pt")`` and returning a mapping
        of tensors that carry a leading batch dimension of size 1.
    maxLength : int, default 128
        Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, text, label, token, maxLength = 128):
        # A pandas Series produced by train_test_split keeps its shuffled
        # index, so series[i] is a label lookup rather than "the i-th row".
        # Storing a list makes the integer indices positional.
        self.text = list(text)
        self.label = label
        self.token = token
        self.maxLength = maxLength

    def __len__(self):
        """Return the number of samples."""
        return len(self.text)

    def __getitem__(self, i):
        """Return the encoded i-th text plus its float label vector.

        The result holds every tensor produced by the tokenizer with the
        leading batch dimension removed, and a ``"label"`` entry containing
        the targets as a float32 tensor.
        """
        enc = self.token(
            self.text[i],
            max_length = self.maxLength,
            truncation = True,
            # "max_length" is the padding strategy that pads up to max_length.
            padding = "max_length",
            return_tensors = "pt",
        )
        # The tokenizer returns tensors of shape (1, ...); drop the batch axis
        # so that batching can stack the samples.
        chosenItem = {k: v.squeeze(0) for k, v in enc.items()}
        chosenItem["label"] = torch.tensor(self.label[i], dtype = torch.float32)
        return chosenItem

    # Backward-compatible alias for the method's previous (non-protocol) name.
    __fetchItem__ = __getitem__
    
# Build the datasets from the list-based split (textTrain/textValue with
# labelTrain/labelValue). A list is indexed by position, which is what the
# DataLoader's integer indices mean; the Series xTrain/xTest keep their
# shuffled row labels, so integer lookups on them would not be positional.
trainSD = sentDetect(textTrain, labelTrain, tokenizer)
testSD = sentDetect(textValue, labelValue, tokenizer)
# Training batches are reshuffled on every pass; evaluation order stays fixed.
trainDL = DataLoader(trainSD, batch_size = 16, shuffle = True)
testDL = DataLoader(testSD, batch_size = 32)

In [17]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    userDevice = torch.device("cuda")
else:
    userDevice = torch.device("cpu")

In [18]:
# "bert-base-uncased" with a sequence-classification head configured for
# multi-label classification, one output per entry of labelColumns.
tensorModel = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(labelColumns),
    problem_type="multi_label_classification",
)
# Move the model onto the selected device (Module.to returns the module).
tensorModel = tensorModel.to(userDevice)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Binary cross-entropy computed directly on the model's logits.
lossFunction = nn.BCEWithLogitsLoss()
# AdamW over every model parameter.
optPara = AdamW(tensorModel.parameters(), lr=2e-5)
# Scheduler horizon: three passes over trainDL (presumably the intended
# number of epochs -- confirm against the loop that calls epochTrain).
steps = 3 * len(trainDL)
# Linear schedule with no warm-up steps over that horizon.
schedule = get_linear_schedule_with_warmup(
    optPara,
    num_warmup_steps=0,
    num_training_steps=steps,
)

In [21]:
def epochTrain():
    """Run one pass over ``trainDL`` and return the mean batch loss.

    Puts ``tensorModel`` in training mode, then for every batch: clears the
    gradients, moves the inputs and labels to ``userDevice``, computes
    ``lossFunction`` on the model's logits, back-propagates, and steps both
    the optimizer (``optPara``) and the scheduler (``schedule``).

    Returns
    -------
    float
        Sum of the per-batch loss values divided by the number of batches.
    """
    tensorModel.train()
    runningLoss = 0
    for batch in trainDL:
        optPara.zero_grad()

        inputIds = batch["input_ids"].to(userDevice)
        attnMask = batch["attention_mask"].to(userDevice)
        targets = batch["label"].to(userDevice)

        logits = tensorModel(inputIds, attention_mask=attnMask).logits
        batchLoss = lossFunction(logits, targets)

        batchLoss.backward()
        optPara.step()
        # The scheduler is advanced once per batch, after the optimizer.
        schedule.step()

        runningLoss += batchLoss.item()

    return runningLoss / len(trainDL)