In [54]:
import spacy
import numpy
import pandas as pd


In [55]:
# Location of the input CSV, relative to the notebook's working directory.
# A forward slash resolves on Windows, Linux and macOS alike; the previous
# backslash-separated literal was only a valid path on Windows.
dataPath = "Data/track-a.csv"

In [56]:
# Read the CSV at dataPath into a DataFrame; later cells read its "text"
# column and the emotion label columns.
dataFrame = pd.read_csv(dataPath)

In [57]:
# Load spaCy's small English pipeline with the "parser" and "ner" components
# disabled; cleanerFunction only reads lemma, stop-word and punctuation flags.
nlpModel = spacy.load("en_core_web_sm", disable=["parser","ner"])

KeyboardInterrupt: 

In [None]:
def cleanerFunction(text):
    """Return `text` reduced to lower-cased lemmas joined by single spaces.

    The text is run through `nlpModel`; stop words, punctuation tokens and
    tokens whose lemma is the "-PRON-" placeholder are dropped.

    Args:
        text: The string to clean.

    Returns:
        A single string of the surviving lemmas, lower-cased, space-separated.
    """
    keptLemmas = []
    for tok in nlpModel(text):
        # Skip tokens that carry no content for the downstream models.
        if tok.is_stop or tok.is_punct:
            continue
        if tok.lemma_ == "-PRON-":
            continue
        keptLemmas.append(tok.lemma_.lower())
    return " ".join(keptLemmas)

In [None]:
# Clean every row of the "text" column into a new "Spacy_text" column.
# Missing entries are replaced with "" first: astype(str) alone would turn a
# NaN into the literal word "nan", which would then be lemmatised and fed to
# the models as if it were real text.
dataFrame["Spacy_text"] = dataFrame["text"].fillna("").astype(str).apply(cleanerFunction)

In [None]:
# Keep, in this fixed order, only the emotion labels that exist as columns of
# dataFrame, then extract them as the target array `y`.
labelColumns = list(
    filter(
        lambda name: name in dataFrame,
        ["anger", "fear", "joy", "sadness", "surprise"],
    )
)
y = dataFrame[labelColumns].values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Hold out 10% of the rows for evaluation. random_state is fixed so the same
# split is produced on every run.
xTrain, xTest, yTrain, yTest = train_test_split(dataFrame["Spacy_text"], y, test_size = 0.1, random_state = 42)

In [None]:
# TF-IDF over unigrams and bigrams, capped at 5000 features.
tfidfModel = TfidfVectorizer(max_features = 5000, ngram_range = (1, 2))
# The vocabulary and IDF weights are fitted on the training texts only ...
Xtr = tfidfModel.fit_transform(xTrain)
# ... and merely applied to the held-out texts.
Xv = tfidfModel.transform(xTest)

In [None]:
# Baseline: one logistic-regression classifier per label column (one-vs-rest),
# trained on the TF-IDF features and used to predict the held-out rows.
orClassifier = OneVsRestClassifier(LogisticRegression(max_iter = 1500))
orClassifier.fit(Xtr, yTrain)
yPred = orClassifier.predict(Xv)

In [None]:
# Per-label report for the baseline on the held-out rows. zero_division = 0.0
# makes an undefined metric (no predicted or no true samples) show as 0.0.
print(classification_report(yTest, yPred, target_names = labelColumns, zero_division = 0.0))

In [None]:
import torch, torch.nn as nn
import time
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup


In [None]:
# Second split, called with the same data, test_size and random_state as the
# xTrain/xTest split earlier in the notebook.
# NOTE(review): the Dataset objects built further down are constructed from
# xTrain/xTest/yTrain/yTest, not from these variables - confirm whether this
# split is still needed.
textTrain, textValue, labelTrain, labelValue = train_test_split(dataFrame["Spacy_text"], y, test_size = 0.1, random_state = 42)

In [None]:
# Turn both text splits into plain Python lists; the index is discarded first
# so only the values, in split order, remain.
textTrain = textTrain.reset_index(drop = True).tolist()
textValue = textValue.reset_index(drop = True).tolist()

In [None]:
# Tokenizer belonging to the "bert-base-uncased" checkpoint; it is handed to
# the sentDetect datasets to encode each text.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class sentDetect(Dataset):
    """Map-style dataset pairing each text with its multi-label target row.

    Args:
        text: Texts to encode. May be a pandas Series (looked up by position,
            so a shuffled or non-contiguous index is fine) or any plain
            sequence such as a list.
        label: Per-sample label rows, indexable the same way as `text`
            (e.g. a 2-D numpy array, a list of lists, or a DataFrame).
        token: Callable tokenizer invoked as
            ``token(text, max_length=..., truncation=True,
            padding="max_length", return_tensors="pt")`` and returning a
            mapping of tensors with a leading batch dimension of 1.
        maxLength: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, text, label, token, maxLength = 128):
        self.text = text
        self.label = label
        self.token = token
        self.maxLength = maxLength

    def __len__(self):
        """Number of samples, taken from the text container."""
        return len(self.text)

    @staticmethod
    def _rowAt(container, i):
        """Return the i-th entry by position for pandas objects and sequences."""
        # pandas objects keep the index labels they had before a split, so
        # container[i] would be a label lookup there; .iloc is positional.
        if hasattr(container, "iloc"):
            return container.iloc[i]
        return container[i]

    def __getitem__(self, i):
        """Encode sample `i`.

        Returns:
            dict with every tensor produced by the tokenizer (batch dimension
            squeezed away) plus "label", a float32 tensor of the label row.
        """
        textchanged = self._rowAt(self.text, i)

        enc = self.token(
            textchanged, max_length = self.maxLength, truncation = True, padding = "max_length", return_tensors = "pt"
        )
        # The tokenizer returns shape (1, maxLength); drop the batch axis so
        # the DataLoader can stack samples itself.
        chosenItem = {k: v.squeeze(0) for k, v in enc.items()}
        # Explicit float32 conversion replaces the legacy torch.FloatTensor
        # constructor; BCEWithLogitsLoss needs floating-point targets.
        labelRow = numpy.asarray(self._rowAt(self.label, i))
        chosenItem["label"] = torch.as_tensor(labelRow, dtype = torch.float32)
        return chosenItem
    
# Wrap the xTrain/yTrain and xTest/yTest splits as datasets that tokenize on
# access with the BERT tokenizer.
trainSD = sentDetect(xTrain, yTrain, tokenizer)
testSD = sentDetect(xTest, yTest, tokenizer)
# Training batches of 16 are reshuffled every epoch; the held-out loader
# iterates in order with batches of 32.
trainDL = DataLoader(trainSD, batch_size = 16, shuffle = True)
testDL = DataLoader(testSD, batch_size = 32)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
userDevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the "bert-base-uncased" checkpoint with a sequence-classification head
# that has one output per label column, configured for multi-label
# classification, and move it to the selected device.
tensorModel = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type = "multi_label_classification",
    num_labels = len(labelColumns)
).to(userDevice)

In [None]:
# Sigmoid + binary cross-entropy applied to each label's logit independently.
lossFunction = nn.BCEWithLogitsLoss()

# Total number of optimizer steps the linear schedules are built for.
# Every training run in this notebook iterates 5 epochs; with the previous
# factor of 3 the learning rate had already decayed to zero after the third
# epoch, so the last two epochs could not change the weights.
steps = len(trainDL) * 5

In [None]:
# Three AdamW optimizers that differ only in learning rate (1e-5, 2e-5, 5e-5).
# NOTE(review): all three are bound to the parameters of the same tensorModel
# instance, so they update one shared set of weights.
optPara = AdamW(tensorModel.parameters(), lr = 1e-5, weight_decay = 0.01)
optPara1 = AdamW(tensorModel.parameters(), lr = 2e-5, weight_decay = 0.01)
optPara2 = AdamW(tensorModel.parameters(), lr = 5e-5, weight_decay = 0.01)

In [None]:
# Warm-up lengths for the three schedules: 10%, 30% and 50% of `steps`,
# truncated to whole steps.
warmup_steps = int(0.1 * steps)
warmup_steps1 = int(0.3 * steps)
warmup_steps2 = int(0.5 * steps)

In [None]:
# One schedule per optimizer: the learning rate rises linearly during the
# warm-up steps, then falls linearly, reaching zero at step `steps`.
schedule = get_linear_schedule_with_warmup(optPara, warmup_steps, steps)
schedule1 = get_linear_schedule_with_warmup(optPara1, warmup_steps1, steps)
schedule2 = get_linear_schedule_with_warmup(optPara2, warmup_steps2, steps)

In [None]:
# NOTE(review): clip_grad_norm_ rescales gradients that already exist. When
# the notebook is run top to bottom no backward pass has happened before this
# cell, so this one-off call has nothing to clip. For clipping to influence
# training it has to run between loss.backward() and the optimizer step.
torch.nn.utils.clip_grad_norm_(tensorModel.parameters(), max_norm = 1.0)

In [None]:
def epochTrain(oPara, selectSch, maxGradNorm = 1.0):
    """Train `tensorModel` for one pass over `trainDL`.

    Args:
        oPara: Optimizer that updates the model parameters.
        selectSch: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        maxGradNorm: Upper bound for the global gradient norm, applied to
            every batch before the optimizer step. Pass None to disable
            clipping.

    Returns:
        The mean of the per-batch training losses for this epoch.
    """
    tensorModel.train()
    total = 0
    for batch in trainDL:
        oPara.zero_grad()
        inputIds = batch["input_ids"].to(userDevice)
        mask = batch["attention_mask"].to(userDevice)
        labs = batch["label"].to(userDevice)
        outs = tensorModel(inputIds, attention_mask = mask).logits
        loss = lossFunction(outs, labs)
        loss.backward()
        # Clipping only has an effect here: after the gradients have been
        # computed and before the optimizer consumes them.
        if maxGradNorm is not None:
            torch.nn.utils.clip_grad_norm_(tensorModel.parameters(), max_norm = maxGradNorm)
        oPara.step()
        selectSch.step()
        total += loss.item()

    return total / len(trainDL)

In [None]:
def epochEvaluate():
    """Score `tensorModel` on every batch of `testDL` without tracking gradients.

    Returns:
        A pair ``(meanLoss, allLogits)``: the mean of the per-batch losses and
        the logits of all batches stacked row-wise into one numpy array.
    """
    tensorModel.eval()
    lossSum = 0
    batchLogits = []
    with torch.no_grad():
        for batch in testDL:
            tokenIds = batch["input_ids"].to(userDevice)
            attnMask = batch["attention_mask"].to(userDevice)
            targets = batch["label"].to(userDevice)
            predicted = tensorModel(tokenIds, attention_mask = attnMask).logits
            batchLoss = lossFunction(predicted, targets)
            lossSum += batchLoss.item()
            # Move the logits to host memory so they can be stacked with numpy.
            batchLogits.append(predicted.cpu().numpy())
    meanLoss = lossSum / len(testDL)
    return meanLoss, numpy.vstack(batchLogits)

In [None]:
# Run 1: optimizer optPara (lr = 1e-5) with schedule (10% warm-up).
print("Combination 1:\n")
for e in range(5):
    start_time = time.time()
    tl = epochTrain(optPara, schedule)
    # vl is the mean held-out loss, lg the stacked held-out logits.
    vl, lg = epochEvaluate()
    # end_time holds the elapsed seconds for training plus evaluation.
    end_time = time.time() - start_time
    print(f"Epoch: {e + 1}: train_loss = {tl : .4f}, value_loss = {vl : .4f}, time_taken = {end_time : .4f} seconds")

KeyboardInterrupt: 

In [None]:
# Run 2: optimizer optPara1 (lr = 2e-5) with schedule1 (30% warm-up).
print("Combination 2:\n")
# Every optimizer in this notebook is bound to the same tensorModel, so
# without a reset this run would continue from the weights an earlier run left
# behind and the learning-rate/warm-up settings could not be compared.
# The pretrained weights are copied into the existing parameters in place, so
# optPara1 keeps pointing at the tensors it was created with.
tensorModel.load_state_dict(
    AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        problem_type = "multi_label_classification",
        num_labels = len(labelColumns)
    ).state_dict()
)
for e in range(5):
    start_time = time.time()
    tl = epochTrain(optPara1, schedule1)
    # vl is the mean held-out loss, lg the stacked held-out logits.
    vl, lg = epochEvaluate()
    # end_time holds the elapsed seconds for training plus evaluation.
    end_time = time.time() - start_time
    print(f"Epoch: {e + 1}: train_loss = {tl : .4f}, value_loss = {vl : .4f}, time_taken = {end_time : .4f} seconds")

In [None]:
# Run 3: optimizer optPara2 (lr = 5e-5) with schedule2 (50% warm-up).
print("Combination 3:\n")
# Every optimizer in this notebook is bound to the same tensorModel, so
# without a reset this run would continue from the weights an earlier run left
# behind and the learning-rate/warm-up settings could not be compared.
# The pretrained weights are copied into the existing parameters in place, so
# optPara2 keeps pointing at the tensors it was created with.
tensorModel.load_state_dict(
    AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        problem_type = "multi_label_classification",
        num_labels = len(labelColumns)
    ).state_dict()
)
for e in range(5):
    start_time = time.time()
    tl = epochTrain(optPara2, schedule2)
    # vl is the mean held-out loss, lg the stacked held-out logits.
    vl, lg = epochEvaluate()
    # end_time holds the elapsed seconds for training plus evaluation.
    end_time = time.time() - start_time
    print(f"Epoch: {e + 1}: train_loss = {tl : .4f}, value_loss = {vl : .4f}, time_taken = {end_time : .4f} seconds")

# The mixed-precision (GradScaler/autocast) training loop below performed worse than the runs above, so it is kept commented out.

In [None]:
# from torch.amp import GradScaler, autocast

In [None]:
# scalerFunction = GradScaler()

# for epoch in range(3):
#     start_time = time.time()
#     tensorModel.train()
#     total_loss = 0.0

#     for batch in trainDL:
#         inputids = batch["input_ids"].to(userDevice)
#         masks = batch["attention_mask"].to(userDevice)
#         labels = batch["label"].to(userDevice)

#         optPara.zero_grad()

#         with autocast(device_type = "cuda"):
#             logits = tensorModel(inputids, attention_mask = masks).logits
#             loss = lossFunction(logits, labels)

#         scalerFunction.scale(loss).backward()

#         scalerFunction.unscale_(optPara)
#         torch.nn.utils.clip_grad_norm_(tensorModel.parameters(), max_norm = 1.0)

#         scalerFunction.step(optPara)
#         scalerFunction.update()

#         total_loss += loss.item()
    
#     averageLoss = total_loss / len(trainDL)
#     end_time = time.time() - start_time
#     print(f"Epoch {epoch + 1} ; train_loss: {averageLoss : .4f} ; time_taken: {end_time : .4f}")