In [1]:
import spacy
import numpy
import pandas as pd


In [2]:
dataPath = "Data\\track-a.csv"

In [3]:
dataFrame = pd.read_csv(dataPath)

In [4]:
nlpModel = spacy.load("en_core_web_sm", disable=["parser","ner"])

In [5]:
def cleanerFunction(text):
    tempDoc = nlpModel(text)
    token = [
        tok.lemma_.lower()
        for tok in tempDoc
        if not tok.is_stop and not tok.is_punct and tok.lemma_ != "-PRON-"
    ]
    return " ".join(token)

In [6]:
dataFrame["Spacy_text"] = dataFrame["text"].astype(str).apply(cleanerFunction)

In [7]:
labelColumns = [col for col in ["anger", "fear", "joy", "sadness", "surprise"] if col in dataFrame]
y = dataFrame[labelColumns].values

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [9]:
xTrain, xTest, yTrain, yTest = train_test_split(dataFrame["Spacy_text"], y, test_size = 0.1, random_state = 42)

In [10]:
tfidfModel = TfidfVectorizer(max_features = 5000, ngram_range = (1, 2))
Xtr = tfidfModel.fit_transform(xTrain)
Xv = tfidfModel.transform(xTest)

In [11]:
orClassifier = OneVsRestClassifier(LogisticRegression(max_iter = 1500))
orClassifier.fit(Xtr, yTrain)
yPred = orClassifier.predict(Xv)

In [12]:
print(classification_report(yTest, yPred, target_names = labelColumns, zero_division = 0.0))

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        34
        fear       0.67      0.85      0.75       168
         joy       1.00      0.12      0.22        48
     sadness       0.74      0.17      0.27        84
    surprise       0.79      0.13      0.23        83

   micro avg       0.69      0.41      0.52       417
   macro avg       0.64      0.25      0.29       417
weighted avg       0.69      0.41      0.43       417
 samples avg       0.55      0.40      0.44       417



In [53]:
import torch, torch.nn as nn
import time
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
import optuna
from torch.amp import autocast
from torch.cuda.amp import GradScaler
from torch.nn.functional import sigmoid

In [14]:
textTrain, textValue, labelTrain, labelValue = train_test_split(dataFrame["Spacy_text"], y, test_size = 0.1, random_state = 42)

In [15]:
textTrain = textTrain.reset_index(drop = True)
textValue = textValue.reset_index(drop = True)

textValue = textValue.tolist()
textTrain = textTrain.tolist()

In [16]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [17]:
class sentDetect(Dataset):
    def __init__(self, text, label, token, maxLength = 128):
        self.text = text
        self.label = label
        self.token = token
        self.maxLength = maxLength

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, i):
        textchanged = self.text.iloc[i]

        enc = self.token(
            textchanged, max_length = self.maxLength, truncation = True, padding = "max_length", return_tensors = "pt"
        )
        chosenItem = {k: v.squeeze(0) for k, v in enc.items()}
        chosenItem["label"] = torch.FloatTensor(self.label[i])
        return chosenItem
    
trainSD = sentDetect(xTrain, yTrain, tokenizer)
testSD = sentDetect(xTest, yTest, tokenizer)
trainDL = DataLoader(trainSD, batch_size = 16, shuffle = True)
testDL = DataLoader(testSD, batch_size = 32)

In [18]:
userDevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
tensorModel = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type = "multi_label_classification",
    num_labels = len(labelColumns)
).to(userDevice)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
lossFunction = nn.BCEWithLogitsLoss()

steps = len(trainDL) * 3

In [21]:
# optPara = AdamW(tensorModel.parameters(), lr = 1e-5, weight_decay = 0.01)
# optPara1 = AdamW(tensorModel.parameters(), lr = 3e-5, weight_decay = 0.01)
# optPara2 = AdamW(tensorModel.parameters(), lr = 5e-5, weight_decay = 0.01)

In [22]:
# warmup_steps = int(0.1 * steps)
# warmup_steps1 = int(0.3 * steps)
# warmup_steps2 = int(0.5 * steps)

In [23]:
# schedule = get_linear_schedule_with_warmup(optPara, warmup_steps, steps)
# schedule1 = get_linear_schedule_with_warmup(optPara1, warmup_steps1, steps)
# schedule2 = get_linear_schedule_with_warmup(optPara2, warmup_steps2, steps)

In [24]:
torch.nn.utils.clip_grad_norm_(tensorModel.parameters(), max_norm = 1.0)

tensor(0.)

In [25]:
# def epochTrain(oPara, selectSch):
#     tensorModel.train()
#     total = 0
#     for batch in trainDL:
#         oPara.zero_grad()
#         id = batch["input_ids"].to(userDevice)
#         mask = batch["attention_mask"].to(userDevice)
#         labs = batch["label"].to(userDevice)
#         outs = tensorModel(id, attention_mask = mask).logits
#         loss = lossFunction(outs, labs)
#         loss.backward()
#         oPara.step()
#         selectSch.step()
#         total += loss.item()
    
#     return total / len(trainDL)

In [26]:
# def epochEvaluate():
#     tensorModel.eval()
#     total = 0
#     logits = []
#     with torch.no_grad():
#         for batch in testDL:
#             id = batch["input_ids"].to(userDevice)
#             mask = batch["attention_mask"].to(userDevice)
#             labs = batch["label"].to(userDevice)
#             outs = tensorModel(id, attention_mask = mask).logits
#             total += lossFunction(outs, labs).item()
#             logits.append(outs.cpu().numpy())
#     return total / len(testDL), numpy.vstack(logits)

In [61]:
def epochTrain(optimizer, scheduler, use_amp = False):
    tensorModel.train()
    running_loss = 0.0

    for batch in trainDL:
        tensorModel.train()
        total_loss = 0.0

        for batch in trainDL:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(userDevice)
            attention_mask = batch["attention_mask"].to(userDevice)
            labels = batch["label"].to(userDevice)

            with autocast(userDevice.type):
                logits = tensorModel(input_ids, attention_mask=attention_mask).logits
                loss   = lossFunction(logits, labels)

            GradScaler.scale(loss).backward()
            GradScaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(tensorModel.parameters(), max_norm = 1.0)

            GradScaler.step(optimizer)
            GradScaler.update()
            scheduler.step()

            total_loss += loss.item()

        return total_loss / len(trainDL)

In [55]:
def epochEvaluate():
    tensorModel.eval()
    total_loss = 0.0
    all_logits = []

    with torch.no_grad():
        for batch in testDL:
            input_ids = batch["input_ids"].to(userDevice)
            attention_mask = batch["attention_mask"].to(userDevice)
            labels = batch["label"].to(userDevice)

            outputs = tensorModel(input_ids, attention_mask = attention_mask)
            logits  = outputs.logits

            total_loss += lossFunction(logits, labels).item()

            all_logits.append(logits.cpu().numpy())

    avg_loss = total_loss / len(testDL)
    all_logits = numpy.vstack(all_logits)

    return avg_loss, all_logits

In [29]:
# print("Combination 1:\n")
# for e in range(5):
#     start_time = time.time()
#     tl = epochTrain(optPara, schedule)
#     vl, lg = epochEvaluate()
#     end_time = time.time() - start_time
#     print(f"Epoch: {e + 1}: train_loss = {tl : .4f}, value_loss = {vl : .4f}, time_taken = {end_time : .4f} seconds")

In [30]:
# print("Combination 2:\n")
# for e in range(5):
#     start_time = time.time()
#     tl = epochTrain(optPara1, schedule1)
#     vl, lg = epochEvaluate()
#     end_time = time.time() - start_time
#     print(f"Epoch: {e + 1}: train_loss = {tl : .4f}, value_loss = {vl : .4f}, time_taken = {end_time : .4f} seconds")

In [31]:
# print("Combination 3:\n")
# for e in range(5):
#     start_time = time.time()
#     tl = epochTrain(optPara2, schedule2)
#     vl, lg = epochEvaluate()
#     end_time = time.time() - start_time
#     print(f"Epoch: {e + 1}: train_loss = {tl : .4f}, value_loss = {vl : .4f}, time_taken = {end_time : .4f} seconds")

In [60]:
import optuna 

def objective(trial):
    learnRate = trial.suggest_loguniform("lr", 5e-6, 1e-5)
    weightDecay = trial.suggest_loguniform("wd", 1e-4, 1e-1)
    warmup_steps = trial.suggest_loguniform("wup", (0.1 * steps), (0.5 * steps))
    optimizer = AdamW(tensorModel.parameters(), lr = learnRate, weight_decay = weightDecay)

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = steps)

    for epoch in range(5):
        start_time = time.time()
        tl = epochTrain(optimizer, scheduler, True)
        vl, lg = epochEvaluate()
        end_time = time.time() - start_time
        print(f"Epoch: {epoch + 1}: train_loss = {tl : .4f}, value_loss = {vl : .4f}, time_taken = {end_time : .4f} seconds")

study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 20)
print("Best Parameters:", study.best_params)

[I 2025-05-23 17:17:08,706] A new study created in memory with name: no-name-157e8dbb-7b0e-4d31-841a-08d4f80b12e0
  learnRate = trial.suggest_loguniform("lr", 5e-6, 1e-5)
  weightDecay = trial.suggest_loguniform("wd", 1e-4, 1e-1)
  warmup_steps = trial.suggest_loguniform("wup", (0.1 * steps), (0.5 * steps))
[W 2025-05-23 17:17:08,724] Trial 0 failed with parameters: {'lr': 7.4885917921086435e-06, 'wd': 0.0014306173787849757, 'wup': 68.68924922140474} because of the following error: TypeError("autocast.__init__() missing 1 required positional argument: 'device_type'").
Traceback (most recent call last):
  File "d:\Coding\Uni Marburg\ProjectNLP\.venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\sshre\AppData\Local\Temp\ipykernel_4044\2907694361.py", line 13, in objective
    tl = epochTrain(optimizer, scheduler, True)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Use

TypeError: autocast.__init__() missing 1 required positional argument: 'device_type'

# Below model performs worse than the one above

In [None]:
# from torch.amp import GradScaler, autocast

In [None]:
# scalerFunction = GradScaler()

# for epoch in range(3):
#     start_time = time.time()
#     tensorModel.train()
#     total_loss = 0.0

#     for batch in trainDL:
#         inputids = batch["input_ids"].to(userDevice)
#         masks = batch["attention_mask"].to(userDevice)
#         labels = batch["label"].to(userDevice)

#         optPara.zero_grad()

#         with autocast(device_type = "cuda"):
#             logits = tensorModel(inputids, attention_mask = masks).logits
#             loss = lossFunction(logits, labels)

#         scalerFunction.scale(loss).backward()

#         scalerFunction.unscale_(optPara)
#         torch.nn.utils.clip_grad_norm_(tensorModel.parameters(), max_norm = 1.0)

#         scalerFunction.step(optPara)
#         scalerFunction.update()

#         total_loss += loss.item()
    
#     averageLoss = total_loss / len(trainDL)
#     end_time = time.time() - start_time
#     print(f"Epoch {epoch + 1} ; train_loss: {averageLoss : .4f} ; time_taken: {end_time : .4f}")