In [3]:
import re
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Model
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import pandas as pd
import warnings
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoTokenizer
import torch
import os
from datasets import Dataset
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import EvalPrediction
from transformers import TrainingArguments, Trainer
warnings.filterwarnings("ignore")


## Load the data and perform preprocessing

In [4]:
train = pd.read_csv("/kaggle/input/youtube-comments-dataset/YoutubeCommentsDataSet.csv")

# Download necessary NLTK data
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")
# Check for missing comments labels
train['Comment'].isna().sum()
train = train.dropna(subset=['Comment'])

# Ensure that the train, test and eval datasets are equal distributed with all the 3 classes
train, test = train_test_split(train, test_size=0.2, stratify=train["Sentiment"], random_state=42)
test, eval_df = train_test_split(test, test_size=0.5, stratify=test["Sentiment"], random_state=42)
trainLabels = train['Sentiment']
testLabels = test["Sentiment"]

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove URLs if any
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize the tokens
    tokens = [
        lemmatizer.lemmatize(word.lower())
        for word in tokens
    ]
    return " ".join(tokens)

train["Comment"] = train["Comment"].apply(preprocess_text)
test["Comment"] = test["Comment"].apply(preprocess_text)
print(train.head())

# Verify the class distribution in each split
train_dist = train["Sentiment"].value_counts(normalize=True) * 100
test_dist = test["Sentiment"].value_counts(normalize=True) * 100
eval_dist = eval_df["Sentiment"].value_counts(normalize=True) * 100
print("Train - ", train_dist)
print("Test - ", test_dist)
print("Eval - " , eval_dist)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                 Comment Sentiment
3000   it ‚Äô s so adorable that he say ‚Äú baap ‚Äù for up...  positive
13554  sir i have no word to describe your teaching i...  positive
15647  the reason they said large and open space inst...   neutral
10370  for ur information this is an fact that jrntr ...   neutral
17741  you can really tell the progress awesome espec...  positive
Train -  Sentiment
positive    62.085631
neutral     25.185488
negative    12.728882
Name: proportion, dtype: float64
Test -  Sentiment
positive    62.091503
neutral     25.163399
negative    12.745098
Name: proportion, dtype: float64
Eval -  Sentiment
positive    62.112139
neutral     25.204137
negative    12.683723
Name: proportion, dtype: float64


### Create the mappings for the labels and perform the mapping

In [5]:
train_temp = train
test_temp = test
eval_df_temp = eval_df

labels = ["neutral", "positive", "negative"]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

# Define the mapping
label_mapping = {"neutral": 0, "positive": 1, "negative": 2}

print(train["Sentiment"])

train["Sentiment"] = train["Sentiment"].str.strip().str.lower()
test["Sentiment"] = test["Sentiment"].str.strip().str.lower()
eval_df["Sentiment"] = eval_df["Sentiment"].str.strip().str.lower()

# Apply the mapping to the 'Sentiment' column
train["Sentiment"] = train["Sentiment"].map(label_mapping)
test["Sentiment"] = test["Sentiment"].map(label_mapping)
eval_df["Sentiment"] = eval_df["Sentiment"].map(label_mapping)
print(train["Sentiment"])

3000     positive
13554    positive
15647     neutral
10370     neutral
17741    positive
           ...   
17900    positive
9859     positive
474      negative
10441     neutral
8065     positive
Name: Sentiment, Length: 14691, dtype: object
3000     1
13554    1
15647    0
10370    0
17741    1
        ..
17900    1
9859     1
474      2
10441    0
8065     1
Name: Sentiment, Length: 14691, dtype: int64


### Convert the data to HF Datasets and tokenize them into the required format


In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_dataset = Dataset.from_pandas(train)
hf_test = Dataset.from_pandas(test)
hf_eval = Dataset.from_pandas(eval_df)
def preprocess_data(examples):
    # Get the text data
    text = examples["Comment"]
    
    # Tokenize the text
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    
    # Add the label column directly from "Sentiment"
    encoding["labels"] = examples["Sentiment"]
    
    return encoding

# Map the function over the dataset, removing original columns so only the tokenized output remains
encoded_dataset = hf_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=hf_dataset.column_names,
)
encoded_test = hf_test.map(
    preprocess_data,
    batched=True,
    remove_columns=hf_test.column_names,
)
train_dataset = encoded_dataset
eval_dataset = hf_eval.map(
    preprocess_data,
    batched=True,
    remove_columns=hf_eval.column_names,
)

### Define the model and the arguments

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=3,
                                                           id2label=id2label,
                                                           label2id=label2id)
def adjustArguments(lr, epochs, batch_size):
    metric_name = "f1"
    args = TrainingArguments(
        f"bert-finetuned-youtube_sentiment_analysis",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model=metric_name,
        gradient_accumulation_steps=2,  # Use gradient accumulation
        lr_scheduler_type="linear",  # Use a learning rate scheduler
        #push_to_hub=True,
    )
    return args

### Define a custom function for evaluation during training using F1 score and Accuracy measures

In [8]:


def compute_metrics(p: EvalPrediction):
    # Get raw logits
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    # Apply softmax to get probabilities
    probs = np.exp(preds) / np.exp(preds).sum(-1, keepdims=True)

    # Get predicted class index (0, 1, or 2)
    y_pred = np.argmax(probs, axis=-1)
    
    # True labels
    y_true = p.label_ids

    # Compute metrics
    f1 = f1_score(y_true, y_pred, average='macro')  # Macro-average for imbalanced classes
    accuracy = accuracy_score(y_true, y_pred)

    return {"f1": f1, "accuracy": accuracy}


### Define a custom loss function with weights corresponding to the label distribution

In [9]:
train_mapped_labels = np.array(train_dataset['labels'])
print(train_mapped_labels)
class_weights = compute_class_weight('balanced', classes=np.unique(train_mapped_labels), y=train_mapped_labels)
print(class_weights)
class_weights = torch.tensor(class_weights, dtype=torch.float).to("cuda")

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Ensure labels are 1D (batch_size,) and of type long
        labels = labels.view(-1).to(torch.long)

        # Define weighted loss function
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


def startTraining(args):
    trainer = WeightedTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer

[1 1 0 ... 2 0 1]
[1.32351351 0.53689288 2.61871658]


### Define the method to push to Hugging Face

In [10]:
def saveModel(trainer, message):
    trainer.push_to_hub(message)

### Define a method to perform the predictions

In [11]:
def predict(trainer):
    
    all_preds = []
    
    # Inference loop
    for i in range(len(encoded_test)):
        batch = encoded_test[i]
        inputs = {k: torch.tensor(v).unsqueeze(0).to(trainer.model.device) for k, v in batch.items() if k in ["input_ids", "attention_mask"]}
    
        with torch.no_grad():
            outputs = trainer.model(**inputs)
            logits = outputs.logits
    
        predicted_class = torch.argmax(logits, dim=-1).item()

    
        # Collect predictions and true labels
        all_preds.append(predicted_class)
    
    # Convert lists to numpy arrays
    all_preds = np.array(all_preds)
    
    # Convert one-hot encoded vectors to class indices
    all_preds = [id2label[num] for num in all_preds]
    # Calculate F1 score (macro-average for multi-label)
    f1 = f1_score(testLabels, all_preds, average="macro")
    # Print confusion matrix
    cm = confusion_matrix(testLabels, all_preds)

    print("Classification Report\n")
    print(classification_report(testLabels, all_preds, digits=4))
    return (f1, cm)

### Run the trainer with the set arguments and perform predictions on the given dataset

In [12]:
args = adjustArguments(5e-5, 5, 16)
trainer = startTraining(args)
f1, cm = predict(trainer)
print("F1 score (macro): ")
f1
print("Confusion matrix:\n")
print(cm)
#saveModel(trainer, "Youtube Comment Sentiment Analysis with BERT Model")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Tracking run with wandb version 0.19.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250131_215626-59i3p5uj[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mbert-finetuned-youtube_sentiment_analysis[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/fakerahulk-university-of-trier/huggingface[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/fakerahulk-university-of-trier/huggingface/runs/59i3p5uj[0m


Epoch,Training Loss,Validation Loss,F1,Accuracy
0,No log,0.427181,0.824623,0.859554
2,0.280100,0.604237,0.820704,0.861187
4,0.077000,1.017388,0.821502,0.861731


Classification Report

              precision    recall  f1-score   support

    negative     0.7696    0.7564    0.7629       234
     neutral     0.6922    0.8420    0.7598       462
    positive     0.9559    0.8754    0.9139      1140

    accuracy                         0.8519      1836
   macro avg     0.8059    0.8246    0.8122      1836
weighted avg     0.8658    0.8519    0.8559      1836

F1 score (macro): 
Confusion matrix:

[[177  47  10]
 [ 37 389  36]
 [ 16 126 998]]
