In [1]:
!pip install transformers datasets pandas torch openpyxl



In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import wandb

In [3]:
# Load the dataset
df = pd.read_excel("dataset_rv.xlsx")

# Map string labels to numerical labels
label_map = {"outrage": 0, "despair": 1, "hope": 2}
df["Sentiment"] = df["Sentiment"].map(label_map)

# Rename the column to text
df.rename(columns={'Headline':'text', 'Sentiment':'labels'}, inplace=True)

In [4]:
def english_to_bangla_number_text(number):
    """Converts an English number to Bangla textual representation."""
    bangla_numbers = {
        0: "‡¶∂‡ßÇ‡¶®‡ßç‡¶Ø", 1: "‡¶è‡¶ï", 2: "‡¶¶‡ßÅ‡¶á", 3: "‡¶§‡¶ø‡¶®", 4: "‡¶ö‡¶æ‡¶∞",
        5: "‡¶™‡¶æ‡¶Å‡¶ö", 6: "‡¶õ‡¶Ø‡¶º", 7: "‡¶∏‡¶æ‡¶§", 8: "‡¶Ü‡¶ü", 9: "‡¶®‡¶Ø‡¶º",
        10: "‡¶¶‡¶∂", 11: "‡¶è‡¶ó‡¶æ‡¶∞‡ßã", 12: "‡¶¨‡¶æ‡¶∞‡ßã", 13: "‡¶§‡ßá‡¶∞‡ßã", 14: "‡¶ö‡ßå‡¶¶‡ßç‡¶¶",
        15: "‡¶™‡¶®‡ßá‡¶∞‡ßã", 16: "‡¶∑‡ßã‡¶≤‡ßã", 17: "‡¶∏‡¶§‡ßá‡¶∞‡ßã", 18: "‡¶Ü‡¶†‡¶æ‡¶∞‡ßã", 19: "‡¶â‡¶®‡¶ø‡¶∂",
        20: "‡¶¨‡¶ø‡¶∂", 21: "‡¶è‡¶ï‡ßÅ‡¶∂", 22: "‡¶¨‡¶æ‡¶á‡¶∂", 23: "‡¶§‡ßá‡¶á‡¶∂", 24: "‡¶ö‡¶¨‡ßç‡¶¨‡¶ø‡¶∂",
        25: "‡¶™‡¶Å‡¶ö‡¶ø‡¶∂", 26: "‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂", 27: "‡¶∏‡¶æ‡¶§‡¶æ‡¶∂", 28: "‡¶Ü‡¶ü‡¶æ‡¶∂", 29: "‡¶ä‡¶®‡¶§‡ßç‡¶∞‡¶ø‡¶∂",
        30: "‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 31: "‡¶è‡¶ï‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 32: "‡¶¨‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 33: "‡¶§‡ßá‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 34: "‡¶ö‡ßå‡¶§‡ßç‡¶∞‡¶ø‡¶∂",
        35: "‡¶™‡¶Å‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 36: "‡¶õ‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 37: "‡¶∏‡¶æ‡¶Å‡¶á‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 38: "‡¶Ü‡¶ü‡¶§‡ßç‡¶∞‡¶ø‡¶∂", 39: "‡¶ä‡¶®‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂",
        40: "‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 41: "‡¶è‡¶ï‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 42: "‡¶¨‡¶ø‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 43: "‡¶§‡ßá‡¶§‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 44: "‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂",
        45: "‡¶™‡¶Å‡¶á‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 46: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 47: "‡¶∏‡¶æ‡¶§‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 48: "‡¶Ü‡¶ü‡¶ö‡¶≤‡ßç‡¶≤‡¶ø‡¶∂", 49: "‡¶ä‡¶®‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶∂",
        50: "‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶∂", 51: "‡¶è‡¶ï‡¶æ‡¶®‡ßç‡¶®", 52: "‡¶¨‡¶æ‡¶π‡¶æ‡¶®‡ßç‡¶®", 53: "‡¶§‡¶ø‡¶™‡ßç‡¶™‡¶æ‡¶®‡ßç‡¶®", 54: "‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶®‡ßç‡¶®",
        55: "‡¶™‡¶û‡ßç‡¶ö‡¶æ‡¶®‡ßç‡¶®", 56: "‡¶õ‡¶æ‡¶™‡ßç‡¶™‡¶æ‡¶®‡ßç‡¶®", 57: "‡¶∏‡¶æ‡¶§‡¶æ‡¶®‡ßç‡¶®", 58: "‡¶Ü‡¶ü‡¶æ‡¶®‡ßç‡¶®", 59: "‡¶ä‡¶®‡¶∑‡¶æ‡¶ü",
        60: "‡¶∑‡¶æ‡¶ü", 61: "‡¶è‡¶ï‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 62: "‡¶¨‡¶æ‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 63: "‡¶§‡ßá‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 64: "‡¶ö‡ßå‡¶∑‡¶ü‡ßç‡¶ü‡¶ø",
        65: "‡¶™‡¶Å‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 66: "‡¶õ‡ßá‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 67: "‡¶∏‡¶æ‡¶§‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 68: "‡¶Ü‡¶ü‡¶∑‡¶ü‡ßç‡¶ü‡¶ø", 69: "‡¶ä‡¶®‡¶∏‡¶§‡ßç‡¶§‡¶∞",
        70: "‡¶∏‡¶§‡ßç‡¶§‡¶∞", 71: "‡¶è‡¶ï‡¶æ‡¶§‡ßç‡¶§‡¶∞", 72: "‡¶¨‡¶æ‡¶π‡¶æ‡¶§‡ßç‡¶§‡¶∞", 73: "‡¶§‡¶ø‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞", 74: "‡¶ö‡ßÅ‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞",
        75: "‡¶™‡¶Å‡¶ö‡¶æ‡¶§‡ßç‡¶§‡¶∞", 76: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶§‡ßç‡¶§‡¶∞", 77: "‡¶∏‡¶æ‡¶§‡¶æ‡¶§‡ßç‡¶§‡¶∞", 78: "‡¶Ü‡¶ü‡¶æ‡¶§‡ßç‡¶§‡¶∞", 79: "‡¶ä‡¶®‡¶Ü‡¶∂‡¶ø",
        80: "‡¶Ü‡¶∂‡¶ø", 81: "‡¶è‡¶ï‡¶æ‡¶∂‡¶ø", 82: "‡¶¨‡¶ø‡¶∞‡¶æ‡¶∂‡¶ø", 83: "‡¶§‡¶ø‡¶∞‡¶æ‡¶∂‡¶ø", 84: "‡¶ö‡ßÅ‡¶∞‡¶æ‡¶∂‡¶ø",
        85: "‡¶™‡¶Å‡¶ö‡¶æ‡¶∂‡¶ø", 86: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶∂‡¶ø", 87: "‡¶∏‡¶æ‡¶§‡¶æ‡¶∂‡¶ø", 88: "‡¶Ü‡¶ü‡¶æ‡¶∂‡¶ø", 89: "‡¶ä‡¶®‡¶®‡¶¨‡ßç‡¶¨‡¶á",
        90: "‡¶®‡¶¨‡ßç‡¶¨‡¶á", 91: "‡¶è‡¶ï‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 92: "‡¶¨‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 93: "‡¶§‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 94: "‡¶ö‡ßÅ‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á",
        95: "‡¶™‡¶Å‡¶ö‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 96: "‡¶õ‡¶ø‡¶Ø‡¶º‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 97: "‡¶∏‡¶æ‡¶§‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 98: "‡¶Ü‡¶ü‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á", 99: "‡¶®‡¶ø‡¶∞‡¶æ‡¶®‡¶¨‡ßç‡¶¨‡¶á",
    }

    def convert_two_digit_number(n):
        """Handles numbers from 0 to 99 based on direct mapping."""
        if n in bangla_numbers:
            return bangla_numbers[n]
        tens = (n // 10) * 10
        units = n % 10
        return f"{bangla_numbers[tens]} {bangla_numbers[units]}" if units > 0 else bangla_numbers[tens]

    if number < 100:
        return convert_two_digit_number(number)
    elif number < 1000:
        hundreds = number // 100
        remainder = number % 100
        if remainder == 0:
            return f"{bangla_numbers[hundreds]} ‡¶∂‡¶§"
        return f"{bangla_numbers[hundreds]} ‡¶∂‡¶§ {convert_two_digit_number(remainder)}"
    elif number < 100000: #Handle numbers from 1000 to 99,999
        parts = []
        if number >= 1000:
            parts.append(f"{english_to_bangla_number_text(number // 1000)} ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞")
            number %= 1000
        if number > 0:
            if number >= 100:
              parts.append(english_to_bangla_number_text(number))
            else:
              parts.append(convert_two_digit_number(number))  #handles cases less than 100
        return " ".join(parts)
    elif number < 10000000: #handles numbers from 100,000 to 9,999,999
      parts = []
      if number >= 100000:
          parts.append(f"{english_to_bangla_number_text(number // 100000)} ‡¶≤‡¶ï‡ßç‡¶∑")
          number %= 100000
      if number > 0:
          if number >= 1000:
            parts.append(english_to_bangla_number_text(number))
          elif number > 0:
            if number >= 100:
                parts.append(english_to_bangla_number_text(number))
            else:
              parts.append(convert_two_digit_number(number)) #handle numbers less than 100
      return " ".join(parts)
    else: # Handle numbers >= 10,000,000 (Crore)
        parts = []
        if number >= 10000000:
            parts.append(f"{english_to_bangla_number_text(number // 10000000)} ‡¶ï‡ßã‡¶ü‡¶ø")
            number %= 10000000
        if number > 0:
            parts.append(english_to_bangla_number_text(number))
        return " ".join(parts)

In [5]:
import re

def text_to_word_list(text):
    text = text.split()
    return text

def replace_strings(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\u2600-\u26FF"          # miscellaneous symbols
                               u"\u2700-\u27BF"          # dingbats
                               u"\u2000-\u206F"          # general punctuations
                               "]+", flags=re.UNICODE)
    english_pattern = re.compile('[a-zA-Z]+', flags=re.I)

    text = emoji_pattern.sub(r'', text)
    text = english_pattern.sub(r'', text)

    return text

def remove_punctuations(my_str):
    punctuations = '''````¬£|¬¢|√ë+-*/=EROero‡ß≥‡ß¶‡ßß‡ß®‡ß©‡ß™‡ß´‡ß¨‡ß≠‡ßÆ‡ßØ012‚Äì34567‚Ä¢89‡•§!()-[]{};:'"‚Äú\‚Äô,<>./?@#$%^&*_~‚Äò‚Äî‡••‚Äù‚Ä∞ü§£‚öΩÔ∏è‚úåÔøΩÔø∞‡ß∑Ôø∞'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct += char

    return no_punct

def convert_numbers_to_bangla(text):
    words = text.split()
    converted_words = []
    for word in words:
        if word.isdigit():  # Check if the word is an integer
            bangla_number = english_to_bangla_number_text(int(word))
            converted_words.append(bangla_number)
        else:
            converted_words.append(word)
    return ' '.join(converted_words)

def preprocessing(text):
    text = replace_strings(text)
    text = convert_numbers_to_bangla(text)  # Convert numbers to Bangla
    text = remove_punctuations(text)
    return text

In [6]:
df['text'] =df['text'].apply(lambda x: preprocessing(str(x)))
df.head()

Unnamed: 0,text,labels
0,‡¶¢‡¶æ‡¶ï‡¶æ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü ‡¶ï‡ßã‡¶ü‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶®‡ßá‡¶§‡¶æ‡¶ï‡ßá...,0
1,‡¶ï‡ßã‡¶ü‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá ‡¶¢‡¶æ‡¶ï‡¶æ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶¨‡¶ø‡¶¶‡ßç‡¶Ø‡¶æ‡¶≤‡ßü‡ßá‡¶∞ ‡¶¨‡¶ø‡¶è‡¶®‡¶™...,2
2,‡¶ï‡ßã‡¶ü‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶Ü‡¶ú‡¶ì ‡¶ú‡¶ø‡¶∞‡ßã ‡¶™‡ßü‡ßá‡¶®‡ßç‡¶ü ‡¶Ö‡¶¨‡¶∞‡ßã‡¶ß ‡¶ï‡¶∞‡ßá ...,0
3,‡¶∏‡¶∞‡ßç‡¶¨‡¶ú‡¶®‡ßÄ‡¶® ‡¶™‡ßá‡¶®‡¶∂‡¶® ‡¶∏‡¶∞‡¶ï‡¶æ‡¶∞ ‡¶Ö‡¶®‡ßú ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶® ‡¶ö‡¶æ‡¶≤‡¶ø‡ßü‡ßá ‡¶Ø‡¶æ‡¶¨‡ßá‡¶® ...,0
4,‡¶Ü‡¶ú ‡¶∂‡¶®‡¶ø‡¶¨‡¶æ‡¶∞ ‡¶∏‡¶ï‡¶æ‡¶≤ ‡¶∏‡¶æ‡ßú‡ßá ‡¶ü‡¶æ‡¶∞ ‡¶¶‡¶ø‡¶ï‡ßá ‡¶Æ‡¶π‡¶æ‡¶∏‡ßú‡¶ï‡ßá‡¶∞ ‡¶∂‡¶π‡¶∞ ‡¶¨‡¶æ‡¶á‡¶™...,0


In [7]:
# Split the data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [8]:
# Load the model and tokenizer
model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # num_labels=3 (outrage/despair/hope)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1622 [00:00<?, ? examples/s]

Map:   0%|          | 0/406 [00:00<?, ? examples/s]

In [11]:
training_args = TrainingArguments(
    output_dir="./banglabert_sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Initialize wandb
wandb.init(project="banglabert-sentiment",
             name="banglabert-sentiment-run",
             config=training_args)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtanvirrahmedd[0m ([33mtanvirrahmedd-north-south-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   from sklearn.metrics import accuracy_score, f1_score
   accuracy = accuracy_score(labels, predictions)
   f1 = f1_score(labels, predictions, average='weighted')
   return {"accuracy": accuracy, "f1_score": f1}

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train_dataset,
   eval_dataset=tokenized_test_dataset,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

  trainer = Trainer(


In [13]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,No log,0.721569,0.741379,0.722974
2,No log,0.591907,0.778325,0.767533
3,No log,0.627684,0.780788,0.778094
4,No log,0.745512,0.79064,0.785824
5,0.444200,0.791546,0.812808,0.811139
6,0.444200,0.939942,0.793103,0.796041
7,0.444200,0.969602,0.807882,0.807001
8,0.444200,0.998437,0.815271,0.813883
9,0.444200,1.059058,0.807882,0.806774
10,0.028400,1.06483,0.820197,0.81799


TrainOutput(global_step=1530, training_loss=0.15580185123907975, metrics={'train_runtime': 2507.7338, 'train_samples_per_second': 9.702, 'train_steps_per_second': 0.61, 'total_flos': 6401549453322240.0, 'train_loss': 0.15580185123907975, 'epoch': 15.0})

In [14]:
evaluation = trainer.evaluate()
print(evaluation)

{'eval_loss': 0.5919071435928345, 'eval_accuracy': 0.7783251231527094, 'eval_f1_score': 0.7675333419094921, 'eval_runtime': 12.8776, 'eval_samples_per_second': 31.528, 'eval_steps_per_second': 2.019, 'epoch': 15.0}


In [15]:
trainer.save_model("./fine_tuned_banglabert_sentiment")
tokenizer.save_pretrained("./fine_tuned_banglabert_sentiment")

('./fine_tuned_banglabert_sentiment/tokenizer_config.json',
 './fine_tuned_banglabert_sentiment/special_tokens_map.json',
 './fine_tuned_banglabert_sentiment/vocab.txt',
 './fine_tuned_banglabert_sentiment/added_tokens.json',
 './fine_tuned_banglabert_sentiment/tokenizer.json')

In [16]:
!zip -r /content/fine_tuned_banglabert_sentiment.zip /content/fine_tuned_banglabert_sentiment


updating: content/fine_tuned_banglabert_sentiment/ (stored 0%)
updating: content/fine_tuned_banglabert_sentiment/training_args.bin (deflated 51%)
updating: content/fine_tuned_banglabert_sentiment/tokenizer.json (deflated 76%)
updating: content/fine_tuned_banglabert_sentiment/config.json (deflated 54%)
updating: content/fine_tuned_banglabert_sentiment/tokenizer_config.json (deflated 74%)
updating: content/fine_tuned_banglabert_sentiment/special_tokens_map.json (deflated 42%)
updating: content/fine_tuned_banglabert_sentiment/vocab.txt (deflated 71%)
updating: content/fine_tuned_banglabert_sentiment/model.safetensors (deflated 7%)


In [17]:
# Load the fine-tuned model and tokenizer
model_path = "./fine_tuned_banglabert_sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)
model.eval()  # Set the model to evaluation mode

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [18]:
# Define a reverse label mapping
reverse_label_map = {0: "outrage", 1: "despair", 2: "hope"}


# Function to perform sentiment prediction using the fine-tuned model
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).item()
    return reverse_label_map[predictions]

In [19]:
# Get random samples from test_df
num_samples = 10
random_samples = test_df.sample(num_samples)


# Make predictions and display results
print("\n--- Inference Results ---")
for index, row in random_samples.iterrows():
    text = row["text"]
    actual_label = reverse_label_map[row["labels"]]
    predicted_label = predict_sentiment(text)
    print(f"Text: {text}")
    print(f"Actual Sentiment: {actual_label}")
    print(f"Predicted Sentiment: {predicted_label}")
    print("-" * 50)


--- Inference Results ---
Text: ‡¶ú‡¶æ‡¶Æ‡¶æ‡ßü‡¶æ‡¶§‡ßá ‡¶á‡¶∏‡¶≤‡¶æ‡¶Æ‡ßÄ‡¶∞ ‡¶®‡¶æ‡¶Æ‡¶¨‡ßç‡¶Ø‡¶æ‡¶®‡¶æ‡¶∞ ‡¶¨‡ßç‡¶Ø‡¶¨‡¶π‡¶æ‡¶∞ ‡¶ï‡¶∞‡ßá ‡¶ï‡ßá‡¶â ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¨‡ßÉ‡¶§‡ßç‡¶§‡¶™‡¶®‡¶æ ‡¶ï‡¶∞‡¶≤‡ßá ‡¶∏‡ßÅ‡¶®‡¶ø‡¶∞‡ßç‡¶¶‡¶ø‡¶∑‡ßç‡¶ü ‡¶§‡¶•‡ßç‡¶Ø ‡¶¶‡¶ø‡ßü‡ßá ‡¶∏‡¶π‡¶Ø‡ßã‡¶ó‡¶ø‡¶§‡¶æ ‡¶ï‡¶∞‡¶§‡ßá ‡¶π‡¶ø‡¶®‡ßç‡¶¶‡ßÅ ‡¶∏‡¶Æ‡ßç‡¶™‡ßç‡¶∞‡¶¶‡¶æ‡ßü‡ßá‡¶∞ ‡¶®‡ßá‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶Ö‡¶®‡ßÅ‡¶∞‡ßã‡¶ß ‡¶ï‡¶∞‡ßá‡¶õ‡ßá‡¶® ‡¶¶‡¶≤‡¶ü‡¶ø‡¶∞ ‡¶Ü‡¶Æ‡¶ø‡¶∞ ‡¶∂‡¶´‡¶ø‡¶ï‡ßÅ‡¶∞ ‡¶∞‡¶π‡¶Æ‡¶æ‡¶®
Actual Sentiment: hope
Predicted Sentiment: hope
--------------------------------------------------
Text: ‡¶ó‡¶æ‡¶õ‡ßá‡¶∞ ‡¶°‡¶æ‡¶¨ ‡¶•‡ßá‡¶ï‡ßá ‡¶≤‡ßá‡¶™‡¶§‡ßã‡¶∑‡¶ï ‡¶ó‡¶£‡¶≠‡¶¨‡¶®‡ßá ‡¶ï‡¶ø‡¶õ‡ßÅ‡¶á ‡¶Ö‡¶¨‡¶∂‡¶ø‡¶∑‡ßç‡¶ü ‡¶®‡ßá‡¶á
Actual Sentiment: outrage
Predicted Sentiment: despair
--------------------------------------------------
Text: ‡¶ö‡¶ü‡ßç‡¶ü‡¶ó‡ßç‡¶∞‡¶æ‡¶Æ‡ßá ‡¶∂‡¶ø‡¶ï‡ßç‡¶∑‡¶æ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ‡¶∞ ‡¶¨‡¶æ‡¶∏‡¶æ‡ßü ‡¶π‡¶æ‡¶Æ‡¶≤‡¶æ ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶Ø‡¶æ‡¶≤‡ßü‡ßá ‡¶Ü‡¶ó‡