In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
import pandas as pd

# Load EmoBERTa
emo_model_name = "nateraw/bert-base-uncased-emotion"
emo_tokenizer = AutoTokenizer.from_pretrained(emo_model_name)
emo_model = AutoModelForSequenceClassification.from_pretrained(emo_model_name)
emo_model.eval()


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-04-10 15:16:00.460799: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [2]:
# Emotion labels in EmoRoBERTa
emo_labels = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
              'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
              'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
              'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
              'sadness', 'surprise', 'neutral']

def predict_emotion(text):
    # Ensure the input is a string
    if not isinstance(text, str):
        text = str(text)
    inputs = emo_tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = emo_model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    return emo_labels[predicted_class_id]



In [3]:
def enrich_and_save_with_emotion(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    for file in os.listdir(input_dir):
        if file.endswith(".csv"):
            path = os.path.join(input_dir, file)
            df = pd.read_csv(path)

            # Ensure the Emotion column is of object type so that strings can be assigned.
            if 'Emotion' in df.columns:
                df['Emotion'] = df['Emotion'].astype(object)
            else:
                df['Emotion'] = ""
            
            # Standardize the utterance for easier comparisons
            df['Utterance_cleaned'] = df['Utterance'].str.lower().str.strip()

            # Fill in missing or empty Emotion values using the emotion prediction function.
            for idx, row in df.iterrows():
                if pd.isna(row['Emotion']) or row['Emotion'] == "":
                    # Skip rows with empty utterance (or handle as desired)
                    if pd.isna(row['Utterance']) or row['Utterance'].strip() == "":
                        continue
                    if row['Utterance_cleaned'] not in ['summary', 'primary_topic', 'secondary_topic']:
                        df.at[idx, 'Emotion'] = predict_emotion(row['Utterance'])

            
            # Optionally, you could drop the helper column after processing.
            df.drop(columns=['Utterance_cleaned'], inplace=True)
            
            # Save the modified CSV to the output directory.
            df.to_csv(os.path.join(output_dir, file), index=False)


In [4]:
enrich_and_save_with_emotion(
    "dataset/Train", 
    "dataset/Train_Emo"
)
enrich_and_save_with_emotion(
    "dataset/Validation", 
    "dataset/Validation_Emo"
)
enrich_and_save_with_emotion(
    "dataset/Test", 
    "dataset/Test_Emo"
)


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]