In [1]:
import json
import pandas as pd

# Load GoEmotions dataset
with open("goemotions_cleaned.json", "r") as file:
    goemotions_data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(goemotions_data)

# Ensure correct columns exist
df = df[["text", "emotions"]]  


In [6]:
from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer


# Load tokenizer
emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)

# One-hot encode emotion labels
mlb = MultiLabelBinarizer()
df["emotions"] = mlb.fit_transform(df["emotions"]).tolist()

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenize function
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokens["labels"] = [list(map(float, lbl)) for lbl in examples["emotions"]]  
    return tokens


# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset into train & test
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_data = train_test_split["train"]
test_data = train_test_split["test"]


Map: 100%|██████████| 43410/43410 [00:36<00:00, 1189.42 examples/s]


In [8]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
import torch
from torch.nn import BCEWithLogitsLoss
from transformers import AutoModelForSequenceClassification

# Load model
emotion_model = AutoModelForSequenceClassification.from_pretrained(
    emotion_model_name,
    num_labels=len(mlb.classes_),  
    problem_type="multi_label_classification",  ignore_mismatched_sizes=True
)



# Define training arguments
training_args = TrainingArguments(
    output_dir="./emotion_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
)

# Define trainer
trainer = Trainer(
    model=emotion_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Train the model
trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at j-hartmann/emotion-english-distilroberta-base and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
