In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
import torch

# Checkpoint used for both the tokenizer (here) and the classifier (later).
model_name = 'microsoft/xtremedistil-l6-h256-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the "raw" configuration of go_emotions; only its "train" split is used.
ds = load_dataset("go_emotions", "raw")

# Hold out 20% of the rows for validation. The seed is fixed so that a
# "Restart Kernel -> Run All" reproduces exactly the same partition (and thus
# comparable validation losses) instead of a fresh random split on every run.
# NOTE(review): the split is done per row. If the raw configuration contains
# the same comment text in several rows (e.g. one row per annotator), identical
# texts can land in both splits -- confirm, and split by comment id if so.
ds = ds['train'].train_test_split(test_size=0.2, seed=42)




In [2]:
# Emotion column names, in the order used for the label vector.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]


def map_labels(example):
    """Gather the per-emotion fields of one record into a single list.

    Args:
        example: mapping that has one entry for every name in ``emotions``.

    Returns:
        A dict with the single key ``"labels"`` whose value lists
        ``example[name]`` for each name in ``emotions``, in that order.

    Raises:
        KeyError: if ``example`` lacks any of the emotion names.
    """
    label_vector = []
    for name in emotions:
        label_vector.append(example[name])
    return {"labels": label_vector}

# Run map_labels over every example (non-batched, so it receives one record
# at a time) to add the "labels" column; the result replaces `ds`.
ds = ds.map(map_labels)


Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/42245 [00:00<?, ? examples/s]

In [3]:
# Tokenization helper used with a batched `map`
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch.

    The text is passed to the notebook-level ``tokenizer`` with truncation
    enabled and padding to a fixed ``max_length`` of 64, and the tokenizer's
    result is returned unchanged.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=64,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize in batches. Every pre-existing column except "labels" is passed
# as `remove_columns`, so the encoded dataset keeps only the tokenizer
# outputs plus the label vector.
cols = ds["train"].column_names
cols.remove("labels")  # raises ValueError if the label column is absent
ds_enc = ds.map(
    tokenize_function,
    remove_columns=cols,
    batched=True,
)


Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/42245 [00:00<?, ? examples/s]

In [4]:
# Return columns as torch tensors so the label vector can be cast with `.to`.
ds_enc.set_format("torch")

# Write a float copy of the labels under a temporary column name while
# dropping the original column, then give the copy the "labels" name back.
# NOTE(review): presumably needed because the multi-label loss expects float
# targets -- confirm against the model's loss implementation.
ds_enc = ds_enc.map(
    lambda row: {"float_labels": row["labels"].to(torch.float)},
    remove_columns=["labels"],
)
ds_enc = ds_enc.rename_column("float_labels", "labels")


Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/42245 [00:00<?, ? examples/s]

In [5]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cpu


In [6]:
# Build the classifier from the same checkpoint as the tokenizer, with one
# output per emotion and the multi-label problem type, then move it to the
# selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotions),
    problem_type="multi_label_classification",
).to(device)

# Training configuration: 4 epochs, batch size 128 per device, evaluation at
# the end of every epoch.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed
# version before upgrading.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=4,
    per_device_train_batch_size=128,
)

# The held-out "test" split produced by train_test_split is used as the
# evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=ds_enc["train"],
    eval_dataset=ds_enc["test"],
)

# Fine-tune the model.
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side.
# NOTE(review): absolute, machine-specific path -- consider making this
# configurable so the notebook runs on other machines.
model_path = "C:/Users/project/Desktop/sentiment analysis/Untitled Folder"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.161,0.15677
2,0.1546,0.152163
3,0.1503,0.146706
4,0.1447,0.144523


('C:/Users/project/Desktop/sentiment analysis/Untitled Folder\\tokenizer_config.json',
 'C:/Users/project/Desktop/sentiment analysis/Untitled Folder\\special_tokens_map.json',
 'C:/Users/project/Desktop/sentiment analysis/Untitled Folder\\vocab.txt',
 'C:/Users/project/Desktop/sentiment analysis/Untitled Folder\\added_tokens.json',
 'C:/Users/project/Desktop/sentiment analysis/Untitled Folder\\tokenizer.json')

In [7]:
# Report how many examples ended up in each split.
n_train = len(ds['train'])
n_val = len(ds['test'])
print(f"Number of training examples: {n_train}")
print(f"Number of validation examples: {n_val}")

Number of training examples: 168980
Number of validation examples: 42245
