In [1]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Checkpoint name used to load the tokenizer here; the same name is reused
# further down when the classifier is loaded.
model_name = 'microsoft/xtremedistil-l6-h256-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the "raw" configuration of go_emotions. The mapping step below reads
# one column per emotion name from each example.
ds = load_dataset("go_emotions", "raw")

# Emotion names; their order defines the position of each entry in the
# per-example label vector built below.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

# Gather the per-emotion columns into a single "labels" list per example,
# ordered exactly like `emotions` (presumably 0/1 flags -- confirm against
# the dataset card).
ds = ds.map(lambda x: {"labels": [x[c] for c in emotions]})

# Hold out 20% of the 'train' split for validation. The seed is fixed so the
# same partition is produced on every run; without it the split is reshuffled
# each time and validation losses are not comparable between runs.
ds = ds['train'].train_test_split(test_size=0.2, seed=42)

# Tokenize the text data
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 64 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 64,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize both splits in batches, dropping every original column except
# "labels" so only the tokenizer outputs and the label vector remain.
cols = [name for name in ds["train"].column_names if name != "labels"]
ds_enc = ds.map(tokenize_function, batched=True, remove_columns=cols)

# Return torch tensors from now on.
ds_enc.set_format("torch")


def _labels_to_float(example):
    """Return the example's label tensor cast to float under a temporary key."""
    return {"float_labels": example["labels"].to(torch.float)}


# Cast the labels to float: write them to a temporary column, drop the
# original, then rename the temporary column back to "labels".
ds_enc = ds_enc.map(_labels_to_float, remove_columns=["labels"])
ds_enc = ds_enc.rename_column("float_labels", "labels")

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Index <-> emotion-name maps stored in the model config, so the saved
# checkpoint reports emotion names instead of generic "LABEL_<i>" ids.
id2label = dict(enumerate(emotions))
label2id = {name: idx for idx, name in id2label.items()}

# Load the checkpoint with a sequence-classification head sized to the number
# of emotions and configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotions),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

# Hyperparameters for the run; checkpoints and logs go to "test_trainer".
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed
# version before upgrading.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=6,
    per_device_train_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Wire the model, the arguments and the two encoded splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=ds_enc['test'],
    train_dataset=ds_enc['train'],
)

# Run the training loop configured on `trainer` above.
trainer.train()

# Write the trained model and the tokenizer files into the same directory.
# NOTE(review): this is an absolute, machine-specific Windows path -- adjust
# it before running elsewhere.
model_path = "C:/Users/project/Desktop/sentiment analysis/Final"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)




Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/42245 [00:00<?, ? examples/s]

Map:   0%|          | 0/168980 [00:00<?, ? examples/s]

Map:   0%|          | 0/42245 [00:00<?, ? examples/s]

Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1493,0.146762
2,0.1353,0.134429
3,0.129,0.128248
4,0.1244,0.125176
5,0.1235,0.123509
6,0.1219,0.123157


('C:/Users/project/Desktop/sentiment analysis/Final\\tokenizer_config.json',
 'C:/Users/project/Desktop/sentiment analysis/Final\\special_tokens_map.json',
 'C:/Users/project/Desktop/sentiment analysis/Final\\vocab.txt',
 'C:/Users/project/Desktop/sentiment analysis/Final\\added_tokens.json',
 'C:/Users/project/Desktop/sentiment analysis/Final\\tokenizer.json')