# Sentiment Classification on a Multilingual Tweet Dataset

This project focuses on fine-tuning the multilingual language model [XLM-RoBERTa-base](https://huggingface.co/FacebookAI/xlm-roberta-base) on a sentiment classification task using [multilingual tweets](https://huggingface.co/datasets/cardiffnlp/tweet_sentiment_multilingual) as data. The dataset contains tweets in several languages, each labeled with a sentiment category.

# Install Libraries

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install bertviz transformers
!pip install accelerate --upgrade

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

Collecting accelerate
  Downloading accelerate-1.5.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.5.1-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.3.0
    Uninstalling accelerate-1.3.0:
      Successfully uninstalled accelerate-1.3.0
Successfully installed accelerate-1.5.1


In [None]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import numpy as np
import evaluate

# Load and Preprocess the Dataset

The Tweet Sentiment Multilingual dataset contains tweets labeled as **negative**, **neutral**, and **positive**. To keep training efficient, each tweet is truncated to a maximum of 100 words, which prevents unnecessary memory usage.

The dataset is divided into a training, validation, and test set.

In [None]:
# Fetch the "all" configuration of the Cardiff NLP multilingual tweet
# sentiment dataset from the Hugging Face Hub
tweet_dataset = load_dataset(
    path="cardiffnlp/tweet_sentiment_multilingual",
    name="all",
)

# Function to truncate text for efficiency
def truncate(example, max_words=100):
    """Cap a tweet at ``max_words`` whitespace-separated words.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace (including tabs and newlines) collapse to one space.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.
        max_words: Maximum number of words to keep. Defaults to 100, the
            limit this notebook uses for every split.

    Returns:
        A dict holding the shortened ``'text'`` and the unchanged ``'label'``.

    Raises:
        ValueError: If ``max_words`` is negative (a negative slice bound
            would silently drop words from the end instead of capping).
    """
    if max_words < 0:
        raise ValueError(f"max_words must be non-negative, got {max_words}")

    words = example['text'].split()
    return {
        'text': " ".join(words[:max_words]),
        'label': example['label'],
    }

# Reduce dataset size: for every split, shuffle with a fixed seed, keep the
# first N rows, and truncate each tweet.
# Each entry is (key in the reduced dataset, key in the full dataset, N).
subset_plan = (
    ('train', 'train', 5000),
    ('val', 'validation', 1000),
    ('test', 'test', 1000),
)
small_tweet_dataset = DatasetDict({
    target: tweet_dataset[source].shuffle(seed=24).select(range(size)).map(truncate)
    for target, source, size in subset_plan
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

tweet_sentiment_multilingual.py:   0%|          | 0.00/4.14k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/187k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/464k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14712 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2592 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6960 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# Tokenization

During tokenization, the raw text is converted into numerical token IDs that the model can process. The XLM-RoBERTa tokenizer supports multiple languages, which is important for this multilingual task.

In [None]:
# Checkpoint identifier shared by the tokenizer and the model
model_name = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with padding and truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, padding=True, truncation=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# Keep text before tokenization
def tokenize_function(examples):
    """Tokenize a batch and carry the raw text along with the encodings.

    Uses ``tokenizer``, the tokenizer loaded earlier in this notebook. The
    previous reference to ``tokenizer_xlm_roberta`` pointed at a name that is
    only created in the visualization cells, so running the notebook from top
    to bottom raised a NameError here.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding the tweets.

    Returns:
        The tokenizer output with an added ``"text"`` entry containing the
        original, untokenized strings.
    """
    tokenized_examples = tokenizer(examples["text"], padding=True, truncation=True)
    tokenized_examples["text"] = examples["text"]
    return tokenized_examples

# Tokenize every split in batches of 16 (raw text is kept), then rename the
# target column from "label" to "labels"
small_tokenized_dataset = (
    small_tweet_dataset
    .map(tokenize_function, batched=True, batch_size=16)
    .rename_column("label", "labels")
)
# Return rows in torch format from here on
small_tokenized_dataset.set_format("torch")


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# Load the Model

In [None]:
# Pre-trained multilingual encoder with a classification head sized for the
# three sentiment classes
num_sentiment_classes = 3
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_sentiment_classes,
)

# Collator that pads each batch using the notebook's tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training

In [None]:
# Accuracy metric from the `evaluate` library; consumed by compute_metrics
accuracy = evaluate.load("accuracy")

# Settings for the first fine-tuning run: 5 epochs, with evaluation and a
# checkpoint at the end of every epoch.
# NOTE(review): no `metric_for_best_model` is given, so the checkpoint restored
# by `load_best_model_at_end` is chosen by the Trainer's default criterion
# (evaluation loss according to the transformers docs — confirm).
first_run_settings = dict(
    output_dir="mydrive/tweet_trainer",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='none',
    seed=224,
)
training_args = TrainingArguments(**first_run_settings)

def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the accuracy metric.

    The predicted class of each example is the index of its largest logit
    along the last axis.
    """
    class_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(class_scores, axis=-1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Create Trainer instance: trains on the reduced training split and evaluates
# on the reduced validation split.
# `processing_class` is the current name for what used to be the `tokenizer`
# argument; the old keyword is deprecated and triggered a warning in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Train model: runs the fine-tuning loop on the Trainer built above; the
# returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0002,0.881311,0.6
2,0.8235,0.837071,0.615
3,0.7306,0.882735,0.618
4,0.6467,0.891189,0.63
5,0.5717,0.914382,0.632


TrainOutput(global_step=785, training_loss=0.7710945761127836, metrics={'train_runtime': 6632.4709, 'train_samples_per_second': 3.769, 'train_steps_per_second': 0.118, 'total_flos': 1042081246647504.0, 'train_loss': 0.7710945761127836, 'epoch': 5.0})

In [None]:
# Which checkpoint belongs to which epoch (157 optimizer steps per epoch)
'''
1	checkpoint-157
2	checkpoint-314
3	checkpoint-471
4	checkpoint-628
5	checkpoint-785
'''

'\n1\tcheckpoint-157\n2\tcheckpoint-314\n3\tcheckpoint-471\n4\tcheckpoint-628\n5\tcheckpoint-785\n'

# Evaluation

In [None]:
# Evaluate the model on the Trainer's evaluation set (the validation split).
# NOTE(review): the recorded result (loss 0.8371, accuracy 0.615) equals the
# epoch-2 row of the training log rather than the epoch-5 row, which suggests
# `load_best_model_at_end` restored the lowest-loss checkpoint — confirm.
trainer.evaluate()

{'eval_loss': 0.837071418762207,
 'eval_accuracy': 0.615,
 'eval_runtime': 59.9937,
 'eval_samples_per_second': 16.668,
 'eval_steps_per_second': 0.533,
 'epoch': 5.0}

# Continue Training

In [None]:
# Continue from the final checkpoint of the first run (epoch 5, step 785).
# The variable keeps its original name, but this is the *last* checkpoint of
# that run, not necessarily the best-scoring one.
best_checkpoint = "mydrive/tweet_trainer/checkpoint-785"
model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)

# Same hyperparameters as the first run; only the epoch budget and the output
# directory differ.
training_args = TrainingArguments(
    output_dir="mydrive/tweet_trainer2",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    num_train_epochs=10, # Increased
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='none',
    seed=224
)

# `processing_class` is the current name for the deprecated `tokenizer`
# argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_tokenized_dataset["train"],
    eval_dataset=small_tokenized_dataset["val"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Resume training from the same checkpoint the model was loaded from, so the
# path is written only once
trainer.train(resume_from_checkpoint=best_checkpoint)



  trainer = Trainer(
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss,Accuracy
6,0.5404,0.927784,0.637
7,0.4853,0.994446,0.624
8,0.4286,1.060001,0.628
9,0.3748,1.093254,0.634
10,0.3482,1.091038,0.638


TrainOutput(global_step=1570, training_loss=0.22631755361131803, metrics={'train_runtime': 7526.1206, 'train_samples_per_second': 6.644, 'train_steps_per_second': 0.209, 'total_flos': 2069251362574704.0, 'train_loss': 0.22631755361131803, 'epoch': 10.0})

In [None]:
# Which checkpoint belongs to which epoch (157 optimizer steps per epoch)
'''
6	checkpoint-942
7	checkpoint-1099
8	checkpoint-1256
9	checkpoint-1413
10	checkpoint-1570
'''

'\n6\tcheckpoint-942\n7\tcheckpoint-1099\n8\tcheckpoint-1256\n9\tcheckpoint-1413\n10\tcheckpoint-1570\n'

# Evaluation

In [None]:
# Evaluate the model on the Trainer's evaluation set (the validation split).
# NOTE(review): the recorded loss and accuracy (0.8371 / 0.615) are identical
# to the evaluation after the first run, not to the epoch-10 row of the log.
# This suggests `load_best_model_at_end` restored an earlier checkpoint rather
# than the epoch-10 weights — confirm before interpreting this number.
trainer.evaluate()

{'eval_loss': 0.837071418762207,
 'eval_accuracy': 0.615,
 'eval_runtime': 45.7675,
 'eval_samples_per_second': 21.85,
 'eval_steps_per_second': 0.699,
 'epoch': 10.0}

In [None]:
# Quick sanity check on a hand-written sentence (edit `test_text` to try others)
test_text = "I drive a blue car."

# Inference only: switch off dropout and gradient tracking, and put the
# inputs on the same device as the model.
model.eval()
model_inputs = tokenizer(test_text, return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**model_inputs).logits

# Single input, so the class-axis argmax holds exactly one element; convert it
# to a plain int before indexing the label names.
prediction = torch.argmax(logits, dim=-1).item()
print(["Negative", "Neutral", "Positive"][prediction])

Neutral


# Evaluate the Model on the test set

In [None]:
# Load the checkpoint chosen for the TEST evaluation (epoch 9 of the second run).
# NOTE(review): in the training log the validation accuracy at epoch 10 (0.638)
# is higher than at epoch 9 (0.634) — confirm this checkpoint choice is intended.
best_checkpoint = "mydrive/tweet_trainer2/checkpoint-1413"
model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)

# This Trainer is only used for evaluation in the next cell; the arguments
# mirror the second training run.
training_args = TrainingArguments(
    output_dir="mydrive/tweet_trainer2",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to='none',
    seed=224
)

# The test split is the evaluation set here. `processing_class` is the current
# name for the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_tokenized_dataset["train"],
    eval_dataset=small_tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Evaluate the trained model on the test set (the test split is passed
# explicitly as `eval_dataset`) and print the resulting metrics dict
test_results = trainer.evaluate(eval_dataset=small_tokenized_dataset["test"])
print(test_results)

{'eval_loss': 1.0152422189712524, 'eval_model_preparation_time': 0.0039, 'eval_accuracy': 0.645, 'eval_runtime': 57.3381, 'eval_samples_per_second': 17.44, 'eval_steps_per_second': 0.558}


In [None]:
# Save the final model: weights and tokenizer files go into the same folder so
# both can later be reloaded from a single path
final_model_dir = "drive/MyDrive/tweet_sentiment_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('drive/MyDrive/tweet_sentiment_model/tokenizer_config.json',
 'drive/MyDrive/tweet_sentiment_model/special_tokens_map.json',
 'drive/MyDrive/tweet_sentiment_model/sentencepiece.bpe.model',
 'drive/MyDrive/tweet_sentiment_model/added_tokens.json',
 'drive/MyDrive/tweet_sentiment_model/tokenizer.json')

In [None]:
# Hand-written probe sentences for a qualitative check of the model
test_sentences = [
    "I love this product!",
    "The service was terrible.",
    "It was okay, nothing special.",
    "I use the car.",
]

# Display names indexed by predicted class id
sentiment_names = ['Negative', 'Neutral', 'Positive']

# Classify each sentence on its own and print the text with its prediction
for sentence in test_sentences:
    encoded = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        class_scores = model(**encoded).logits
    predicted_index = torch.argmax(class_scores).item()

    print(f"Text: {sentence}")
    print(f"Predicted Sentiment: {sentiment_names[predicted_index]}")

Text: I love this product!
Predicted Sentiment: Positive
Text: The service was terrible.
Predicted Sentiment: Negative
Text: It was okay, nothing special.
Predicted Sentiment: Positive
Text: I use the car.
Predicted Sentiment: Neutral


# Visualization for Epoch 10

In [None]:
import os
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

# Load Model and Tokenizer
model_path = "/content/drive/MyDrive/tweet_sentiment_model"
checkpoint_path = "/content/drive/MyDrive/tweet_trainer2/checkpoint-1570" # Load epoch 10

# The weights come from the epoch-10 checkpoint so the embeddings match this
# section's heading. `checkpoint_path` was previously defined but unused, and
# the final saved model was loaded instead. The tokenizer is still read from
# the final model folder.
# NOTE(review): the training cells write checkpoints under the relative
# directory "mydrive/tweet_trainer2" — confirm the checkpoint exists at this
# absolute Drive path.
fine_tuned_model_xlm_roberta = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer_xlm_roberta = AutoTokenizer.from_pretrained(model_path)

# Prepare test dataset: fixed-seed sample of 500 test examples
num_examples = 500
small_test_dataset = small_tokenized_dataset["test"].shuffle(seed=42).select(range(num_examples))

sentence_list = small_test_dataset["text"]
labels_list = small_test_dataset["labels"]

# Tokenize inputs (all sentences in one padded batch)
model_inputs = tokenizer_xlm_roberta(
    sentence_list,
    padding=True, truncation=True, return_tensors='pt'
)

# Run inference to get hidden states
fine_tuned_model_xlm_roberta.eval()
with torch.no_grad():
    outputs = fine_tuned_model_xlm_roberta(**model_inputs, output_hidden_states=True)

# Define path for visualization results
path = "drive/MyDrive/results_visualization"
os.makedirs(path, exist_ok=True)

all_hidden_states = outputs.hidden_states
num_layers = len(all_hidden_states)

# The metadata rows (text, label) are the same for every layer, so build them
# once. `int(...)` turns a torch scalar into a plain number first; calling
# str() on the tensor directly would write "tensor(k)" into the label column.
labels = [
    [sentence, str(int(label))]
    for sentence, label in zip(sentence_list, labels_list)
]

# Iterate over layers: one embedding log directory per hidden-state layer
for layer, layer_hidden in enumerate(all_hidden_states):
    layer_dir = os.path.join(path, f"layer_{layer}")
    os.makedirs(layer_dir, exist_ok=True)

    # First token of every sequence (CLS Token), taken for all examples at once
    embeddings_tensor = layer_hidden[:, 0]

    # Save embeddings; the context manager closes the writer even if
    # add_embedding raises
    with SummaryWriter(log_dir=layer_dir) as writer:
        writer.add_embedding(embeddings_tensor, metadata=labels, metadata_header=['text', 'label'])

# Visualization for Epoch 5

In [None]:
import os
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

# Load Model and Tokenizer
model_path = "/content/drive/MyDrive/tweet_sentiment_model"
checkpoint_path = "/content/drive/MyDrive/tweet_trainer/checkpoint-785" # Load epoch 5

# The weights come from the epoch-5 checkpoint so the embeddings match this
# section's heading. `checkpoint_path` was previously defined but unused, and
# the final saved model was loaded instead, which made this visualization show
# the same model as the epoch-10 one. The tokenizer is still read from the
# final model folder.
# NOTE(review): the training cells write checkpoints under the relative
# directory "mydrive/tweet_trainer" — confirm the checkpoint exists at this
# absolute Drive path.
fine_tuned_model_xlm_roberta = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer_xlm_roberta = AutoTokenizer.from_pretrained(model_path)

# Prepare test dataset: fixed-seed sample of 500 test examples
num_examples = 500
small_test_dataset = small_tokenized_dataset["test"].shuffle(seed=42).select(range(num_examples))

sentence_list = small_test_dataset["text"]
labels_list = small_test_dataset["labels"]

# Tokenize inputs (all sentences in one padded batch)
model_inputs = tokenizer_xlm_roberta(
    sentence_list,
    padding=True, truncation=True, return_tensors='pt'
)

# Run inference to get hidden states
fine_tuned_model_xlm_roberta.eval()
with torch.no_grad():
    outputs = fine_tuned_model_xlm_roberta(**model_inputs, output_hidden_states=True)

# Define path for visualization results
path = "drive/MyDrive/results_visualization_epoch5"
os.makedirs(path, exist_ok=True)

all_hidden_states = outputs.hidden_states
num_layers = len(all_hidden_states)

# The metadata rows (text, label) are the same for every layer, so build them
# once. `int(...)` turns a torch scalar into a plain number first; calling
# str() on the tensor directly would write "tensor(k)" into the label column.
labels = [
    [sentence, str(int(label))]
    for sentence, label in zip(sentence_list, labels_list)
]

# Iterate over layers: one embedding log directory per hidden-state layer
for layer, layer_hidden in enumerate(all_hidden_states):
    layer_dir = os.path.join(path, f"layer_{layer}")
    os.makedirs(layer_dir, exist_ok=True)

    # First token of every sequence (CLS Token), taken for all examples at once
    embeddings_tensor = layer_hidden[:, 0]

    # Save embeddings; the context manager closes the writer even if
    # add_embedding raises
    with SummaryWriter(log_dir=layer_dir) as writer:
        writer.add_embedding(embeddings_tensor, metadata=labels, metadata_header=['text', 'label'])