<a href="https://colab.research.google.com/github/sgbyteninja/sentiment_analysis_customer_reviews/blob/main/RoBERTa_training_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
# Install necessary packages
!pip install datasets
!pip install huggingface_hub

# Import libraries
import pandas as pd
from datasets import Dataset
import torch
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from torch.utils.data import DataLoader
from google.colab import drive
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import login, Repository

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
# Login to Hugging Face
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Tokenisation

In [21]:
# URLs of the CSV files from my Github repository
train_url = (
    'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/'
    'refs/heads/main/train_data.csv'
)
test_url = (
    'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/'
    'refs/heads/main/test_data.csv'
)
val_url = (
    'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/'
    'refs/heads/main/val_data.csv'
)

# Loading the CSV files
train_df = pd.read_csv(train_url)
test_df = pd.read_csv(test_url)
val_df = pd.read_csv(val_url)



In [22]:
# Converting the train_df into Hugging Face dataset format
train_df = Dataset.from_pandas(train_df)
val_df = Dataset.from_pandas(val_df)
test_df = Dataset.from_pandas(test_df)

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


In [23]:
# Tokenize the review column
def tokenize_function(examples):
    """
    Tokenizes the 'review' column in the input examples.

    This function takes a batch of examples containing the 'review' column (text data)
    and tokenizes the text using a pre-defined tokenizer. The tokenized output is
    padded to a maximum length and truncated where necessary to ensure consistency in the input length.

    Args:
        examples (dict): A dictionary containing the input data, including a 'review' key
                         with text that needs to be tokenized.

    Returns:
        dict: A dictionary containing tokenized inputs, with keys 'input_ids' and 'attention_mask'.
    """
    return tokenizer(
        examples['review'],
        padding="max_length",
        truncation=True
    )

In [24]:
# Tokenize the datasets
train_df = train_df.map(tokenize_function, batched=True)
val_df = val_df.map(tokenize_function, batched=True)
test_df = test_df.map(tokenize_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [25]:
# Check if the tokenization worked properly
print(train_df[0])
print(val_df[0])
print(test_df[0])

{'review': "A very nice pizza.  I haven't made up my mind which is more authentic, NYPD or Ray's.  Both, have the thin NY style crust, but NYPD makes me feel like I'm waiting for an E train to Queens; whereas Ray's feels more upper east side-ish.", 'label': 0, 'input_ids': [0, 250, 182, 2579, 9366, 4, 1437, 38, 2220, 75, 156, 62, 127, 1508, 61, 16, 55, 12757, 6, 20283, 50, 4622, 18, 4, 1437, 1868, 6, 33, 5, 7174, 5300, 2496, 22196, 6, 53, 20283, 817, 162, 619, 101, 38, 437, 2445, 13, 41, 381, 2341, 7, 12446, 131, 9641, 4622, 18, 2653, 55, 2853, 3017, 526, 12, 1173, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [26]:
# Take a look at the columns of the test_df
test_df

Dataset({
    features: ['review', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [27]:
# Assuring the compatibility with PyTorch
train_df.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_df.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_df.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Model and Configuration Setup


In [28]:
# Extracting the number of names and classes
num_labels = len(set(train_df["label"]))
id2label = {i: str(i) for i in range(num_labels)}

In [29]:
# Configuration of the RoBERTa-model
config = AutoConfig.from_pretrained("roberta-base")
config.update({"id2label": id2label, "num_labels": num_labels})

In [30]:
# Loading the RoBERTa model
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", config=config
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Defining the Metrics for the Model Evaluation


In [31]:
# Defining the Metrics for Model Evaluation
# For Classification it makes sense to use the metrics accuracy, precision, recall and f1 (confusion matrix)
def compute_metrics(p):
    """
    Computes classification metrics (accuracy, precision, recall, F1-score)
    for the given predictions and labels.

    This function calculates several evaluation metrics for classification tasks,
    including precision, recall, F1-score, and accuracy, using the predicted labels
    and true labels. It uses a weighted average for precision, recall, and F1-score.

    Args:
        p (tuple): A tuple containing two elements:
            - predictions (array-like): The predicted labels (probabilities or logits).
            - labels (array-like): The true labels for the data.

    Returns:
        dict: A dictionary containing the following metrics:
            - 'accuracy' (float): Accuracy score of the model.
            - 'precision' (float): Weighted precision score.
            - 'recall' (float): Weighted recall score.
            - 'f1' (float): Weighted F1-score.
    """
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    # Compute precision, recall, and f1-score for each class
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted'
    )

    accuracy = accuracy_score(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Define the Parameters for Training


In [32]:
# Defining the training parameters

training_args = TrainingArguments(
    output_dir="./roberta_sentiment",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="none",
    metric_for_best_model="eval_loss",
    greater_is_better=False
)



# Setup the Trainer and train the Model


In [15]:
# Defining the Trainer and implementing an Early Stopp
# Early Stopping helps prevent overfitting by stopping training when the validation loss stops improving.
# This ensures that the model does not continue learning patterns that do not generalize to unseen data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=val_df,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4016,0.4887,0.788,0.82664,0.788,0.802422
2,0.3235,0.453516,0.839,0.838237,0.839,0.83776


TrainOutput(global_step=2000, training_loss=0.9610499306917191, metrics={'train_runtime': 1553.2518, 'train_samples_per_second': 10.301, 'train_steps_per_second': 1.288, 'total_flos': 4512084393984000.0, 'train_loss': 0.9610499306917191, 'epoch': 2.0})

# Safe the trained Model to Hugging-Face-Hub


In [None]:
# Create the directory again and clone the repository
!mkdir /content/roberta_sentiment

In [None]:
repo = Repository(local_dir="/content/roberta_sentiment", clone_from="sgbyteninja/sentiment_analysis_with_roBERTa")

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/sgbyteninja/sentiment_analysis_with_roBERTa into local empty directory.


In [None]:
# Uploading the model and the tokenizer to huggingface-hub
model.push_to_hub("sgbyteninja/sentiment_analysis_with_roBERTa")
tokenizer.push_to_hub("sgbyteninja/sentiment_analysis_with_roBERTa")


model.safetensors:   0%|          | 0.00/523M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sgbyteninja/sentiment_analysis_with_roBERTa/commit/313deec92e056b852578d064859250867b22dace', commit_message='Upload tokenizer', commit_description='', oid='313deec92e056b852578d064859250867b22dace', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sgbyteninja/sentiment_analysis_with_roBERTa', endpoint='https://huggingface.co', repo_type='model', repo_id='sgbyteninja/sentiment_analysis_with_roBERTa'), pr_revision=None, pr_num=None)

# Evaluation of the Model


In [None]:
# Evaluate the model
results = trainer.evaluate()

# Print evaluation results
print(results)


{'eval_loss': 0.4535159766674042, 'eval_accuracy': 0.839, 'eval_precision': 0.8382365247801574, 'eval_recall': 0.839, 'eval_f1': 0.8377597148770084, 'eval_runtime': 25.6681, 'eval_samples_per_second': 38.959, 'eval_steps_per_second': 4.87, 'epoch': 2.0}


In [None]:
# Display the results of the model evaluation
results

{'eval_loss': 0.4535159766674042,
 'eval_accuracy': 0.839,
 'eval_precision': 0.8382365247801574,
 'eval_recall': 0.839,
 'eval_f1': 0.8377597148770084,
 'eval_runtime': 25.6681,
 'eval_samples_per_second': 38.959,
 'eval_steps_per_second': 4.87,
 'epoch': 2.0}

# Testing the Model

In [None]:
# Load the fine-tuned model and tokenizer from Hugging Face
model = AutoModelForSequenceClassification.from_pretrained(
    "sgbyteninja/sentiment_analysis_with_roBERTa"
)
tokenizer = AutoTokenizer.from_pretrained(
    "sgbyteninja/sentiment_analysis_with_roBERTa"
)

# List of example texts, with mixed opinions and different emotions
texts = [
    "The movie was fantastic, I couldn't stop laughing from start to finish.",  # Positive (Positive)
    "The food was decent, but the portions were way too small for the price.",  # Neutral (Neutral)
    "I loved the concept of the new app, but it crashes every time I try to open it. What a waste of money...",  # Negative (Negative)
    "The concert was incredible, I will definitely attend again next year.",  # Positive (Positive)
    "The weather was nice today, perfect for a walk in the park.",  # Positive (Positive)
    "I can't believe I spent money on this. What a waste.",  # Negative (Negative)
    "It’s okay, I guess. Not great, but not terrible either.",  # Neutral (Neutral)

    # Irony and sarcasm examples
    "Oh, what a surprise, the service is slow again. I really wasn't expecting that! I will definitely never come here again!",  # Sarcastic (Negative)
    "Sure, I’d love to wait another hour for my coffee. Who doesn't enjoy a good delay? I am more than annoyed. Can't recommend this cafe.",  # Sarcastic (Negative)
]

# Corresponding labels (2 for negative, 1 for positive, 0 for neutral)
labels = [
    1,  # Positive (Positive)
    0,  # Neutral (Neutral)
    2,  # Negative (Negative)
    1,  # Positive (Positive)
    1,  # Positive (Positive)
    2,  # Negative (Negative)
    0,  # Neutral (Neutral)

    # Irony and sarcasm labels (negative because of frustration)
    2,  # Sarcastic (Negative)
    2,  # Sarcastic (Negative)
]

# Tokenizing the texts
inputs = tokenizer(
    texts, padding=True, truncation=True, return_tensors="pt", max_length=512
)

# Perform inference and collect predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Calculate probabilities for each class (optional, depending on your model)
probabilities = torch.nn.functional.softmax(logits, dim=-1)

# Get predicted classes (the index of the class with the highest probability)
predicted_classes = torch.argmax(probabilities, dim=-1)

# Output the predictions and compare with the true labels
for i, text in enumerate(texts):
    true_label = labels[i]
    predicted_label = predicted_classes[i].item()
    prob = probabilities[i].tolist()

    print(f"Text: {text}")
    print(f"True label: {true_label} - Predicted label: {predicted_label}")
    print(f"Predicted probabilities: {prob}")
    print("-" * 100)

Text: The movie was fantastic, I couldn't stop laughing from start to finish.
True label: 1 - Predicted label: 1
Predicted probabilities: [0.005388807039707899, 0.9932502508163452, 0.0005337840411812067, 9.148856605634137e-08, 1.3281328392622527e-07, 1.3023350220464636e-07, 9.159523983726103e-08, 4.421170629598237e-08, 8.599015188792691e-08, 1.2561886819639767e-07, 1.0805182881767905e-07, 9.12442388312229e-08, 2.0949211432252923e-07, 6.819801967594685e-08, 6.111540074016375e-08, 1.1639016861408891e-07, 1.291037534656425e-07, 8.025152453683404e-08, 8.827566944091814e-08, 1.1262840615700043e-07, 1.7356975945403974e-07, 9.982131388142079e-08, 1.3255943542844761e-07, 9.955682855888881e-08, 8.786294358742452e-08, 9.606755213553697e-08, 9.43824645105451e-08, 9.01820698118172e-08, 9.599484229738664e-08, 1.1495500018554594e-07, 9.474662476804951e-08, 8.598917133895156e-08, 1.1925099840937037e-07, 5.461906482651102e-08, 9.915791565617837e-08, 9.778422338513337e-08, 1.014201558291461e-07, 8.8419