<a href="https://colab.research.google.com/github/sgbyteninja/sentiment_analysis_customer_reviews/blob/main/RoBERTa_training_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [24]:
!pip install datasets
import pandas as pd
from datasets import Dataset
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoConfig, TrainingArguments, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader



# Data Loading and Tokenisation

In [25]:
# Raw GitHub URLs of the train / test / validation CSV splits
train_url = 'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/refs/heads/main/train_data.csv'
test_url = 'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/refs/heads/main/test_data.csv'
val_url = 'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/refs/heads/main/val_data.csv'

# Read each split into its own pandas DataFrame (same order as the URLs above)
train_df, test_df, val_df = (
    pd.read_csv(split_url) for split_url in (train_url, test_url, val_url)
)


In [26]:
# Convert the pandas DataFrames to Hugging Face Datasets.
# The split names are rebound from DataFrame to Dataset, so the conversion is
# guarded: when this cell is executed again the names already hold Datasets,
# and Dataset.from_pandas expects a pandas DataFrame.
def _as_hf_dataset(split):
    """Return `split` as a Hugging Face Dataset.

    A pandas DataFrame is converted with Dataset.from_pandas; an object that
    is already a Dataset is returned unchanged so the cell can be re-run.
    """
    if isinstance(split, Dataset):
        return split
    return Dataset.from_pandas(split)


train_df = _as_hf_dataset(train_df)
val_df = _as_hf_dataset(val_df)
test_df = _as_hf_dataset(test_df)

# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [27]:
# Tokenize the review column
def tokenize_function(examples):
    """Tokenize the 'review' entries of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a 'review' key holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns when called with
        padding="max_length" and truncation=True.
    """
    reviews = examples['review']
    encoded = tokenizer(reviews, padding="max_length", truncation=True)
    return encoded

In [28]:
# Run the tokenizer over every split in batches, rebinding each split name
# to its tokenized version (order: train, validation, test)
train_df, val_df, test_df = (
    split.map(tokenize_function, batched=True)
    for split in (train_df, val_df, test_df)
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [29]:
# Sanity-check the tokenization by printing the first record of each split
# (order: train, validation, test)
for split in (train_df, val_df, test_df):
    print(split[0])

{'review': "A very nice pizza.  I haven't made up my mind which is more authentic, NYPD or Ray's.  Both, have the thin NY style crust, but NYPD makes me feel like I'm waiting for an E train to Queens; whereas Ray's feels more upper east side-ish.", 'label': 0, 'input_ids': [0, 250, 182, 2579, 9366, 4, 1437, 38, 2220, 75, 156, 62, 127, 1508, 61, 16, 55, 12757, 6, 20283, 50, 4622, 18, 4, 1437, 1868, 6, 33, 5, 7174, 5300, 2496, 22196, 6, 53, 20283, 817, 162, 619, 101, 38, 437, 2445, 13, 41, 381, 2341, 7, 12446, 131, 9641, 4622, 18, 2653, 55, 2853, 3017, 526, 12, 1173, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [30]:
# Display the tokenized test split's Dataset summary (feature names and row count)
test_df

Dataset({
    features: ['review', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [31]:
# Assure the compatibility with PyTorch: expose the model inputs and the label
# of every split in "torch" format
TORCH_COLUMNS = ("input_ids", "attention_mask", "label")
for split in (train_df, val_df, test_df):
    # A fresh list per call, matching the separate literals used per split
    split.set_format("torch", columns=list(TORCH_COLUMNS))

# Model and Configuration Setup


In [33]:
# Extract the number of classes and build the id -> label-name mapping.
# Do not count labels with len(set(train_df["label"])): once the split is in
# "torch" format the column yields tensors, and tensors hash by object
# identity, so the set would hold one entry per row instead of one per class.
# Dataset.unique() returns the distinct values of the column itself.
num_labels = len(train_df.unique("label"))
# Label names are the stringified class ids 0 .. num_labels - 1
id2label = {i: str(i) for i in range(num_labels)}

In [34]:
# Start from the stock roberta-base configuration, then apply the label
# settings derived from the training data
label_overrides = {"id2label": id2label, "num_labels": num_labels}
config = AutoConfig.from_pretrained("roberta-base")
config.update(label_overrides)

In [35]:
# Instantiate roberta-base for sequence classification, using the
# configuration held in `config`
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define the Parameters for Training


In [36]:
training_args = TrainingArguments(
    # Output and logging locations
    output_dir="./roberta_sentiment",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Optimisation settings
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Select the best checkpoint by validation loss (lower is better), so that
    # eval_loss is also the metric early stopping relies on
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)



# Set Up the Trainer and Train the Model


In [37]:
# Early stopping guards against overfitting: training halts once the
# validation loss stops improving, so the model does not keep learning
# patterns that fail to generalize to unseen data, and no compute is spent on
# epochs that no longer help.
#
# early_stopping_patience=3 means training stops after 3 consecutive
# evaluations (one per epoch with the arguments above) without an improvement
# in evaluation loss, keeping the model at its best generalization point.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=val_df,
    callbacks=[early_stopping],
)

# Fine-tune the model; the cell displays the value returned by train()
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4509,0.535756
2,0.4432,0.528185
3,0.6701,0.647686
4,0.2958,0.698201
5,0.107,0.854256


TrainOutput(global_step=5000, training_loss=0.5512854406185448, metrics={'train_runtime': 4113.1675, 'train_samples_per_second': 9.725, 'train_steps_per_second': 1.216, 'total_flos': 1.128021098496e+16, 'train_loss': 0.5512854406185448, 'epoch': 5.0})

# Evaluation of the Model


In [1]:
# Evaluate the fine-tuned model. `trainer` is created in the training cell, so
# that cell must have been executed in the current kernel session first;
# otherwise this line raises NameError.
# NOTE(review): called without arguments, this presumably scores the
# eval_dataset given to the Trainer (val_df). test_df is not evaluated in any
# visible cell - confirm whether the held-out test split should be used here.
trainer.evaluate()


NameError: name 'trainer' is not defined