<a href="https://colab.research.google.com/github/sgbyteninja/Airbnb_SQL_Database/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
import pandas as pd
from datasets import Dataset
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoConfig, TrainingArguments, Trainer, EarlyStoppingCallback
from torch.utils.data import DataLoader

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [1]:
# Tokenisation

# Raw-file locations of the three CSV splits in the GitHub repository
train_url = 'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/refs/heads/main/train_data.csv'
test_url = 'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/refs/heads/main/test_data.csv'
val_url = 'https://raw.githubusercontent.com/sgbyteninja/sentiment_analysis_customer_reviews/refs/heads/main/val_data.csv'

# Download each split into a pandas DataFrame (train, then test, then validation)
train_df, test_df, val_df = (
    pd.read_csv(csv_url) for csv_url in (train_url, test_url, val_url)
)

# Wrap every DataFrame in a Hugging Face Dataset; the variable names are reused,
# so from here on train_df / val_df / test_df are Dataset objects, not DataFrames
train_df, val_df, test_df = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Tokenizer matching the 'roberta-base' checkpoint
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the review column
def tokenize_function(examples):
    """Tokenize the 'review' field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping passed in by ``Dataset.map``; its ``'review'`` entry
            is handed to the tokenizer unchanged.

    Returns:
        The tokenizer's output, with every sequence padded to the maximum
        length and longer sequences truncated.
    """
    reviews = examples['review']
    encoded = tokenizer(reviews, padding="max_length", truncation=True)
    return encoded
# Run the tokenizer over every split, one batch at a time
train_df, val_df, test_df = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_df, val_df, test_df)
)
# Sanity check: print the first tokenized record of the train, validation and test splits
for tokenized_split in (train_df, val_df, test_df):
    print(tokenized_split[0])
test_df
# Expose only the model inputs and the label column, formatted as PyTorch tensors
model_columns = ("input_ids", "attention_mask", "label")
for tokenized_split in (train_df, val_df, test_df):
    tokenized_split.set_format("torch", columns=list(model_columns))

# Model and Configuration Setup
# Count the distinct classes in the training split.
# Dataset.unique() returns plain Python values, so the count is correct even
# though the dataset is formatted as torch tensors. Building a set() from the
# formatted column would count every row as its own class, because tensors hash
# by object identity rather than by value.
num_labels = len(train_df.unique("label"))

# Human-readable names for the class ids, plus the inverse mapping.
# Assumes labels are integer-encoded as 0 .. num_labels - 1 -- TODO confirm against the CSVs.
id2label = {i: str(i) for i in range(num_labels)}
label2id = {name: i for i, name in id2label.items()}

# Configuration of the RoBERTa model: start from the pretrained config and
# override the classification-head size and both label mappings together so
# they stay consistent with each other.
config = AutoConfig.from_pretrained("roberta-base")
config.update({"id2label": id2label, "label2id": label2id, "num_labels": num_labels})

# Load the pretrained RoBERTa weights with a classification head sized by the config
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=config)
# Define the Parameters for Training
# Recent transformers releases call the evaluation-schedule argument
# `eval_strategy`; older releases only accept `evaluation_strategy`.
# Use whichever name the installed TrainingArguments dataclass declares, so the
# cell does not fail with an unexpected-keyword TypeError on either version.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./roberta_sentiment",      # checkpoints are written here
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",                 # same schedule as evaluation (both "epoch")
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="none",
    metric_for_best_model="eval_loss",     # make sure that eval_loss is used for early stopping
    greater_is_better=False,               # lower eval_loss is better
    **{eval_strategy_arg: "epoch"},        # evaluate once per epoch
)

# Set up the Trainer and train the model
#
# Early stopping guards against overfitting: training is halted once the monitored
# validation metric (eval_loss, as configured in the training arguments) stops improving,
# so the model does not keep fitting patterns that fail to generalize to unseen data.
#
# early_stopping_patience=3 means training ends after 3 consecutive evaluations
# (one per epoch here) without an improvement, which also avoids spending compute
# on epochs that no longer help.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=val_df,
    callbacks=[early_stopping],
)

# Fine-tune on the training split, evaluating on the validation split
trainer.train()
# Evaluation of the model on the validation split (the Trainer's eval_dataset)
trainer.evaluate()


ModuleNotFoundError: No module named 'datasets'