# Imports


In [1]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


# Tokenisation

In [2]:
# Laden der CSV-Dateien
train_df = pd.read_csv('train_data.csv')
val_df = pd.read_csv('val_data.csv')
test_df = pd.read_csv('test_data.csv')

In [3]:
# Convert train_df to Hugging Face Dataset format
train_df = Dataset.from_pandas(train_df)
val_df = Dataset.from_pandas(val_df)
test_df = Dataset.from_pandas(test_df)

In [4]:
# Load the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the review column
def tokenize_function(examples):
    return tokenizer(examples['review'], padding="max_length", truncation=True)

In [5]:
# Tokenize datasets
train_df = train_df.map(tokenize_function, batched=True)
val_df = val_df.map(tokenize_function, batched=True)
test_df = test_df.map(tokenize_function, batched=True)

Map: 100%|██████████| 8000/8000 [00:04<00:00, 1840.65 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1593.45 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1603.54 examples/s]


In [6]:
# Check if the tokenization worked properly
print(train_df[0])  # First entry train_df
print(val_df[0])    # First entry val_df
print(test_df[0])   # first entry test_df

{'review': "A very nice pizza.  I haven't made up my mind which is more authentic, NYPD or Ray's.  Both, have the thin NY style crust, but NYPD makes me feel like I'm waiting for an E train to Queens; whereas Ray's feels more upper east side-ish.", 'polarity': 0, 'input_ids': [0, 250, 182, 2579, 9366, 4, 1437, 38, 2220, 75, 156, 62, 127, 1508, 61, 16, 55, 12757, 6, 20283, 50, 4622, 18, 4, 1437, 1868, 6, 33, 5, 7174, 5300, 2496, 22196, 6, 53, 20283, 817, 162, 619, 101, 38, 437, 2445, 13, 41, 381, 2341, 7, 12446, 131, 9641, 4622, 18, 2653, 55, 2853, 3017, 526, 12, 1173, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [7]:
test_df

Dataset({
    features: ['review', 'polarity', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

# Prepare the RoBERTa model

In [8]:
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# We work with 3 output labels for the three emotions categories neutral, positive and negative
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)  


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define the Parameters for the Training Phase

In [9]:
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 steps
    evaluation_strategy="epoch",     # Evaluate after every epoch
    save_strategy="epoch",           # Save model after each epoch
    load_best_model_at_end=True      # Load the best model when finished
)




# Define the Trainer

In [10]:
from transformers import Trainer, TrainingArguments
from datasets import load_metric

# Define a metric for evaluation
accuracy_metric = load_metric("accuracy")

# Define the trainer
trainer = Trainer(
    model=model,                         # The model to be trained
    args=training_args,                  # Training arguments
    train_dataset=train_df,              # Training dataset
    eval_dataset=val_df,                 # Validation dataset
    tokenizer=tokenizer,                 # Tokenizer used to process inputs
    compute_metrics=lambda p: accuracy_metric.compute(predictions=p.predictions, references=p.label_ids)  # Metric calculation
)


ImportError: cannot import name 'load_metric' from 'datasets' (/Users/sandragedig/Documents/Dokumente/Data_Science/Elective A/Project/2_development_phase/venv/lib/python3.11/site-packages/datasets/__init__.py)