In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

# Load data from CSV
file_path = 'file.csv'
data = pd.read_csv(file_path)

# Drop rows that have no comment text or no label *before* casting:
# astype(str) below would otherwise turn a missing comment into the literal
# string 'nan', and a missing label can never be mapped to a class id.
rows_before = len(data)
data = data.dropna(subset=['commentText', 'Sentiment_Class']).reset_index(drop=True)
rows_dropped = rows_before - len(data)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing commentText or Sentiment_Class")

# Ensure 'commentText' column contains strings
data['commentText'] = data['commentText'].astype(str)

# Map sentiment labels to numerical values
sentiment_dict = {
    'Positive': 0,
    'Negative': 1,
    'Not_relevant': 2,
    'Mixed_feelings': 3,
    'Neutral': 4
}
mapped_labels = data['Sentiment_Class'].map(sentiment_dict)

# Series.map yields NaN for any value that is not a key of sentiment_dict,
# which would silently turn the label column into floats. Fail loudly here,
# naming the offending values, instead of failing later inside the loss.
unmapped = mapped_labels.isna()
if unmapped.any():
    unknown_values = sorted(data.loc[unmapped, 'Sentiment_Class'].astype(str).unique())
    raise ValueError(
        f"Unrecognised Sentiment_Class values (expected one of "
        f"{list(sentiment_dict)}): {unknown_values}"
    )

# Every label is mapped at this point, so the integer cast is lossless.
data['Sentiment_Class'] = mapped_labels.astype('int64')

# Tokenization: load the tokenizer that belongs to the checkpoint, then encode
# every comment in a single call with padding and truncation enabled,
# returning PyTorch tensors.
tokenizer = RobertaTokenizer.from_pretrained(
    'siebert/sentiment-roberta-large-english'
)
comment_texts = data['commentText'].tolist()
encoded_inputs = tokenizer(
    comment_texts,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

# Prepare target variable: one class id per row, as a tensor.
label_array = data['Sentiment_Class'].values
labels = torch.tensor(label_array)

# Split the data into training, validation, and testing sets.
# First hold out 20% of the rows, then halve that hold-out into validation
# and test. Both splits use the same fixed seed.
split_seed = 10
all_input_ids = encoded_inputs['input_ids']

X_train, X_temp, y_train, y_temp = train_test_split(
    all_input_ids,
    labels,
    test_size=0.2,
    random_state=split_seed,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=split_seed,
)

# Load pre-trained RoBERTa model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sentiment_dict defines five classes, so the classification head must emit
# one logit per class. NOTE(review): the published checkpoint is understood to
# carry a binary (positive/negative) head -- confirm against the model card.
# num_labels resizes the head; ignore_mismatched_sizes lets from_pretrained
# skip the checkpoint's head weights whose shape no longer matches, so the
# head starts from a fresh initialisation and is learned during fine-tuning.
model = RobertaForSequenceClassification.from_pretrained(
    'siebert/sentiment-roberta-large-english',
    num_labels=len(sentiment_dict),
    ignore_mismatched_sizes=True,
).to(device)

# Define a smaller batch size
batch_size = 4


def _batched_accuracy(input_ids, targets, eval_batch_size=batch_size):
    """Compute the accuracy of the global `model` on a split, one mini-batch at a time.

    Running the whole split through the model in a single forward pass makes
    the allocation grow with the split size; slicing it keeps the peak bounded
    by `eval_batch_size`.

    Args:
        input_ids: 2-D tensor of token ids for the split (kept on CPU; each
            slice is moved to `device` just before the forward pass).
        targets: 1-D tensor of class ids aligned with `input_ids`.
        eval_batch_size: number of rows per forward pass.

    Returns:
        A 0-dim float tensor holding the fraction of correct predictions
        (NaN for an empty split), so callers can keep using `.item()`.
    """
    model.eval()
    n_rows = len(input_ids)
    n_correct = 0
    with torch.no_grad():
        for start in range(0, n_rows, eval_batch_size):
            chunk_ids = input_ids[start:start + eval_batch_size].to(device)
            chunk_targets = targets[start:start + eval_batch_size].to(device)
            # Build the mask from the on-device slice so that input_ids and
            # attention_mask live on the same device.
            chunk_mask = chunk_ids != tokenizer.pad_token_id
            logits = model(input_ids=chunk_ids, attention_mask=chunk_mask).logits
            n_correct += (logits.argmax(dim=1) == chunk_targets).sum().item()
    return torch.tensor(n_correct / n_rows if n_rows else float('nan'))


# NOTE(review): fine-tuning every parameter of a "large" checkpoint with AdamW
# keeps weights, gradients and optimizer state resident at once; on a 4 GiB
# GPU this may not fit at any batch size -- consider a smaller checkpoint,
# freezing the encoder, or mixed precision. TODO confirm on target hardware.

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
for epoch in range(3):
    model.train()
    for i in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        batch_X = X_train[i:i+batch_size].to(device)
        batch_y = y_train[i:i+batch_size].to(device)
        # Padding positions are masked out so they do not take part in attention.
        batch_mask = batch_X != tokenizer.pad_token_id
        outputs = model(input_ids=batch_X, attention_mask=batch_mask, labels=batch_y)
        outputs.loss.backward()
        optimizer.step()

    # Evaluate on validation set (mini-batched; the helper switches to eval mode)
    val_accuracy = _batched_accuracy(X_val, y_val)
    print(f"Epoch {epoch+1}, Validation Accuracy: {val_accuracy.item():.4f}")

# Evaluate the model on test data
accuracy = _batched_accuracy(X_test, y_test)

print(f"Test Accuracy: {accuracy.item():.4f}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.53 GiB is allocated by PyTorch, and 87.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF