In [8]:
import gc

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertTokenizerFast, AlbertForSequenceClassification

# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('Using device:', device)

# Load data from CSV
file_path = 'file.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Drop rows with a missing comment or a missing label. Without this, astype(str)
# below would turn a missing comment into the literal text "nan" and a missing
# label would become NaN after mapping.
data = data.dropna(subset=['commentText', 'Sentiment_Class']).reset_index(drop=True)

# Ensure 'commentText' column contains strings
data['commentText'] = data['commentText'].astype(str)

# Lookup table that converts sentiment labels to numerical class ids
sentiment_dict = {
    'Positive': 0,
    'Negative': 1,
    'Not_relevant': 2,
    'Mixed_feelings': 3,
    'Neutral': 4
}

# Fail fast on labels the lookup table does not know. Series.map would silently
# turn them into NaN, which later yields a float label tensor instead of the
# integer class ids the classifier expects.
unknown_labels = data.loc[
    ~data['Sentiment_Class'].isin(list(sentiment_dict)), 'Sentiment_Class'
].unique().tolist()
if unknown_labels:
    raise ValueError(
        f"Unrecognised Sentiment_Class values in {file_path!r}: {unknown_labels}"
    )

data['Sentiment_Class'] = data['Sentiment_Class'].map(sentiment_dict).astype('int64')

# Texts and their numeric class ids, as prepared in the DataFrame
X = data['commentText']
y = data['Sentiment_Class']

# Tokenize every comment in one call: pad to the longest sequence in the set,
# truncate anything longer than 128 tokens, and return PyTorch tensors
tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v2')
tokenizer_options = dict(padding=True, truncation=True, max_length=128, return_tensors='pt')
encoded_data = tokenizer(X.tolist(), **tokenizer_options)

# Pull out the model inputs and build the label tensor
input_ids, attention_masks = encoded_data['input_ids'], encoded_data['attention_mask']
labels = torch.tensor(y.tolist())

# Split the data into training and testing sets.
# All three tensors go through ONE train_test_split call so that ids, masks and
# labels are guaranteed to be cut with the same row permutation (previously two
# separate calls were kept aligned only by sharing random_state).
X_train, X_test, train_masks, test_masks, y_train, y_test = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=10
)

# Create DataLoader for training and testing sets with reduced batch size
BATCH_SIZE = 8  # Reduced batch size from 32 to 8

# Training batches are drawn in random order
train_dataset = TensorDataset(X_train, train_masks, y_train)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

# Test batches are read in their stored order
test_dataset = TensorDataset(X_test, test_masks, y_test)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

# Release GPU memory held by earlier executions of this cell before loading a
# new model. Objects from a previous run stay referenced by the notebook
# namespace, so every re-run would otherwise stack another model on the GPU;
# this is the likely cause of the CUDA out-of-memory error seen on a 4 GiB card.
for _stale_name in ('model', 'optimizer', 'batch', 'inputs', 'outputs', 'loss', 'logits', 'preds'):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Seed PyTorch so the freshly initialised classifier head and the batch
# shuffling are repeatable between runs
torch.manual_seed(10)

# Load ALBERT model with a 5-way classification head and move it to the device
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=5)
model.to(device)

# Optimizer (no learning-rate scheduler is used)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training the model
NUM_EPOCHS = 10

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all to the device
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        # set_to_none=True releases the gradient tensors instead of zero-filling them
        optimizer.zero_grad(set_to_none=True)
        outputs = model(**inputs)
        # Passing labels makes the model return the classification loss
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # .item() stores a plain float so the running total holds no tensor
        running_loss += loss.item()
    # Report progress once per epoch so divergence or stalls are visible
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - mean training loss: {mean_loss:.4f}")

# Evaluation
model.eval()
y_true = []
y_pred = []
# Gradients are not needed for inference, so disable them for the whole loop
with torch.no_grad():
    for batch_input_ids, batch_masks, batch_labels in test_dataloader:
        # Labels stay on the CPU and are not passed to the model, so no loss is
        # computed; only the logits are needed for prediction
        logits = model(
            input_ids=batch_input_ids.to(device),
            attention_mask=batch_masks.to(device),
        ).logits
        preds = torch.argmax(logits, dim=1)
        y_true.extend(batch_labels.tolist())
        y_pred.extend(preds.tolist())

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
# Show class names instead of bare ids. Listing every class id explicitly keeps
# the report aligned even when a class is absent from the test split, and
# zero_division=0 reports 0 for such classes without emitting warnings.
class_names = sorted(sentiment_dict, key=sentiment_dict.get)
class_ids = [sentiment_dict[name] for name in class_names]
print(classification_report(
    y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0
))


Using device: cuda


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.62 GiB is allocated by PyTorch, and 6.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF