In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
from tqdm import tqdm

# Load the dataset
df = pd.read_csv('file3.csv')

# Convert Sentiment Classes to Integers
label_mapping = {'Negative': 0, 'Nuetral': 1, 'Positive': 2, 'Mixed_feelings': 3, 'Not_relevant': 4}
df['Sentiment_Class'] = df['Sentiment_Class'].map(label_mapping)

# Load pre-trained ALBERT tokenizer and model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(label_mapping))

# Tokenize and encode the dataset
encoded_data = tokenizer.batch_encode_plus(
    df['commentText'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=256,
    return_tensors='pt',
    truncation=True
)

# Extract input tensors
input_ids = encoded_data['input_ids']
attention_mask = encoded_data['attention_mask']
labels = torch.tensor(df['Sentiment_Class'].values)

# Split the dataset
train_inputs, val_inputs, train_labels, val_labels, train_masks, val_masks = train_test_split(
    input_ids, labels, attention_mask, random_state=42, test_size=0.2
)

# Create DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=8, shuffle=False)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_dataloader) * 2
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train the model with early stopping
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 2
best_val_f1 = 0.0
for epoch in range(epochs):
    # Training loop with progress bar
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
    for batch in progress_bar:
        inputs, masks, labels = batch
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Update progress bar
        progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})

    # Validation
    model.eval()
    val_accuracy = []
    val_predictions = []
    val_true_labels = []
    for batch in val_dataloader:
        inputs, masks, labels = batch
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks)
            logits = outputs.logits

        predictions = torch.argmax(logits, dim=1)
        accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        val_accuracy.append(accuracy)

        val_predictions.extend(predictions.cpu().numpy())
        val_true_labels.extend(labels.cpu().numpy())

    val_f1 = f1_score(val_true_labels, val_predictions, average='weighted')
    print(f'Epoch {epoch + 1}/{epochs}, Validation Accuracy: {sum(val_accuracy) / len(val_accuracy):.4f}, Validation F1 Score: {val_f1:.4f}')

    # Early stopping
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        patience = 0
    else:
        patience += 1
        if patience > 2:
            print("Early stopping triggered, no improvement in validation F1 score.")
            break

# Print classification report
print("Classification Report:")
print(classification_report(val_true_labels, val_predictions, target_names=label_mapping.keys()))

# Save the model using save_pretrained()
output_dir = './albert_sentiment_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
torch.save({'optimizer_state_dict': optimizer.state_dict(), 'scheduler_state_dict': scheduler.state_dict()}, 
           os.path.join(output_dir, 'training_state.pth'))

print("Model saved successfully at:", output_dir)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                                   

Epoch 1/2, Validation Accuracy: 0.3478, Validation F1 Score: 0.1794


                                                                                   

Epoch 2/2, Validation Accuracy: 0.3591, Validation F1 Score: 0.1898
Classification Report:
                precision    recall  f1-score   support

      Negative       0.00      0.00      0.00       276
       Nuetral       0.00      0.00      0.00       823
      Positive       0.00      0.00      0.00       346
Mixed_feelings       0.00      0.00      0.00        72
  Not_relevant       0.36      1.00      0.53       850

      accuracy                           0.36      2367
     macro avg       0.07      0.20      0.11      2367
  weighted avg       0.13      0.36      0.19      2367



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


NameError: name 'os' is not defined