# Sentiment analysis using Tranformers

In [9]:
import torch
from typing import Any
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def load_data(file_path):
    data = pd.read_csv(file_path, usecols=range(3))
    return data

data = load_data('amazon_reviews_sentiment_3cls.csv')

In [3]:
data.head()

Unnamed: 0,class_index,review_title,review_text
0,-1,mens ultrasheer,"This model may be ok for sedentary types, but ..."
1,1,Surprisingly delightful,This is a fast read filled with unexpected hum...
2,-1,"Works, but not as advertised",I bought one of these chargers..the instructio...
3,-1,Oh dear,I was excited to find a book ostensibly about ...
4,-1,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."


In [5]:
data.to_csv('amazon_sentiment_analysis_3cls_3cols.csv', index=False)

### Split the data

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['review_text'].tolist(),
    data['class_index'].tolist(),
    test_size=0.1,
    random_state=42
)

### Prepare the dataset and dataloaders

In [17]:
class AmazonReviewSentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index) -> Any:
        text = self.texts[index]
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids' : encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels':torch.tensor(label, dtype=torch.long)
        }
    
# Initialize tokenizer and create datasets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128

train_dataset = AmazonReviewSentimentDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = AmazonReviewSentimentDataset(val_texts, val_labels, tokenizer, max_length)

# Create dataloader
batch_size = 32
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_dataset,
    sampler=RandomSampler(val_dataset),
    batch_size=batch_size
)



### Initialize the BERT model for sequence classification

In [13]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Set up the optimizer and learning rate scheduler

In [14]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs=5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)



### Define training and evaluation functions

In [15]:
def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        model.zero_grad()
        outputs = model(
            input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(dataloader)
    return avg_loss


def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    avg_loss = total_loss / len(dataloader)
    return avg_loss, predictions, true_labels

### Train and evaluate the model

In [16]:
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_loss = train(model, train_dataloader, optimizer, scheduler, device)
    print(f'Training loss: {train_loss:.4f}')

    val_loss, predictions, true_labels = evaluate(
        model, val_dataloader, device)
    print(f'Validation loss: {val_loss:.4f}')

    print('\nClassification Report:')
    print(classification_report(true_labels, predictions))

    print('\nConfusion Matrix:')
    cm = confusion_matrix(true_labels, predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    print('\n')

# Save the fine-tuned model and tokenizer
model.save_pretrained('fine_tuned_bert_sentiment')
tokenizer.save_pretrained('fine_tuned_bert_sentiment')

Epoch 1/5
----------


Keyword arguments {'add_special_token': True, 'return_tensor': 'pt'} not recognized.


AttributeError: 'list' object has no attribute 'flatten'