In [3]:
!pip install transformers datasets torch scikit-learn




In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, XLNetTokenizer, XLNetForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


In [5]:

# Load the dataset
df = pd.read_csv("/kaggle/input/review-data/reviews_final.csv")

# Select relevant columns
df = df[['Cleaned_Review', 'Emotion']].dropna()

# Split dataset into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Cleaned_Review'].tolist(), df['Emotion'].tolist(), test_size=0.2, random_state=42)

# Encode emotion labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)


In [6]:
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [7]:
batch_size = 8

# Load tokenizers
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Create datasets
train_dataset_bert = ReviewDataset(train_texts, train_labels_encoded, bert_tokenizer)
test_dataset_bert = ReviewDataset(test_texts, test_labels_encoded, bert_tokenizer)
train_dataset_xlnet = ReviewDataset(train_texts, train_labels_encoded, xlnet_tokenizer)
test_dataset_xlnet = ReviewDataset(test_texts, test_labels_encoded, xlnet_tokenizer)

# Create DataLoaders
train_loader_bert = DataLoader(train_dataset_bert, batch_size=batch_size, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=batch_size, shuffle=False)
train_loader_xlnet = DataLoader(train_dataset_xlnet, batch_size=batch_size, shuffle=True)
test_loader_xlnet = DataLoader(test_dataset_xlnet, batch_size=batch_size, shuffle=False)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [8]:
def train_model(model, train_loader, optimizer, criterion, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")


In [9]:
!pip install torch transformers datasets scikit-learn




In [10]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, XLNetTokenizer, XLNetForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


In [11]:
def evaluate_model(model, test_loader, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=label_encoder.classes_)
    return accuracy, report


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BERT Model
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
bert_model.to(device)

# Define Optimizer and Loss Function
optimizer = optim.AdamW(bert_model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

# Train BERT
train_model(bert_model, train_loader_bert, optimizer, criterion, device, epochs=10)

# Evaluate BERT
bert_accuracy, bert_report = evaluate_model(bert_model, test_loader_bert, device)
print("BERT Performance Metrics:")
print(f"Accuracy: {bert_accuracy}")
print(bert_report)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 1.5661008538343968
Epoch 2/10, Loss: 1.5930829919301546
Epoch 3/10, Loss: 1.590072944967738
Epoch 4/10, Loss: 1.591137616974967
Epoch 5/10, Loss: 1.5875471279297992
Epoch 6/10, Loss: 1.5862478310808594
Epoch 7/10, Loss: 1.5850574040587568
Epoch 8/10, Loss: 1.582501393141764
Epoch 9/10, Loss: 1.580225586760175
Epoch 10/10, Loss: 1.5794379936033116
BERT Performance Metrics:
Accuracy: 0.4712348845731037
              precision    recall  f1-score   support

       angry       0.00      0.00      0.00        23
disappointed       0.00      0.00      0.00       491
     excited       0.00      0.00      0.00         8
  frustrated       0.47      1.00      0.64      1286
       happy       0.00      0.00      0.00       144
     hopeful       0.00      0.00      0.00        17
      joyous       0.00      0.00      0.00        23
    relieved       0.00      0.00      0.00       147
         sad       0.00      0.00      0.00       348
   surprised       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Load XLNet Model
xlnet_model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=len(label_encoder.classes_))
xlnet_model.to(device)

# Define Optimizer and Loss Function
optimizer_xlnet = optim.AdamW(xlnet_model.parameters(), lr=2e-5)

# Train XLNet
train_model(xlnet_model, train_loader_xlnet, optimizer_xlnet, criterion, device, epochs=10)

# Evaluate XLNet
xlnet_accuracy, xlnet_report = evaluate_model(xlnet_model, test_loader_xlnet, device)
print("XLNet Performance Metrics:")
print(f"Accuracy: {xlnet_accuracy}")
print(xlnet_report)


pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 1.195082097852623
Epoch 2/10, Loss: 0.8680507792876317
Epoch 3/10, Loss: 0.5954666034014888
Epoch 4/10, Loss: 0.38366520413003125
Epoch 5/10, Loss: 0.2641446336233927
Epoch 9/10, Loss: 0.0984135348184355
Epoch 10/10, Loss: 0.0917182969627049
XLNet Performance Metrics:
Accuracy: 0.7537559545621106
              precision    recall  f1-score   support

       angry       0.48      0.61      0.54        23
disappointed       0.67      0.68      0.67       491
     excited       0.60      0.38      0.46         8
  frustrated       0.81      0.82      0.82      1286
       happy       0.92      0.75      0.82       144
     hopeful       0.64      0.41      0.50        17
      joyous       0.94      0.65      0.77        23
    relieved       0.69      0.86      0.77       147
         sad       0.70      0.67      0.68       348
   surprised       0.71      0.66      0.68       242

    accuracy                           0.75      2729
   macro avg       0.71      0.65 

In [20]:
def predict_emotion(model, tokenizer, text, device):
    model.eval()
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_class = torch.argmax(outputs.logits, dim=1).cpu().item()

    return label_encoder.inverse_transform([predicted_class])[0]

first_review = df['Cleaned_Review'].iloc[12]
predicted_emotion_bert = predict_emotion(bert_model, bert_tokenizer, first_review, device)
predicted_emotion_xlnet = predict_emotion(xlnet_model, xlnet_tokenizer, first_review, device)

print(f"BERT Prediction for eleventh Review: {predicted_emotion_bert}")
print(f"XLNet Prediction for eleventh Review: {predicted_emotion_xlnet}")


BERT Prediction for eleventh Review: frustrated
XLNet Prediction for eleventh Review: disappointed
