In [1]:
!pip install transformers torch torchvision torchaudio scikit-learn nlpaug


Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import nlpaug.augmenter.word as naw


In [4]:
# Load dataset
df = pd.read_csv("/kaggle/input/review-data/reviews_final.csv")  # Change this path
df = df[['Cleaned_Review', 'Emotion']].dropna()

# Remove very short reviews
df = df[df['Cleaned_Review'].str.len() > 5]

# Print dataset info
print(df.head(), "\nDataset Size:", len(df))


                                      Cleaned_Review       Emotion
0  just when you thought customer disservice coul...           sad
1  worst delivery any store united statesi paid m...    frustrated
2  poor customer service scampoor customer servic...  disappointed
3  walmartwalmart where do i even begin its like ...    frustrated
4  i bought cellphone from an osli bought cellpho...  disappointed 
Dataset Size: 13644


In [9]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  # Required for WordNet-based augmentations

# Now import nlpaug after downloading NLTK data
import nlpaug.augmenter.word as naw

# Augment dataset using synonym replacement
aug = naw.SynonymAug(aug_src='wordnet')
df['Augmented_Review'] = df['Cleaned_Review'].apply(lambda x: aug.augment(x))
df = df.append(df[['Augmented_Review', 'Emotion']].rename(columns={'Augmented_Review': 'Cleaned_Review'}))

# Print new dataset size
print("Dataset Size After Augmentation:", len(df))


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


LookupError: 
**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/root/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

In [10]:
# Encode emotion labels
label_encoder = LabelEncoder()
df['Emotion'] = label_encoder.fit_transform(df['Emotion'])

# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Split dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Cleaned_Review'].values, df['Emotion'].values, test_size=0.2, random_state=42)

# Create datasets
train_dataset = EmotionDataset(train_texts, train_labels)
test_dataset = EmotionDataset(test_texts, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

print("Dataset split completed!")


Dataset split completed!


In [12]:
# Set device (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load RoBERTa model for emotion classification
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_)).to(device)

# Optimizer and Learning Rate Scheduler
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 10)
criterion = nn.CrossEntropyLoss()

print("Model and optimizer loaded successfully!")


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and optimizer loaded successfully!


In [15]:
def train_model(model, train_loader, optimizer, criterion, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        for batch in train_loader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            correct += (outputs.logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)

        print(f"Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f}, Accuracy={correct/total:.4f}")


In [16]:
train_model(model, train_loader, optimizer, criterion, device, epochs=10)


Epoch 1: Loss=1.1656, Accuracy=0.5726
Epoch 2: Loss=0.8363, Accuracy=0.6996
Epoch 3: Loss=0.5898, Accuracy=0.7960
Epoch 7: Loss=0.1113, Accuracy=0.9678
Epoch 8: Loss=0.0781, Accuracy=0.9769
Epoch 9: Loss=0.0512, Accuracy=0.9859
Epoch 10: Loss=0.0364, Accuracy=0.9907


In [17]:
# Set model to evaluation mode
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Print classification report
print("Classification Report:\n", classification_report(all_labels, all_preds, target_names=label_encoder.classes_))


Classification Report:
               precision    recall  f1-score   support

       angry       0.65      0.57      0.60        23
disappointed       0.70      0.72      0.71       491
     excited       0.33      0.25      0.29         8
  frustrated       0.83      0.82      0.82      1286
       happy       0.79      0.86      0.82       144
     hopeful       0.75      0.53      0.62        17
      joyous       0.62      0.70      0.65        23
    relieved       0.77      0.77      0.77       147
         sad       0.69      0.72      0.70       348
   surprised       0.72      0.64      0.68       242

    accuracy                           0.77      2729
   macro avg       0.68      0.66      0.67      2729
weighted avg       0.77      0.77      0.77      2729



In [28]:
first_review = df.iloc[14]['Cleaned_Review']

def predict_first_review(model, tokenizer, text, label_encoder, device):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        pred_label = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]
        return label_encoder.inverse_transform([pred_label])[0]

first_prediction = predict_first_review(model, tokenizer, first_review, label_encoder, device)
print(f"Predicted Emotion for First Review: {first_prediction}")


Predicted Emotion for First Review: disappointed
