## Data Collection and Feature Engineering steps

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer

# Download and load the train data.
# The TSV has no header row, so the column names are supplied explicitly;
# with the default header handling pandas would consume the first example
# as column names and silently drop it from the data.
# `emotion` is read as text because multi-label rows look like "3,10".
train_data_url = 'https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv'
header = ["comment", "emotion", "id"]
train_data = pd.read_csv(
    train_data_url,
    sep='\t',
    header=None,
    names=header,
    dtype={"emotion": str},
)

# Keep only instances annotated with exactly one emotion.
# .copy() detaches the result from the original frame so the column
# assignment below does not trigger a chained-assignment warning.
single_label_mask = ~train_data['emotion'].str.contains(',', regex=False)
train_data = train_data[single_label_mask].copy()

# Convert emotion column into integers (single labels are plain digit strings)
train_data['emotion'] = train_data['emotion'].astype(int)

# Determine the frequency of each emotion
emotion_counts = train_data['emotion'].value_counts()

# Select the top 2 most common emotions (most frequent first)
top_emotions = emotion_counts.head(2).index

# Filter the dataset to include only instances with the top 2 emotions
filtered_data = train_data[train_data['emotion'].isin(top_emotions)]

# Take only 10% of the data for each of the top 2 emotions.
# GroupBy.sample draws per group without going through groupby.apply, and the
# fixed random_state makes the subset reproducible across kernel restarts.
sampled_data = (
    filtered_data
    .groupby('emotion')
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

# Tokenize the comments using the BERT tokenizer (Convert comments into Tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_comments = tokenizer(sampled_data['comment'].to_list(), padding=True, truncation=True, return_tensors='pt')

# Map the original emotion ids to contiguous class indices 0..K-1.
# The classification loss expects targets in [0, num_labels); the raw ids of the
# selected emotions (e.g. 27 for "neutral") are outside that range for a
# 2-class head and must not be fed to the model directly.
label_to_index = {int(emotion_id): idx for idx, emotion_id in enumerate(top_emotions)}

# Bert will use these features and labels
X_train = tokenized_comments['input_ids']
attention_masks = tokenized_comments['attention_mask']
y_train = torch.tensor(
    sampled_data['emotion'].map(label_to_index).to_numpy(),
    dtype=torch.long,
)

# Map the top emotions to their labels
emotions_dict = {
    0: "admiration", 1: "amusement", 2: "anger", 3: "annoyance", 4: "approval",
    5: "caring", 6: "confusion", 7: "curiosity", 8: "desire", 9: "disappointment",
    10: "disapproval", 11: "disgust", 12: "embarrassment", 13: "excitement", 14: "fear",
    15: "gratitude", 16: "grief", 17: "joy", 18: "love", 19: "nervousness",
    20: "optimism", 21: "pride", 22: "realization", 23: "relief", 24: "remorse",
    25: "sadness", 26: "surprise", 27: "neutral"
}
top_emotions_dict = {k: emotions_dict[k] for k in top_emotions}

print()
print("Top 2 most common emotions:", [top_emotions_dict[e] for e in top_emotions])
print("Sampled data shape:", sampled_data.shape)


  sampled_data = filtered_data.groupby('emotion').apply(lambda x: x.sample(frac=0.1)).reset_index(drop=True)



Top 2 most common emotions: ['neutral', 'admiration']
Sampled data shape: (1553, 3)


# 1. Model Building - ML Classifiers

We try classic ML Classifiers first.

Here's the current research question:

**"Can we predict the sentiment of a textual comment?"**

### Initialization

In [None]:
# I won't use  -->  from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir='./results',          # Output directory
#     num_train_epochs=3,              # Number of training epochs
#     per_device_train_batch_size=8,   # Batch size for training
#     per_device_eval_batch_size=8,    # Batch size for evaluation
#     warmup_steps=500,                # Number of warmup steps
#     weight_decay=0.01,               # Weight decay
#     logging_dir='./logs',            # Directory for storing logs
#     logging_steps=10,
# )

# trainer = Trainer(
#     model=model,                         # The instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # Training arguments, defined above
#     train_dataset=train_dataset,         # Training dataset
#     eval_dataset=eval_dataset            # Evaluation dataset
# )

In [None]:
# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
# MPS is probed with is_available() rather than is_built(): is_built() only
# says this PyTorch binary was compiled with MPS support, not that the
# current machine can actually run on it.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: mps


### Define the model

In [None]:
from transformers import BertForSequenceClassification
# Load the pretrained bert-base-uncased encoder with a newly initialised
# sequence-classification head.
# The head gets one output per selected emotion class, so its size is derived
# from `top_emotions` instead of being hard-coded.
num_labels = len(top_emotions)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device);  # Move the model to the selected device (CUDA, MPS or CPU)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining the optimizer and training loop

In [None]:
# Training configuration consumed by the data-loader and training-loop cells.
# Directory where the model and tokenizer are saved after each epoch
__output_dir='./results'
# Number of full passes over the training loader
__num_train_epochs=1
# Batch size passed to the training DataLoader
__per_device_train_batch_size=2
# Number of warmup steps handed to the linear learning-rate scheduler
__warmup_steps=50
# Weight decay passed to the AdamW optimizer
__weight_decay=0.1
# Log directory; it is created by the training cell
__logging_dir='./logs'
# The per-step loss is printed every this many steps
__logging_steps=50

In [None]:
from torch.utils.data import DataLoader, Dataset

# Create a custom dataset class
class EmotionDataset(Dataset):
    """Map-style dataset that pairs inputs, masks and labels by position."""

    def __init__(self, inputs, masks, labels):
        """Keep references to the three index-aligned containers."""
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        """Return the number of examples, measured on the labels container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict keyed by the names the training loop reads."""
        sample = dict(
            input_ids=self.inputs[idx],
            attention_mask=self.masks[idx],
            labels=self.labels[idx],
        )
        return sample

# Wrap the tokenized tensors in the dataset, then batch and shuffle them for training.
train_dataset = EmotionDataset(
    inputs=X_train,
    masks=attention_masks,
    labels=y_train,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=__per_device_train_batch_size,
    shuffle=True,
)

In [None]:
# Release cached accelerator memory and report what is still allocated.
# The backend is probed first so this cell also runs on CUDA or CPU-only
# machines instead of calling torch.mps.* unconditionally.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    allocated_bytes = torch.cuda.memory_allocated()
elif torch.backends.mps.is_available():
    torch.mps.empty_cache()
    allocated_bytes = torch.mps.current_allocated_memory()
else:
    allocated_bytes = None  # no accelerator backend to query

if allocated_bytes is None:
    print("No CUDA/MPS accelerator available; skipping memory report.")
else:
    print("Current allocated memory: {:.2f} GB".format(allocated_bytes / (1024 * 1024 * 1024)))

Current allocated memory: 0.41 GB


In [None]:
import os
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

# Step 1: Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=__weight_decay)
total_steps = len(train_loader) * __num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=__warmup_steps, num_training_steps=total_steps)

# Directory for saving model and logs
output_dir = __output_dir
logging_dir = __logging_dir
os.makedirs(output_dir, exist_ok=True)
os.makedirs(logging_dir, exist_ok=True)

# Sanity check: targets must be class indices in [0, num_labels).
# Out-of-range targets are not reported consistently across backends, and a run
# can proceed with meaningless (e.g. exactly 0.0) losses, so fail early with a
# clear message instead.
if y_train.numel() > 0:
    label_min, label_max = int(y_train.min()), int(y_train.max())
    if label_min < 0 or label_max >= model.config.num_labels:
        raise ValueError(
            f"Labels must be in [0, {model.config.num_labels - 1}] but range from "
            f"{label_min} to {label_max}; remap the emotion ids to contiguous class indices."
        )

# Step 2: Training loop
num_train_epochs = __num_train_epochs
logging_steps = __logging_steps

model.train()
for epoch in range(num_train_epochs):
    epoch_loss = 0

    for step, batch in enumerate(train_loader):
        # Move batch data to the selected device
        inputs = batch['input_ids'].to(device)
        masks = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing labels makes the model return the loss as well
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss

        # Backward pass, parameter update, learning-rate update, then clear
        # the gradients so they do not accumulate into the next step
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        step_loss = loss.item()
        epoch_loss += step_loss

        if step % logging_steps == 0:
            print(f"Epoch {epoch + 1}, Step {step}, Loss: {step_loss}")

    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch + 1} completed. Average Loss: {epoch_loss}")

    # Save the model and tokenizer together after each epoch
    checkpoint_dir = os.path.join(output_dir, f"model_epoch_{epoch + 1}")
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

Epoch 1, Step 0, Loss: 0.0
Epoch 1, Step 50, Loss: 0.0
Epoch 1, Step 100, Loss: 0.035982392728328705
Epoch 1, Step 150, Loss: 0.0
Epoch 1, Step 200, Loss: 0.0
Epoch 1, Step 250, Loss: 0.0
Epoch 1, Step 300, Loss: 0.0
Epoch 1, Step 350, Loss: 0.004793396219611168
Epoch 1, Step 400, Loss: 0.0
Epoch 1, Step 450, Loss: 0.0035409717820584774
Epoch 1, Step 500, Loss: 0.0
Epoch 1, Step 550, Loss: 0.0
Epoch 1, Step 600, Loss: 0.0014696555444970727
Epoch 1, Step 650, Loss: 0.0
Epoch 1, Step 700, Loss: 0.0016236735973507166
Epoch 1, Step 750, Loss: 0.0
Epoch 1 completed. Average Loss: 0.02225907958287055
