## Data Collection and Feature Engineering steps

In [10]:
from IPython.display import clear_output
!pip install transformers[torch]
clear_output()

In [11]:
import numpy as np
import pandas as pd
import torch
import nlpaug.augmenter.word as naw
from transformers import BertTokenizer

def load_data(data_url):
    return pd.read_csv(data_url, sep='\t')

def preprocess_data(data):
    header = ["comment", "emotion", "id"]
    data.columns = header
    data = data[['comment', 'emotion']]
    data = data[data['emotion'].apply(lambda x: len(x.split(',')) == 1)]
    data['emotion'] = data['emotion'].apply(lambda x: ''.join(filter(str.isdigit, str(x)))).astype(int)
    return data

def map_emotions(data):
    emotions_dict = {
        0: "admiration", 1: "amusement", 2: "anger", 3: "annoyance", 4: "approval",
        5: "caring", 6: "confusion", 7: "curiosity", 8: "desire", 9: "disappointment",
        10: "disapproval", 11: "disgust", 12: "embarrassment", 13: "excitement", 14: "fear",
        15: "gratitude", 16: "grief", 17: "joy", 18: "love", 19: "nervousness",
        20: "optimism", 21: "pride", 22: "realization", 23: "relief", 24: "remorse",
        25: "sadness", 26: "surprise", 27: "neutral"
    }
    emotion_mapping = {
        "admiration": "positive_intent", "amusement": "positive_intent", "anger": "negative_intent", "annoyance": "negative_intent",
        "approval": "positive_intent", "caring": "positive_intent", "confusion": "inquiry", "curiosity": "inquiry",
        "desire": "positive_intent", "disappointment": "negative_intent", "disapproval": "negative_intent", "disgust": "negative_intent",
        "embarrassment": "negative_intent", "excitement": "positive_intent", "fear": "urgency", "gratitude": "positive_intent",
        "grief": "negative_intent", "joy": "positive_intent", "love": "positive_intent", "nervousness": "urgency",
        "optimism": "positive_intent", "pride": "positive_intent", "realization": "inquiry", "relief": "positive_intent",
        "remorse": "negative_intent", "sadness": "negative_intent", "surprise": "positive_intent", "neutral": "neutral"
    }
    category_to_int_mapping = {
        "neutral": 0, "negative_intent": 1, "positive_intent": 2, "inquiry": 3, "urgency": 4
    }
    def map_emotion(emotion_id):
        original_emotion = emotions_dict.get(int(emotion_id), 'other')
        category = emotion_mapping.get(original_emotion, 'other')
        return category_to_int_mapping.get(category, -1)  # Use -1 for any unmapped categories
    data['emotion'] = data['emotion'].apply(map_emotion)
    return data

def sample_data(data, fraction=1.0):
    sampled = data.groupby('emotion').sample(frac=fraction, replace=False)
    return sampled.reset_index(drop=True)

def augment_data(train_data):
    # Initialize the augmenter
    aug = naw.SynonymAug(aug_src='wordnet')

    def augment_comments(comments, num_augments):
        augmented_comments = []
        for comment in comments:
            comment_str = str(comment)  # Ensure the comment is a string
            augmented_comments.extend([str(aug.augment(comment_str)) for _ in range(num_augments)])  # Ensure augmented comments are strings
        return augmented_comments

    def augment_class(data, class_label, target_count):
        class_data = data[data['emotion'] == class_label]
        current_count = len(class_data)
        augment_count = target_count - current_count
        if augment_count > 0:
            num_augments = augment_count // current_count + 1
            augmented_comments = augment_comments(class_data['comment'].tolist(), num_augments)[:augment_count]
            augmented_emotions = [class_label] * len(augmented_comments)
            return pd.DataFrame({'comment': augmented_comments, 'emotion': augmented_emotions})
        return pd.DataFrame()

    # Calculate the class distribution
    class_counts = train_data['emotion'].value_counts()
    max_count = class_counts.max()

    # Augment minority classes
    augmented_data = []
    for class_label, count in class_counts.items():
        if count < max_count:
            augmented_data.append(augment_class(train_data, class_label, max_count))

    if augmented_data:
        augmented_data = pd.concat(augmented_data).reset_index(drop=True)
    else:
        augmented_data = pd.DataFrame(columns=['comment', 'emotion'])

    # Combine with the original data
    augmented_train_data = pd.concat([train_data, augmented_data]).reset_index(drop=True)
    
    # Ensure all comments in the final dataframe are strings
    augmented_train_data['comment'] = augmented_train_data['comment'].apply(str)
    
    return augmented_train_data

def tokenize_data(comments):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return tokenizer(comments, padding=True, truncation=True, return_tensors='pt')

def transform_data(data_url):
    data = load_data(data_url)
    data = preprocess_data(data)
    data = map_emotions(data)
    data = sample_data(data)
    data = augment_data(data)
    print(f"Data after augmentation: {data.head()}")  # Debugging line to see the first few rows after augmentation
    tokenized_comments = tokenize_data(data['comment'].to_list())
    return tokenized_comments, torch.tensor(data['emotion'].values)

# URLs for train and validation data
train_data_url = 'https://github.com/google-research/google-research/raw/master/goemotions/data/train.tsv'
validation_data_url = 'https://github.com/google-research/google-research/raw/master/goemotions/data/dev.tsv'

# Process train and validation data
train_tokenized_comments, y_train = transform_data(train_data_url)
validation_tokenized_comments, y_validation = transform_data(validation_data_url)

# Prepare data
X_train = train_tokenized_comments['input_ids']
attention_masks_train = train_tokenized_comments['attention_mask']
X_validation = validation_tokenized_comments['input_ids']
attention_masks_validation = validation_tokenized_comments['attention_mask']

print()
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("y_train unique:", np.unique(y_train))


Data after augmentation:                                              comment  emotion
0  That matress smashing into his head at the end...        0
1             That’s what the shake is for, though.         0
2                                    The gospel ruth        0
3               I can respect profiting from morons.        0
4  There is a difference between attacking someon...        0
Comments to tokenize: ['That matress smashing into his head at the end just kills me', 'That’s what the shake is for, though. ', 'The gospel ruth', 'I can respect profiting from morons.', "There is a difference between attacking someone, because they say they don't like someone else and attacking someone, because they kill people."]
Data after augmentation:                                              comment  emotion
0                  The 2019 NRL promo ad is a wrap!!        0
1  We have laws against all that. Get the police ...        0
2  And he'll still be unimaginably rich after the...     

# 1. Model Building

Here's the current research question:

**"Can we predict the sentiment of a textual comment?"**

### Initialization

In [16]:
def get_device():
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")

def print_device_info(device):
    print(f"Using device: {device}")
    if device.type == "cuda":
        print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
    elif device.type == "mps":
        torch.mps.empty_cache()
        print(f"MPS memory allocated: {torch.mps.current_allocated_memory() / 1e9:.2f} GB")
    elif device.type == "cpu":
        print("No GPU available. Using CPU.")

device = get_device()
print_device_info(device)

Using device: mps
MPS memory allocated: 0.44 GB


### Define the model

In [17]:
from transformers import BertForSequenceClassification
# Load the BERT model
# we will use the bert-base-uncased model
# this model will classify the comments into 10 emotions

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
model.to(device);  # Move the model to the GPU

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Defining the emotion dataset

In [18]:
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback


# Create a Dataset class
class EmotionDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

# Prepare datasets
train_dataset = EmotionDataset(X_train, attention_masks_train, y_train)
validation_dataset = EmotionDataset(X_validation, attention_masks_validation, y_validation)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save the model checkpoints
    num_train_epochs=3,  # Total number of training epochs
    per_device_train_batch_size=32,  # Batch size per device during training
    per_device_eval_batch_size=32,  # Batch size for evaluation
    warmup_steps=50,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,  # Log every X updates steps
    evaluation_strategy="steps",  # Evaluate the model every X steps
    save_strategy="steps",  # Save the model checkpoint every X steps
    save_steps=0,  # Steps interval for saving model checkpoint (We dont need to save steps)
    eval_steps=10,  # Steps interval for evaluation
    gradient_accumulation_steps=2,  # Number of updates steps to accumulate before performing a backward/update pass
    fp16=True,  # Use 16-bit (mixed) precision training
    learning_rate=3e-5,  # The initial learning rate for AdamW optimizer
    load_best_model_at_end=True,  # Load the best model when finished training (default metric is loss)
    lr_scheduler_type="reduce_lr_on_plateau",  # Learning rate scheduler type
    metric_for_best_model="eval_loss",  # Use loss to identify the best model
    save_total_limit=2,  # Limit the total amount of checkpoints. Deletes the older checkpoints.
    greater_is_better=False,  # Set to True if the metric to optimize is greater (e.g. Accuracy, F1). False for metrics which are lower (e.g. loss)
    # no_cuda=True if device.type != 'cuda' else False,  # Do not use CUDA even when available
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [19]:
import matplotlib.pyplot as plt

# Extract loss values from the logs
train_losses = []
eval_losses = []
for log in trainer.state.log_history:
    if 'loss' in log:
        train_losses.append(log['loss'])
    if 'eval_loss' in log:
        eval_losses.append(log['eval_loss'])

# Plot the losses
plt.plot(train_losses, label='Training Loss')
plt.plot(eval_losses, label='Validation Loss')
plt.xlabel('Logging Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

NameError: name 'trainer' is not defined

### Save Model (only from google drive)

In [None]:
# # Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import os

# project_name = "NMA-GoEmotions-Project"
# model_save_path = f'/content/drive/MyDrive/@NMA_Projects/{project_name}/{project_name}/model'

# source_path = './results/'

# # Create the destination directory if it doesn't exist
# os.makedirs(model_save_path, exist_ok=True)

# # Save only the classification layer weights
# classifier_weights = model.classifier.state_dict()
# torch.save(classifier_weights, os.path.join(model_save_path, 'classifier_weights_bert.pt'))

# print(f"Model copied to Google Drive at: {model_save_path}")

Model copied to Google Drive at: /content/drive/MyDrive/@NMA_Projects/NMA-GoEmotions-Project/NMA-GoEmotions-Project/model
