## Data Collection and Feature Engineering steps

In [1]:
from IPython.display import clear_output
# Install the third-party packages imported by the next cell:
# `transformers` (with its torch extra) and `nlpaug`.
!pip install transformers[torch]
!pip install nlpaug
# Wipe the pip logs from the cell output so they do not clutter the notebook.
clear_output()

In [2]:
import numpy as np
import pandas as pd
import torch
import nlpaug.augmenter.word as naw
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from transformers import BertTokenizer
# Fetch the NLTK resources used by `remove_stopwords`: the English stop-word
# list and (presumably) the `punkt` models that back `word_tokenize`.
nltk.download('punkt')
nltk.download('stopwords')

def load_data(data_url, header=None):
    """Load a tab-separated dataset into a DataFrame.

    The GoEmotions TSV files ship without a header row (see the dataset
    README), so by default every line is read as data and the columns get
    positional names. Letting pandas infer a header would silently turn the
    first sample into column names and drop it from the dataset.

    Args:
        data_url: Path, URL or file-like object accepted by `pd.read_csv`.
        header: Forwarded to `pd.read_csv`. `None` (default) means "no header
            row"; pass `0` for files whose first line holds column names.

    Returns:
        pandas.DataFrame with one row per line of the file.
    """
    return pd.read_csv(data_url, sep='\t', header=header)

def preprocess_data(data):
    """Keep single-label rows and turn their label into an integer id.

    The input must have exactly three columns, interpreted positionally as
    comment text, comma-separated emotion ids, and comment id. The caller's
    frame is left untouched; a new frame is returned.

    Args:
        data: DataFrame with three columns (any column names).

    Returns:
        DataFrame with columns `comment` and `emotion` (int), restricted to
        rows whose label field holds exactly one id. The original row index
        is preserved.

    Raises:
        ValueError: if `data` does not have three columns, or a kept label
            contains no digits.
    """
    header = ["comment", "emotion", "id"]

    # Work on a copy so renaming the columns does not leak back to the caller.
    frame = data.copy()
    frame.columns = header
    frame = frame[['comment', 'emotion']]

    # `str(x)` keeps this working when pandas parsed a label as a number
    # instead of a string (e.g. a file or chunk with single-label rows only).
    single_label = frame['emotion'].map(lambda x: len(str(x).split(',')) == 1).astype(bool)

    # Explicit copy: assigning into a filtered view triggers
    # SettingWithCopyWarning and may not write through.
    frame = frame.loc[single_label].copy()
    frame['emotion'] = frame['emotion'].map(
        lambda x: ''.join(filter(str.isdigit, str(x)))
    ).astype(int)
    return frame

def map_emotions(data):
    """Collapse the 28 GoEmotions label ids into 5 integer intent categories.

    Mapping chain: emotion id -> emotion name -> intent category -> int code
    (neutral=0, negative_intent=1, positive_intent=2, inquiry=3, urgency=4).
    Ids outside 0..27 map to -1.

    The caller's frame is not modified; a new frame with the remapped
    `emotion` column is returned. (Remapping in place would make a second
    call re-map already-mapped codes.)

    Args:
        data: DataFrame with an `emotion` column of values convertible by
            `int()`.

    Returns:
        A new DataFrame identical to `data` except for the `emotion` column.
    """
    emotions_dict = {
        0: "admiration", 1: "amusement", 2: "anger", 3: "annoyance", 4: "approval",
        5: "caring", 6: "confusion", 7: "curiosity", 8: "desire", 9: "disappointment",
        10: "disapproval", 11: "disgust", 12: "embarrassment", 13: "excitement", 14: "fear",
        15: "gratitude", 16: "grief", 17: "joy", 18: "love", 19: "nervousness",
        20: "optimism", 21: "pride", 22: "realization", 23: "relief", 24: "remorse",
        25: "sadness", 26: "surprise", 27: "neutral"
    }
    emotion_mapping = {
        "admiration": "positive_intent", "amusement": "positive_intent", "anger": "negative_intent", "annoyance": "negative_intent",
        "approval": "positive_intent", "caring": "positive_intent", "confusion": "inquiry", "curiosity": "inquiry",
        "desire": "positive_intent", "disappointment": "negative_intent", "disapproval": "negative_intent", "disgust": "negative_intent",
        "embarrassment": "negative_intent", "excitement": "positive_intent", "fear": "urgency", "gratitude": "positive_intent",
        "grief": "negative_intent", "joy": "positive_intent", "love": "positive_intent", "nervousness": "urgency",
        "optimism": "positive_intent", "pride": "positive_intent", "realization": "inquiry", "relief": "positive_intent",
        "remorse": "negative_intent", "sadness": "negative_intent", "surprise": "positive_intent", "neutral": "neutral"
    }
    category_to_int_mapping = {
        "neutral": 0, "negative_intent": 1, "positive_intent": 2, "inquiry": 3, "urgency": 4
    }

    # Compose the three tables once so each row needs a single lookup.
    id_to_code = {
        emotion_id: category_to_int_mapping.get(emotion_mapping.get(name, 'other'), -1)
        for emotion_id, name in emotions_dict.items()
    }

    def map_emotion(emotion_id):
        # -1 marks any id that has no entry in the tables above.
        return id_to_code.get(int(emotion_id), -1)

    # `assign` returns a new frame, leaving the caller's data untouched.
    return data.assign(emotion=data['emotion'].apply(map_emotion))

def sample_data(data, fraction=1.0, random_state=None):
    """Draw the same fraction of rows from every emotion class.

    With the default `fraction=1.0` this keeps every row and only shuffles
    the rows inside each class.

    Args:
        data: DataFrame with an `emotion` column.
        fraction: Share of each class to keep (sampling without replacement).
        random_state: Optional seed forwarded to pandas so the draw can be
            reproduced across kernel restarts. `None` keeps the previous
            unseeded behaviour.

    Returns:
        The sampled rows with a fresh 0..n-1 index.
    """
    sampled = data.groupby('emotion').sample(
        frac=fraction, replace=False, random_state=random_state
    )
    return sampled.reset_index(drop=True)

def augment_data(train_data, augmenter=None):
    """Balance the classes by adding augmented copies of minority comments.

    Every class smaller than the largest one is topped up to the largest
    class size. Source comments are visited round-robin, so the extra
    samples are spread evenly over the class, and exactly as many
    augmentations are generated as are needed.

    Args:
        train_data: DataFrame with `comment` and `emotion` columns.
        augmenter: Object exposing `augment(text)` that returns a string or a
            list of strings. Defaults to nlpaug's WordNet `SynonymAug`.

    Returns:
        A new DataFrame holding the original rows followed by the augmented
        rows, with a fresh index and every `comment` coerced to `str`.
    """
    from itertools import cycle, islice

    if augmenter is None:
        augmenter = naw.SynonymAug(aug_src='wordnet')

    def augment_once(comment):
        text = ' '.join(comment) if isinstance(comment, list) else str(comment)
        augmented = augmenter.augment(text)
        if isinstance(augmented, list):
            # A list result is reduced to its first entry; an empty list
            # falls back to the source text instead of raising IndexError.
            return str(augmented[0]) if augmented else text
        return str(augmented)

    def augment_class(class_label, target_count):
        comments = train_data.loc[train_data['emotion'] == class_label, 'comment'].tolist()
        missing = target_count - len(comments)
        if missing <= 0 or not comments:
            return None
        # Cycle over the class so truncation never starves later comments.
        sources = islice(cycle(comments), missing)
        return pd.DataFrame({
            'comment': [augment_once(comment) for comment in sources],
            'emotion': [class_label] * missing,
        })

    class_counts = train_data['emotion'].value_counts()
    max_count = class_counts.max()

    # Start from the original rows; only append frames that contain data so
    # an already balanced dataset keeps its column dtypes.
    frames = [train_data]
    for class_label, count in class_counts.items():
        if count < max_count:
            extra = augment_class(class_label, max_count)
            if extra is not None:
                frames.append(extra)

    augmented_train_data = pd.concat(frames).reset_index(drop=True)

    # Ensure all comments in the final dataframe are strings
    augmented_train_data['comment'] = augmented_train_data['comment'].apply(str)

    return augmented_train_data

def remove_stopwords(data):
    """Strip English stop words from every comment.

    Each comment is tokenized with `word_tokenize`; tokens whose lowercase
    form is in NLTK's English stop-word list are dropped and the rest are
    re-joined with single spaces. The `comment` column of `data` is
    overwritten in place and the same frame is returned.
    """
    english_stop_words = set(stopwords.words('english'))

    def strip_stop_words(comment):
        kept_tokens = [
            token for token in word_tokenize(comment)
            if token.lower() not in english_stop_words
        ]
        return ' '.join(kept_tokens)

    data['comment'] = data['comment'].apply(strip_stop_words)
    return data

def tokenize_data(comments):
    """Tokenize comments with the `bert-base-uncased` WordPiece tokenizer.

    Args:
        comments: The texts to encode, passed straight to the tokenizer.

    Returns:
        The tokenizer output as PyTorch tensors, with padding and truncation
        both enabled.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    encoding_options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}
    return bert_tokenizer(comments, **encoding_options)

def transform_data(data_url, augmentation=True):
    """Run the whole pipeline from a TSV location to model-ready tensors.

    Steps: load -> preprocess -> map emotions to categories -> sample ->
    (optionally) augment -> tokenize.

    Args:
        data_url: Location of the TSV file handed to `load_data`.
        augmentation: When true, minority classes are topped up through
            `augment_data`.

    Returns:
        Tuple `(tokenized_comments, labels)`: the tokenizer output and a
        tensor built from the `emotion` column.
    """
    frame = sample_data(map_emotions(preprocess_data(load_data(data_url))))
    if augmentation:
        frame = augment_data(frame)
    # Stop-word removal is currently switched off:
    # frame = remove_stopwords(frame)
    labels = torch.tensor(frame['emotion'].values)
    tokenized_comments = tokenize_data(frame['comment'].to_list())
    return tokenized_comments, labels

# Locations of the GoEmotions train and dev splits (tab-separated files)
train_data_url = 'https://github.com/google-research/google-research/raw/master/goemotions/data/train.tsv'
validation_data_url = 'https://github.com/google-research/google-research/raw/master/goemotions/data/dev.tsv'

# Run the pipeline on both splits; augmentation is switched off for validation
train_tokenized_comments, y_train = transform_data(train_data_url)
validation_tokenized_comments, y_validation = transform_data(validation_data_url, augmentation=False)

# Pull token ids and attention masks out of the tokenizer outputs
X_train, attention_masks_train = (
    train_tokenized_comments['input_ids'],
    train_tokenized_comments['attention_mask'],
)
X_validation, attention_masks_validation = (
    validation_tokenized_comments['input_ids'],
    validation_tokenized_comments['attention_mask'],
)

# Quick sanity check of the training tensors
print()
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("y_train unique:", np.unique(y_train))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



X_train shape: torch.Size([68200, 316])
y_train shape: torch.Size([68200])
y_train unique: [0 1 2 3 4]


# 1. Model Building

Here's the current research question:

**"Can we predict the sentiment of a textual comment?"**

### Initialization

In [3]:
def get_device():
    """Return the preferred torch device: CUDA first, then MPS, else CPU."""
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(backend)

def print_device_info(device):
    """Print the selected device plus a short backend-specific status line.

    For CUDA the name of device 0 and its allocated memory are shown; for MPS
    the cache is emptied first and the allocated memory is shown; for CPU a
    notice that no GPU is in use is printed. Other device types only get the
    first line.
    """
    print(f"Using device: {device}")
    backend = device.type

    if backend == "cpu":
        print("No GPU available. Using CPU.")
        return

    if backend == "cuda":
        print(f"CUDA Device: {torch.cuda.get_device_name(0)}")
        print(f"CUDA memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
        return

    if backend == "mps":
        torch.mps.empty_cache()
        print(f"MPS memory allocated: {torch.mps.current_allocated_memory() / 1e9:.2f} GB")

# Pick the compute device once; the training cell moves each model onto it.
device = get_device()
print_device_info(device)

Using device: cuda
CUDA Device: Tesla T4
CUDA memory allocated: 0.00 GB


### Define the model

In [4]:
from transformers import BertForSequenceClassification
# BERT sequence-classification model.
# The training cell instantiates it from the `bert-large-uncased` checkpoint
# to classify comments into the 5 intent categories produced by `map_emotions`.

# Modify the BERT model to include dropout
class BertForSequenceClassificationWithDropout(BertForSequenceClassification):
    """BERT sequence classifier with dropout (p=0.2) on the pooled output.

    The pooled encoder output goes through dropout and then the linear
    `classifier` head. When `labels` are given, a cross-entropy loss is
    computed as well.
    """

    def __init__(self, config):
        super().__init__(config)
        # Sets the `dropout` attribute used in `forward` to a fixed p=0.2.
        self.dropout = torch.nn.Dropout(p=0.2)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        """Run the encoder and classification head.

        Returns:
            A `SequenceClassifierOutput` when `return_dict` resolves to true,
            otherwise a tuple `(loss?, logits, *extra_encoder_outputs)` where
            `loss` is only present if `labels` were supplied.
        """
        from transformers.modeling_outputs import SequenceClassifierOutput

        # `None` means "use the model config default", as in stock HF models.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Index 1 of the encoder output is the pooled representation.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # The original built a `PackedSequence` here, which does not accept
        # these fields and raised TypeError on the return_dict path.
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

### Defining the emotion dataset, cross-validation folds, and training loop

In [None]:
from sklearn.model_selection import KFold
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import AdamW, get_linear_schedule_with_warmup


# Create a Dataset class
class EmotionDataset(Dataset):
    """Torch dataset that serves tokenized comments together with their labels.

    Holds three parallel, index-aligned containers and returns one sample as
    a dict with the keys `input_ids`, `attention_mask` and `labels`.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids, self.attention_masks, self.labels = input_ids, attention_masks, labels

    def __len__(self):
        # Dataset size is defined by the number of token-id rows.
        return len(self.input_ids)

    def __getitem__(self, idx):
        columns = (
            ('input_ids', self.input_ids),
            ('attention_mask', self.attention_masks),
            ('labels', self.labels),
        )
        return {key: values[idx] for key, values in columns}

# Cross-validation setup
# A fixed seed keeps the fold assignment identical across kernel restarts.
kf = KFold(n_splits=2, shuffle=True, random_state=42)

# Prepare datasets for cross-validation
# NOTE(review): X_train already contains the synonym-augmented copies made by
# `augment_data`, so an augmented comment and its source can end up in
# different folds. Fold-level F1 is therefore likely optimistic; prefer the
# untouched validation split (X_validation / y_validation) for a clean score.
datasets = []
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    attention_masks_train_fold, attention_masks_val_fold = attention_masks_train[train_index], attention_masks_train[val_index]

    train_dataset_fold = EmotionDataset(X_train_fold, attention_masks_train_fold, y_train_fold)
    val_dataset_fold = EmotionDataset(X_val_fold, attention_masks_val_fold, y_val_fold)

    datasets.append((train_dataset_fold, val_dataset_fold))


from sklearn.metrics import f1_score

def compute_f1(eval_pred):
    """Metric callback: macro-averaged F1 of the argmax predictions.

    Args:
        eval_pred: Pair `(logits, labels)`.

    Returns:
        Dict with the single key `f1`.
    """
    logits, labels = eval_pred
    macro_f1 = f1_score(labels, np.argmax(logits, axis=-1), average='macro')
    return {'f1': macro_f1}



# Define the training arguments shared by every cross-validation fold.
# NOTE(review): the training loop passes its own (optimizer, scheduler) pair
# to Trainer, so Trainer does not build them from these arguments. That means
# `lr_scheduler_type` and `weight_decay` below most likely have no effect on
# the actual run; confirm against the Trainer `optimizers` documentation.
training_args = TrainingArguments(
    output_dir='./results',  # Directory to save the model checkpoints
    num_train_epochs=30,  # Upper bound on epochs; the early-stopping callback may end training sooner
    per_device_train_batch_size=8,  # Training batch size per device
    per_device_eval_batch_size=16,  # Batch size for evaluation
    warmup_steps=100,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=20,  # Log the training loss every 20 update steps
    evaluation_strategy="steps",  # Evaluate on a step interval (see eval_steps)
    save_strategy="steps",  # Checkpoint on a step interval (see save_steps)
    save_steps=20,  # Steps interval for saving model checkpoint
    eval_steps=20,  # Steps interval for evaluation
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 batches before each optimizer update
    fp16=True,  # Use 16-bit (mixed) precision training
    learning_rate=2e-5,  # The initial learning rate for AdamW optimizer
    load_best_model_at_end=True,  # Load the best model when finished training
    lr_scheduler_type="cosine_with_restarts",  # Cosine annealing scheduler with restarts
    save_total_limit=2,  # Limit the total amount of checkpoints. Deletes the older checkpoints.
    metric_for_best_model="f1",  # Matches the 'f1' key returned by compute_f1
    greater_is_better=True,  # F1 score should be maximized
    max_grad_norm=1.0,  # Gradient clipping threshold (max gradient norm)
)

# Train model with cross-validation
import math

# Histories are concatenated across folds (fold 1 entries, then fold 2, ...).
all_train_losses = []
all_eval_losses = []
# Trainer only computes metrics on the eval set, so this list stays empty;
# it is kept because later cells reference the name.
all_train_f1_scores = []
all_eval_f1_scores = []

for train_dataset_fold, val_dataset_fold in datasets:
    # Initialize a new model for each fold
    model = BertForSequenceClassificationWithDropout.from_pretrained('bert-large-uncased', num_labels=5)
    model.to(device)

    # Number of optimizer updates the scheduler will see. One update happens
    # every `gradient_accumulation_steps` batches, so the raw batch count
    # must be divided by it; otherwise the linear decay is stretched over
    # ~4x too many steps and the learning rate never anneals to zero.
    batches_per_epoch = math.ceil(len(train_dataset_fold) / training_args.train_batch_size)
    updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
    num_train_steps = math.ceil(training_args.num_train_epochs * updates_per_epoch)

    # Create optimizer and learning rate scheduler.
    # NOTE(review): this explicit pair takes precedence over the optimizer and
    # `lr_scheduler_type` configured in `training_args`.
    optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=training_args.warmup_steps,
        num_training_steps=num_train_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_fold,
        eval_dataset=val_dataset_fold,
        compute_metrics=compute_f1,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=15)],
        optimizers=(optimizer, scheduler)
    )

    train_result = trainer.train()

    # Extract loss and F1 values from the logs. Trainer prefixes the keys
    # returned by `compute_metrics` with `eval_`, so F1 appears as `eval_f1`.
    for log in trainer.state.log_history:
        if 'loss' in log:
            all_train_losses.append(log['loss'])
        if 'eval_loss' in log:
            all_eval_losses.append(log['eval_loss'])
        if 'eval_f1' in log:
            all_eval_f1_scores.append(log['eval_f1'])

    # Evaluate the model on the validation set
    eval_result = trainer.evaluate()
    print(f"Evaluation result: {eval_result}")


def _mean_or_nan(values):
    """Mean of `values`, or NaN for an empty list (avoids numpy's empty-slice warning)."""
    return float(np.mean(values)) if len(values) else float('nan')


# Print summary statistics
print(f"Average train loss: {_mean_or_nan(all_train_losses):.4f}")
print(f"Average eval loss: {_mean_or_nan(all_eval_losses):.4f}")
print(f"Average train F1 score: {_mean_or_nan(all_train_f1_scores):.4f}")
print(f"Average eval F1 score: {_mean_or_nan(all_eval_f1_scores):.4f}")



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassificationWithDropout were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
20,1.7147,1.64441,0.137876
40,1.6351,1.581078,0.233492
60,1.5741,1.516528,0.289801
80,1.4999,1.417321,0.448662
100,1.3501,1.233041,0.505549
120,1.1784,1.101497,0.535111
140,1.0645,0.962895,0.614257
160,0.9513,0.916247,0.64277
180,0.9577,0.853908,0.670149
200,0.8356,0.822837,0.692804


In [None]:
import matplotlib.pyplot as plt

# Loss curves: logged training loss vs. validation loss, all folds back to back
fig, ax = plt.subplots()
ax.plot(all_train_losses, label='Training Loss')
ax.plot(all_eval_losses, label='Validation Loss')
ax.set_xlabel('Logging Steps')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Cross-Validation Folds')
ax.legend()
plt.show()

In [None]:
# F1 curve on its own full-width axes. The previous `plt.subplot(1, 2, 2)`
# drew into the right half of a two-panel grid whose left half was empty.
fig, ax = plt.subplots()
# No F1 is computed on the training set during training, so that series is
# normally empty; only draw it when there is something to show.
if len(all_train_f1_scores):
    ax.plot(all_train_f1_scores, label='Train F1')
ax.plot(all_eval_f1_scores, label='Eval F1')
ax.set_title('F1 Score Curves')
ax.set_xlabel('Step')
ax.set_ylabel('F1 Score')
ax.legend()

fig.tight_layout()
plt.show()

### Save Model (Google Colab only; requires a mounted Google Drive)

In [None]:
# Mount Google Drive at /content/drive so the save cell below can write to it.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

project_name = "NMA-GoEmotions-Project"
model_save_path = f'/content/drive/MyDrive/@NMA_Projects/{project_name}/{project_name}/model'

source_path = './results/'

# Create the destination directory if it doesn't exist
os.makedirs(model_save_path, exist_ok=True)

# `model` is whatever the last cross-validation fold left behind.
# The optimizer was built over `model.parameters()`, i.e. the whole encoder
# was fine-tuned, so the classifier head alone cannot reproduce the trained
# model on top of the stock checkpoint. Save the complete model first.
model.save_pretrained(model_save_path)

# Also keep the classifier-head-only file under its original name, in case
# other code loads it.
classifier_weights = model.classifier.state_dict()
classifier_weights_path = os.path.join(model_save_path, 'classifier_weights_bert_newmapping_f1.pt')
torch.save(classifier_weights, classifier_weights_path)

print(f"Model saved to Google Drive at: {model_save_path}")