## Data Collection

In [6]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer

def load_data(data_url):
    return pd.read_csv(data_url, sep='\t')

# URLs for train
train_data_url = 'https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv'
train_data = load_data(train_data_url)
train_data.head(3)

Unnamed: 0,My favourite food is anything I didn't have to cook myself.,27,eebbqej
0,"Now if he does off himself, everyone will thin...",27,ed00q6i
1,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
2,To make her feel threatened,14,ed7ypvh


# Preprocessing

### Keep relevant columns and remove some rows

In [7]:
# Preprocess the data: rename columns and remove multiple emotions
def preprocess_data(data):
    # Define the header names
    header = ["comment", "emotion", "id"]
    # Rename the columns
    data.columns = header
    # Keep only the 'comment' and 'emotion' columns
    data = data[['comment', 'emotion']]
    # Filter out rows with multiple emotions (emotions separated by commas)
    data = data[data['emotion'].apply(lambda x: len(x.split(',')) == 1)]
    # Clean the 'emotion' column by keeping only digit characters and converting to integers
    data['emotion'] = data['emotion'].apply(lambda x: ''.join(filter(str.isdigit, str(x)))).astype(int)
    return data

preprocessed_data = preprocess_data(train_data)
preprocessed_data.head(3)

Unnamed: 0,comment,emotion
0,"Now if he does off himself, everyone will thin...",27
1,WHY THE FUCK IS BAYLESS ISOING,2
2,To make her feel threatened,14


### Filter top emotions for testing

In [8]:
# Filter the data to include only the top N most common emotions
def filter_top_emotions(data, top_n=2):
    # Count the occurrences of each emotion
    emotion_counts = data['emotion'].value_counts()
    # Identify the top N most common emotions
    top_emotions = emotion_counts.head(top_n).index
    # Create a mapping from the original emotion labels to new labels (0, 1, ...)
    label_mapping = {label: new_label for new_label, label in enumerate(top_emotions)}
    # Filter the data to include only the rows with the top N emotions
    filtered_data = data[data['emotion'].isin(top_emotions)].copy()
    # Apply the new label mapping to the 'emotion' column
    filtered_data.loc[:, 'emotion'] = filtered_data['emotion'].map(label_mapping)
    return filtered_data, label_mapping

filtered_train_data, label_mapping = filter_top_emotions(preprocessed_data)
filtered_train_data.head(3)

# Map the new labels to their original emotion names
emotions_dict = {
    0: "admiration", 1: "amusement", 2: "anger", 3: "annoyance", 4: "approval",
    5: "caring", 6: "confusion", 7: "curiosity", 8: "desire", 9: "disappointment",
    10: "disapproval", 11: "disgust", 12: "embarrassment", 13: "excitement", 14: "fear",
    15: "gratitude", 16: "grief", 17: "joy", 18: "love", 19: "nervousness",
    20: "optimism", 21: "pride", 22: "realization", 23: "relief", 24: "remorse",
    25: "sadness", 26: "surprise", 27: "neutral"
}
top_emotions_dict = {label_mapping[k]: emotions_dict[k] for k in label_mapping}

print("Top 2 most common emotions:", [top_emotions_dict[e] for e in sorted(top_emotions_dict)])

Top 2 most common emotions: ['neutral', 'admiration']


### Sample a fraction of the data for each emotion category

In [9]:
# Sample a fraction of the data for each emotion category
def sample_data(data, fraction=0.1):
    # Group the data by 'emotion' and sample a specified fraction of each group
    return data.groupby('emotion', group_keys=False).apply(lambda x: x.sample(frac=fraction)).reset_index(drop=True)

sampled_train_data = sample_data(filtered_train_data)
sampled_train_data.head(3)

  return data.groupby('emotion', group_keys=False).apply(lambda x: x.sample(frac=fraction)).reset_index(drop=True)


Unnamed: 0,comment,emotion
0,You just can’t deal with the fact that I’ve go...,0
1,"Maybe if you held onto the intestines, eventua...",0
2,Define woman please if you're not going to use...,0
