# Emotion Classification with BERT

following https://inside-machinelearning.com/en/efficient-sentences-embedding-visualization-tsne/

Dataset: [GoEmotions](https://ai.googleblog.com/2021/10/goemotions-dataset-for-fine-grained.html)

## 1. Data

In [4]:
import os
import wget
import pandas as pd

In [None]:
url = "https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/"
files = ["goemotions_1.csv", "goemotions_2.csv", "goemotions_3.csv"]

for file in files:
    if not os.path.exists(os.path.join("data", file)):
        wget.download(url + file, "data")

In [5]:
# Create data frames
df1 = pd.read_csv("data/goemotions_1.csv")
df2 = pd.read_csv("data/goemotions_2.csv")
df3 = pd.read_csv("data/goemotions_3.csv")
df = pd.concat([df1, df2, df3], sort=False)

# Clean data
df.drop(["id", "link_id", "parent_id", "created_utc", "rater_id", "example_very_unclear"], axis=1, inplace=True)
df.drop_duplicates(subset=["text", "author", "subreddit"], inplace=True)
df.reset_index(inplace=True, drop=True)

# Extract text and emotion to new data frame
df_analysis = df[["text"]]
df_analysis["text_emotion"] = df.drop(columns=["text", "author", "subreddit"], axis=1, inplace=False).idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_analysis["text_emotion"] = df.drop(columns=["text", "author", "subreddit"], axis=1, inplace=False).idxmax(axis=1)


In [6]:
df.head(1)

Unnamed: 0,text,author,subreddit,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,Brdd9,nrl,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
df_analysis.head(1)

Unnamed: 0,text,text_emotion
0,That game hurt.,sadness


In [8]:
# Create subset to save computing time (7 out of 27 emotions with 1000 sentences each)
df_analysis = pd.concat([
    df_analysis[df_analysis["text_emotion"] == "joy"].iloc[:1000],
    df_analysis[df_analysis["text_emotion"] == "sadness"].iloc[:1000],
    df_analysis[df_analysis["text_emotion"] == "curiosity"].iloc[:1000],
    df_analysis[df_analysis["text_emotion"] == "neutral"].iloc[:1000],
    df_analysis[df_analysis["text_emotion"] == "love"].iloc[:1000],
    df_analysis[df_analysis["text_emotion"] == "amusement"].iloc[:1000],
    df_analysis[df_analysis["text_emotion"] == "embarrassment"].iloc[:1000],
])

## 2. Transfer Learning

In [2]:
import tensorflow_hub as hub
import tokenization
import numpy as np
import tensorflow as tf
from typing import Union
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

In [9]:
# Load model
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [10]:
def bert_encode(texts: list[str],
                tokenizer: Union[tokenization.FullTokenizer,
                                 tokenization.BasicTokenizer,
                                 tokenization.WordpieceTokenizer,
                                 tokenization.FullSentencePieceTokenizer],
                max_len: int = 512) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Encode given texts to fit BERT.
    
    Args:
        texts: List of texts to be encoded
        tokenizer: Tokenizer object to be used for tokenization
        max_len: Maximum input length

    Returns:
        Tuple of tokens, masks, and segments
    """
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        text = tokenizer.tokenize(text)

        text = text[:max_len - 2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len

        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

train_input = bert_encode(texts=df_analysis.text.values, tokenizer=tokenizer, max_len=100)
train_labels = df_analysis.text_emotion.values

In [None]:
def build_model(bert_layer: hub.KerasLayer, max_len: int=512) -> Model:
    """Build BERT TensorFlow model.
    
    Args:
        bert_layer: Pre-trained BERT layers
        max_len: Maximum input length (shape of the input layer)

    Returns:
        Compiled TensorFlow model
    """
    # Input layer
    input_word_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # BERT layer
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]

    # Embedding layer
    flatten = layers.Flatten(name="flatten")
    output_flatten = flatten(clf_output)

    # Output layer
    out = layers.Dense(len(np.unique(train_labels)), activation="sigmoid")(output_flatten)

    # Model
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(learning_rate=2e-6), loss="binary_crossentropy", metrics=["accuracy"])

    return model

model = build_model(bert_layer, max_len=100)

In [None]:
# Create one-hot encoding for emotions
label_dummy = pd.get_dummies(train_labels)
label_dummy.head(2)

#### Alternative 1: Train model locally

In [None]:
# Train model
train_history = model.fit(
    x=train_input,
    y=label_dummy,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

#### Alternative 2: Train model on Colab, download and import it

https://colab.research.google.com/drive/1ft8j2NLP9PSfSjYu9RfZo2EKt86GnBUy

In [11]:
model = tf.keras.models.load_model("models/EmoBERT")

# 3. TSNE

In [12]:
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE

In [14]:
# Re-create model up to the "flatten" layer
intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer("flatten").output)

In [15]:
# Get sentence embeddings
sentence_embedded = intermediate_layer_model.predict(x=train_input, verbose=1)



In [16]:
# Get emotions
labels_emotion = df_analysis.text_emotion

In [17]:
# Check if the shapes fit
assert sentence_embedded.shape[0] == labels_emotion.shape[0]

In [18]:
# Reduce embedding dimensions via T-SNE
X = list(sentence_embedded)
X_embedded = TSNE(n_components=2).fit_transform(X)



In [19]:
# Create a dataframe containing the 2D sentence embeddings and their emotions
df_embeddings = pd.DataFrame(X_embedded)
df_embeddings = df_embeddings.rename(columns={0: "x", 1: "y"})
df_embeddings = df_embeddings.assign(label=df_analysis.text_emotion.values)
df_embeddings = df_embeddings.assign(text=df_analysis.text.values) # unmodified base sentences for easier visualization

In [20]:
# Display embeddings
fig = px.scatter(data_frame=df_embeddings,
                 x="x",
                 y="y",
                 color="label",
                 labels={"color": "label"},
                 hover_data=["text"],
                 title="GoEmotions Embedding Visualization")

fig.show()