# Introduction

# 1 - Importing libraries and loading data

## 1.1 - Installing and importing libraries

First, let's install the `transformers` library which contains thousands of pre-trained models, including BERT.

In [6]:
# !pip install transformers
# !pip install emoji
# !pip install contractions
# !pip install tensorflow
!pip install --upgrade numpy



Collecting numpy
  Downloading numpy-1.24.2-cp39-cp39-macosx_10_9_x86_64.whl (19.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0mm00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.2 which is incompatible.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.2 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.24.2


In [7]:
# Data manipulation libraries
import sys, os
import pandas as pd
import numpy as np
import json

import emoji
import contractions
import re

# Scikit-learn packages
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight

# Packages to define a BERT model
from transformers import TFBertModel, BertTokenizerFast, BertConfig

# Keras and TensorFlow packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import TruncatedNormal

RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

## 1.2 - Loading datasets and lists of emotions

First, let's load our clean data.

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Importing train, validation and test datasets with preprocessed texts and labels
train_GE = pd.read_csv("train_clean.csv")
val_GE = pd.read_csv("val_clean.csv")
test_GE = pd.read_csv("test_clean.csv")

# Shape validation
print(train_GE.shape)
print(val_GE.shape)
print(test_GE.shape)

(43410, 29)
(5426, 29)
(5427, 29)


Let's also load the lists of emotions from GoEmotions and Ekman taxonomies **excluding the the 'Neutral' emotion** this time.

In [None]:
# Loading emotion labels for GoEmotions taxonomy
with open("emotions.txt", "r") as file:
    GE_taxonomy = file.read().split("\n")
GE_taxonomy.remove('neutral')
print("Emotions on GoEmotions taxonomy are : \n{}".format(GE_taxonomy))

print()

# Loading emotion labels for Ekman taxonomy
with open("ekman_labels.txt", "r") as file:
    Ekman_taxonomy = file.read().split("\n")
Ekman_taxonomy.remove('neutral')
print("Emotions on Ekman taxonomy are : \n{}".format(Ekman_taxonomy))

Emotions on GoEmotions taxonomy are : 
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']

Emotions on Ekman taxonomy are : 
['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', '']


## 1.3 - Filtering out the 'Neutral' only samples

First, we need to drop the 'Neutral' emotion from all datasets.

In [None]:
train_GE = train_GE.drop(columns=['neutral'])
val_GE = val_GE.drop(columns=['neutral'])
test_GE = test_GE.drop(columns=['neutral'])

Then, we need remove all the samples that have been left without a label.

In [None]:
# Removing samples with only 0 in their labels
train_GE = train_GE.loc[ train_GE.apply(lambda x: sum(x[1:]), axis=1)>0 ]
val_GE = val_GE.loc[ val_GE.apply(lambda x: sum(x[1:]), axis=1)>0 ]
test_GE = test_GE.loc[ test_GE.apply(lambda x: sum(x[1:]), axis=1)>0 ]

# Shape validation
print(train_GE.shape)
print(val_GE.shape)
print(test_GE.shape)

(30587, 28)
(3834, 28)
(3821, 28)


In [None]:
# Preview of data
display(train_GE.head(3))

Unnamed: 0,Clean_text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise
2,why the fuck is bayless isoing,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,to make her feel threatened,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dirty southern wankers,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Doing so, we have **decreased the number of samples by nearly 30%** of the original data.

#2 - Modeling : BERT (Bidirectional Encoder Representations from Transformers)

Now we can go ahead and start defining our BERT-based model.

##2.1 - Configuration of the base model

First of all, let's define a `max_length` variable. This variable sets a fixed length of sequences to be fed to our model. Therefore, sequences will be either truncated if larger than this value, or completed using padding if smaller. To avoid truncating, we fix this value according to the largest sample of our data.

In [None]:
# Computing max length of samples
full_text = pd.concat([train_GE['Clean_text'], val_GE['Clean_text'], test_GE['Clean_text']])
max_length = full_text.apply(lambda x: len(x.split())).max()
max_length

48

We are going to use BERT's base model which contains almost 110 M trainable parameters. 

Also, in order to match the tokenization and vocabulary used during the training, we are going to use a BERT tokenizer.

In [None]:
# Importing BERT pre-trained model and tokenizer
model_name = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## 2.2 - Definition of the model architecture

Now that everything is in place, we can create a model based on BERT's main layer, and replace the top layers to reach our main objective (multi-label classification accross **27 possible emotions**).

Our model takes three inputs that result from tokenization:

*   `input_ids`: indices of input sequence tokens in the vocabulary
*   `token_ids`: Segment token indices to indicate first and second portions of the inputs.   0 for sentence A and 1 for sentence B
*   `attention mask`: Mask to avoid performing attention on padding token indices.  0 for masked and 1 for not masked



In [None]:
# function for creating BERT based model
def create_model(nb_labels):
  # Load the MainLayer
  bert = transformer_model.layers[0]

  # Build the model inputs
  input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
  attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
  token_ids = Input(shape=(max_length,), name='token_ids', dtype='int32')
  inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

  # Load the Transformers BERT model as a layer in a Keras model
  bert_model = bert(inputs)[1]
  dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
  pooled_output = dropout(bert_model, training=False)

  # Then build the model output
  emotion = Dense(units=nb_labels, activation="sigmoid", kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='emotion')(pooled_output)
  outputs = emotion

  # And combine it all in a model object
  model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel')

  return model

We use here a `sigmoid` activation function in the last dense layer that is better suited than a `softmax` activation function. In fact, `softmax` shrinks output probabilities for each label so that the sum of probabilities is 1. In our case, each label (emotion) can independently have a probability between 0 and 1, and `sigmoid` allows that.

We can now create our model using 27 labels and visualize a summary.

In [None]:
# Creating a model instance
model = create_model(27)

# Take a look at the model
model.summary()

Model: "BERT_MultiLabel"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask (InputLayer)    [(None, 48)]         0           []                               
                                                                                                  
 input_ids (InputLayer)         [(None, 48)]         0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['attention_mask[0][0]',         
                                thPoolingAndCrossAt               'input_ids[0][0]']              
                                tentions(last_hidde                                               
                                n_state=(None, 48,                                  

##2.3 - Data preprocessing and model training

###2.3.1 - Tokenizing data

Let's go ahead and process our data. We will first separate texts from labels in the train, validation and test datasets, and then tokenize the texts using the BERT tokenizer.

In [None]:
# Creating train, validation and test variables
X_train = train_GE['Clean_text']
y_train = train_GE.loc[:, GE_taxonomy].values.astype(float)

X_val = val_GE['Clean_text']
y_val = val_GE.loc[:, GE_taxonomy].values.astype(float)

X_test = test_GE['Clean_text']
y_test = test_GE.loc[:, GE_taxonomy].values.astype(float)

# Tokenizing train data
train_token = tokenizer(
    text = X_train.to_list(),
    add_special_tokens = True,
    max_length = max_length,
    truncation = True,
    padding = 'max_length', 
    return_tensors = 'tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

# Tokenizing valisation data
val_token = tokenizer(
    text = X_val.to_list(),
    add_special_tokens = True,
    max_length = max_length,
    truncation = True,
    padding = 'max_length', 
    return_tensors = 'tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

# Tokenizing test data
test_token = tokenizer(
    text = X_test.to_list(),
    add_special_tokens = True,
    max_length = max_length,
    truncation = True,
    padding = 'max_length', 
    return_tensors = 'tf',
    return_token_type_ids = True,
    return_attention_mask = True,
    verbose = True)

In [None]:
# Creating BERT compatible inputs with Input Ids, attention masks and token Ids 
train = {'input_ids': train_token['input_ids'], 'attention_mask': train_token['attention_mask'],'token_ids': train_token['token_type_ids']}
val = {'input_ids': val_token['input_ids'], 'attention_mask': val_token['attention_mask'],'token_ids': val_token['token_type_ids']}
test = {'input_ids': test_token['input_ids'], 'attention_mask': test_token['attention_mask'],'token_ids': test_token['token_type_ids']}

During the training phase, we our going to use batches of 16 samples. After each epoch, data will be shuffled. Let's create TensorFlow tensors accordingly.

In [None]:
# Creating TF tensors
train_tensor = tf.data.Dataset.from_tensor_slices((train, y_train)).shuffle(len(train)).batch(16)
val_tensor = tf.data.Dataset.from_tensor_slices((val, y_val)).shuffle(len(val)).batch(16)
test_tensor = tf.data.Dataset.from_tensor_slices((test, y_test)).shuffle(len(test)).batch(16)

### 2.3.2 - Class weights for multi-label and custom loss function

Training requires to monitor the loss function and eventually some other metrics to see how the model behaves throughout the epochs.

Therefore, we need to define a weighted loss function that takes into account  class weights in our multi-label case.

First, we need to compute class weights.

In [None]:
# Function for calculating multilabel class weights
def calculating_class_weights(y_true):
    number_dim = np.shape(y_true)[1]
    weights = np.empty([number_dim, 2])
    for i in range(number_dim):
        weights[i] = compute_class_weight(class_weight='balanced', classes=[0.,1], y=y_true[:, i])
    return weights
    
class_weights = calculating_class_weights(y_train)

Then, we can define a custom crossentropy function in which we multiply the weights.

In [None]:
# Custom loss function for multilabel
def get_weighted_loss(weights):
    def weighted_loss(y_true, y_pred):
        return K.mean((weights[:,0]**(1-y_true))*(weights[:,1]**(y_true))*K.binary_crossentropy(y_true, y_pred), axis=-1)
    return weighted_loss

# 2.3.3 - Model training

In [None]:
# Set an optimizer
optimizer = Adam(
    learning_rate=3.e-05,
    )

# Set loss
loss = get_weighted_loss(class_weights)

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss)

# train the model
history = model.fit(train_tensor, 
                    epochs=1, 
                    validation_data=val_tensor,
                    )



## 2.4 - Model evaluation

### 2.4.1 - Evaluation on GoEmotions taxonomy

In [22]:
# Save model weights
model.save_weights('/content/drive/MyDrive/Goemotion/bert-weights.hdf5')

Let's generate predictions on test data.

In [23]:
# Making probability predictions on test data
y_pred_proba = model.predict(test)



When making predictions, we only generate probabilities associated with each label. To predict actual labels, we need to add an additional step that transforms these probabilities into labels given a certain threshold.

We define a function to do so with a default threshold set to 0.8.

In [24]:
# from probabilities to labels using a given threshold
def proba_to_labels(y_pred_proba, threshold=0.8):
    
    y_pred_labels = np.zeros_like(y_pred_proba)
    
    for i in range(y_pred_proba.shape[0]):
        for j in range(y_pred_proba.shape[1]):
            if y_pred_proba[i][j] > threshold:
                y_pred_labels[i][j] = 1
            else:
                y_pred_labels[i][j] = 0
                
    return y_pred_labels

In [25]:
# Generate labels
y_pred_labels = proba_to_labels(y_pred_proba)

Let's evaluate these predictions using the evaluation function we defined in the previous notebooks.

In [26]:
# Model evaluation function 
def model_eval(y_true, y_pred_labels, emotions):
    
    # Defining variables
    precision = []
    recall = []
    f1 = []
    
    # Per emotion evaluation      
    idx2emotion = {i: e for i, e in enumerate(emotions)}
    
    for i in range(len(emotions)):
   
        # Computing precision, recall and f1-score
        p, r, f1_score, _ = precision_recall_fscore_support(y_true[:, i], y_pred_labels[:, i], average="binary")
        
        # Append results in lists
        precision.append(round(p, 2))
        recall.append(round(r, 2))
        f1.append(round(f1_score, 2))
    
    # Macro evaluation
    macro_p, macro_r, macro_f1_score, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
    
    # Append results in lists
    precision.append(round(macro_p, 2))
    recall.append(round(macro_r, 2))
    f1.append(round(macro_f1_score, 2))
    
    # Converting results to a dataframe
    df_results = pd.DataFrame({"Precision":precision, "Recall":recall, 'F1':f1})
    df_results.index = emotions+['MACRO-AVERAGE']
    
    return df_results

In [27]:
# Model evaluation
model_eval(y_test, y_pred_labels, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.76,0.58,0.65
amusement,0.76,0.94,0.84
anger,0.41,0.69,0.51
annoyance,0.34,0.52,0.41
approval,0.62,0.32,0.42
caring,0.37,0.64,0.47
confusion,0.32,0.75,0.45
curiosity,0.54,0.83,0.65
desire,0.39,0.6,0.48
disappointment,0.27,0.37,0.31


Looking at the results, we see that this model performs better than the previous one. It looks like **removing the noise brought by the 'Neutral' emotion helped to better distinguish the other emotions.**

### 2.4.2 - Threshold optimization

In the initial evaluation, we set an aribitrary threshold. However, we can also choose a threshold that maximizes a certain metric. 

We define a function that tests a certain number of possible thresholds, and returns the best threshold together with the best predicted labels and best macro f1-score.

In [28]:
# Function that computes labels from probabilities and optimizes the threshold that maximizes f1-score
def proba_to_labels_opt(y_true, y_pred_proba):
    
    '''
    Inputs:
        y_true: Ground truth labels
        y_pred_proba: predicted probabilities
        
    Outputs :
        best_y_pred_labels: preticted labels associated with best threshold
        best_t: best threshold
        best_macro_f1: macro f1-score associated with predicted labels
    '''
    
    # range of possible thresholds
    thresholds = np.arange(0.7, 0.99, 0.01)
    
    # Computing threshold that maximizes macro f1-score 
    best_y_pred_labels = np.zeros_like(y_pred_proba)
    best_t = 0
    best_macro_f1 = 0
    
    # Iterating through possible thresholds
    for t in thresholds:
        
        y_pred_labels = proba_to_labels(y_pred_proba, t)
                             
        _, _, macro_f1, _ = precision_recall_fscore_support(y_true, y_pred_labels, average="macro")
        
        if macro_f1 > best_macro_f1:
            best_macro_f1 = macro_f1
            best_t = t
            best_y_pred_labels = y_pred_labels
            
    return best_y_pred_labels, best_t, best_macro_f1

We can now apply this function to our predicted probabilities and compute optimized label predictions.

In [29]:
# Compute label predictions and corresponding optimal thresholds 
y_pred_labels_opt, threshold_opt, macro_f1_opt = proba_to_labels_opt(y_test, y_pred_proba)
print("The model's threshold is {}".format(threshold_opt))
print("The model's best macro-f1 is {}".format(macro_f1_opt))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The model's threshold is 0.8800000000000001
The model's best macro-f1 is 0.47905239804103694


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Model evaluation : Precision, Recall, F-score
model_eval(y_test, y_pred_labels_opt, GE_taxonomy)

Unnamed: 0,Precision,Recall,F1
admiration,0.81,0.44,0.57
amusement,0.83,0.87,0.85
anger,0.52,0.55,0.54
annoyance,0.48,0.27,0.35
approval,0.79,0.14,0.24
caring,0.43,0.51,0.47
confusion,0.42,0.49,0.45
curiosity,0.63,0.73,0.68
desire,0.55,0.46,0.5
disappointment,0.39,0.16,0.23


**Optimizing the threshold** helped us to **slightly improve** the model predictions.

### 2.4.4 - Indirect evaluation on Ekman taxonomy by mapping predictions

Until now, we have only evaluated our model on the GoEmotions taxonomy.

As a reference, we can try to map the true and predicted emotions to the Ekman taxonomy and see how our model performs.

We have already defined the Ekman taxonomy earlier.

Let's define a function that transforms labels from GoEmotions to Ekman taxonomy.

In [31]:
# Function thats maps predictions on GoEmotions taxonomy to Ekman taxonomy
def GE_to_Ekman(GE_labels):
    
    # Create a dataframe of GoEmotions labels
    df_GE = pd.DataFrame(GE_labels, columns=GE_taxonomy)

    # Create an empty dataframe of Ekman labels
    df_Ekman  = pd.DataFrame(np.zeros((len(GE_labels), len(Ekman_taxonomy))), columns=Ekman_taxonomy)

    for i in range(len(df_GE)):

        if df_GE.loc[i,['anger', 'annoyance', 'disapproval']].sum() >= 1:
            df_Ekman.loc[i,'anger'] = 1

        if df_GE.loc[i,'disgust'].sum() >= 1:
            df_Ekman.loc[i,'disgust'] = 1

        if df_GE.loc[i,['fear', 'nervousness']].sum() >= 1:
            df_Ekman.loc[i,'fear'] = 1

        if df_GE.loc[i,['joy', 'amusement', 'approval', 'excitement', 'gratitude',
                        'love', 'optimism', 'relief', 'pride', 'admiration', 'desire','caring']].sum() >= 1:
            df_Ekman.loc[i,'joy'] = 1 

        if df_GE.loc[i,['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse']].sum() >= 1:
            df_Ekman.loc[i,'sadness'] = 1

        if df_GE.loc[i,['surprise', 'realization', 'confusion', 'curiosity']].sum() >= 1:
            df_Ekman.loc[i,'surprise'] = 1

    return df_Ekman.values

We can now apply our function and evaluate the predictions

In [32]:
# Mapping GoEmotion labels to Ekman labels (true and predictions)
y_test_Ekman = GE_to_Ekman(y_test)
y_pred_labels_Ekman = GE_to_Ekman(y_pred_labels_opt)

# Evaluation
model_eval(y_test_Ekman, y_pred_labels_Ekman, Ekman_taxonomy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Precision,Recall,F1
anger,0.79,0.47,0.59
disgust,0.34,0.73,0.47
fear,0.53,0.82,0.64
joy,0.93,0.76,0.84
sadness,0.65,0.51,0.57
surprise,0.7,0.65,0.68
,0.0,0.0,0.0
MACRO-AVERAGE,0.56,0.56,0.54


## 2.5 - Make predictions

To make predictions on a new sample, it needs to be processed using all the different precessing steps we used.

In [33]:
# Retrieving initial preprocessings
def preprocess_corpus(x):
    
    # Adding a space between words and punctation
    x = re.sub( r'([a-zA-Z\[\]])([,;.!?])', r'\1 \2', x)
    x = re.sub( r'([,;.!?])([a-zA-Z\[\]])', r'\1 \2', x)

    # Demojize
    x = emoji.demojize(x)

    # Expand contraction
    x = contractions.fix(x)

    # Lower
    x = x.lower()

    #correct some acronyms/typos/abbreviations  
    x = re.sub(r"lmao", "laughing my ass off", x)  
    x = re.sub(r"amirite", "am i right", x)
    x = re.sub(r"\b(tho)\b", "though", x)
    x = re.sub(r"\b(ikr)\b", "i know right", x)
    x = re.sub(r"\b(ya|u)\b", "you", x)
    x = re.sub(r"\b(eu)\b", "europe", x)
    x = re.sub(r"\b(da)\b", "the", x)
    x = re.sub(r"\b(dat)\b", "that", x)
    x = re.sub(r"\b(dats)\b", "that is", x)
    x = re.sub(r"\b(cuz)\b", "because", x)
    x = re.sub(r"\b(fkn)\b", "fucking", x)
    x = re.sub(r"\b(tbh)\b", "to be honest", x)
    x = re.sub(r"\b(tbf)\b", "to be fair", x)
    x = re.sub(r"faux pas", "mistake", x)
    x = re.sub(r"\b(btw)\b", "by the way", x)
    x = re.sub(r"\b(bs)\b", "bullshit", x)
    x = re.sub(r"\b(kinda)\b", "kind of", x)
    x = re.sub(r"\b(bruh)\b", "bro", x)
    x = re.sub(r"\b(w/e)\b", "whatever", x)
    x = re.sub(r"\b(w/)\b", "with", x)
    x = re.sub(r"\b(w/o)\b", "without", x)
    x = re.sub(r"\b(doj)\b", "department of justice", x)

    # replace some words with multiple occurences of a letter, example "coooool" turns into --> cool
    x = re.sub(r"\b(j+e{2,}z+e*)\b", "jeez", x)
    x = re.sub(r"\b(co+l+)\b", "cool", x)
    x = re.sub(r"\b(g+o+a+l+)\b", "goal", x)
    x = re.sub(r"\b(s+h+i+t+)\b", "shit", x)
    x = re.sub(r"\b(o+m+g+)\b", "omg", x)
    x = re.sub(r"\b(w+t+f+)\b", "wtf", x)
    x = re.sub(r"\b(w+h+a+t+)\b", "what", x)
    x = re.sub(r"\b(y+e+y+|y+a+y+|y+e+a+h+)\b", "yeah", x)
    x = re.sub(r"\b(w+o+w+)\b", "wow", x)
    x = re.sub(r"\b(w+h+y+)\b", "why", x)
    x = re.sub(r"\b(s+o+)\b", "so", x)
    x = re.sub(r"\b(f)\b", "fuck", x)
    x = re.sub(r"\b(w+h+o+p+s+)\b", "whoops", x)
    x = re.sub(r"\b(ofc)\b", "of course", x)
    x = re.sub(r"\b(the us)\b", "usa", x)
    x = re.sub(r"\b(gf)\b", "girlfriend", x)
    x = re.sub(r"\b(hr)\b", "human ressources", x)
    x = re.sub(r"\b(mh)\b", "mental health", x)
    x = re.sub(r"\b(idk)\b", "i do not know", x)
    x = re.sub(r"\b(gotcha)\b", "i got you", x)
    x = re.sub(r"\b(y+e+p+)\b", "yes", x)
    x = re.sub(r"\b(a*ha+h[ha]*|a*ha +h[ha]*)\b", "haha", x)
    x = re.sub(r"\b(o?l+o+l+[ol]*)\b", "lol", x)
    x = re.sub(r"\b(o*ho+h[ho]*|o*ho +h[ho]*)\b", "ohoh", x)
    x = re.sub(r"\b(o+h+)\b", "oh", x)
    x = re.sub(r"\b(a+h+)\b", "ah", x)
    x = re.sub(r"\b(u+h+)\b", "uh", x)

    # Handling emojis
    x = re.sub(r"<3", " love ", x)
    x = re.sub(r"xd", " smiling_face_with_open_mouth_and_tightly_closed_eyes ", x)
    x = re.sub(r":\)", " smiling_face ", x)
    x = re.sub(r"^_^", " smiling_face ", x)
    x = re.sub(r"\*_\*", " star_struck ", x)
    x = re.sub(r":\(", " frowning_face ", x)
    x = re.sub(r":\^\(", " frowning_face ", x)
    x = re.sub(r";\(", " frowning_face ", x)
    x = re.sub(r":\/",  " confused_face", x)
    x = re.sub(r";\)",  " wink", x)
    x = re.sub(r">__<",  " unamused ", x)
    x = re.sub(r"\b([xo]+x*)\b", " xoxo ", x)
    x = re.sub(r"\b(n+a+h+)\b", "no", x)
    
    # Handling special cases of text
    x = re.sub(r"h a m b e r d e r s", "hamberders", x)
    x = re.sub(r"b e n", "ben", x)
    x = re.sub(r"s a t i r e", "satire", x)
    x = re.sub(r"y i k e s", "yikes", x)
    x = re.sub(r"s p o i l e r", "spoiler", x)
    x = re.sub(r"thankyou", "thank you", x)
    x = re.sub(r"a^r^o^o^o^o^o^o^o^n^d", "around", x)

    # Remove special characters and numbers replace by space + remove double space
    x = re.sub(r"\b([.]{3,})"," dots ", x)
    x = re.sub(r"[^A-Za-z!?_]+"," ", x)
    x = re.sub(r"\b([s])\b *","", x)
    x = re.sub(r" +"," ", x)
    x = x.strip()

    return x     

Now we can define a prediction function that takes one or more samples, and outputs the detected emotions from the model.

In [34]:
def predict_samples(text_samples, model, threshold):
    
    # Text preprocessing and cleaning
    text_samples_clean = [preprocess_corpus(text) for text in text_samples]
    
    # Tokenizing train data
    samples_token = tokenizer(
        text = text_samples_clean,
        add_special_tokens = True,
        max_length = max_length,
        truncation = True,
        padding = 'max_length', 
        return_tensors = 'tf',
        return_token_type_ids = True,
        return_attention_mask = True,
        verbose = True,
    )
    
    # Preparing to feed the model
    samples = {'input_ids': samples_token['input_ids'],
               'attention_mask': samples_token['attention_mask'],
               'token_ids': samples_token['token_type_ids']
              }
    
    # Probability predictions
    samples_pred_proba = model.predict(samples)
    
    # Label prediction using threshold
    samples_pred_labels = proba_to_labels(samples_pred_proba)
            
    samples_pred_labels_df = pd.DataFrame(samples_pred_labels)
    samples_pred_labels_df = samples_pred_labels_df.apply(lambda x: [GE_taxonomy[i] for i in range(len(x)) if x[i]==1], axis=1)
    
    #return list(samples_pred_labels_df)
    return pd.DataFrame({"Text":text_samples, "Emotions":list(samples_pred_labels_df)})

Let's try on few examples.

In [37]:
# Predict samples
predict_samples(["My favourite food is anything I didn't have to cook myself", "are you kiddin me ??!!", "red","I love to play football","If you did this again I will fight with you"], model, threshold_opt)



Unnamed: 0,Text,Emotions
0,My favourite food is anything I didn't have to...,"[joy, love]"
1,are you kiddin me ??!!,"[curiosity, surprise]"
2,red,[]
3,I love to play football,[love]
4,If you did this again I will fight with you,"[caring, optimism]"
