In [1]:
from pathlib import Path
import urllib.request
import pandas as pd

# HuggingFace
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline

# Folder that holds the GoEmotions CSV files (relative path)
datasets_folder = Path("datasets") / "goemotions"

# Get Emotion Classes

In [2]:
# Download the GoEmotions label list and split it into one class name per line
emotions_url = ('https://raw.githubusercontent.com/google-research/google-research'
                '/master/goemotions/data/emotions.txt')
# The context manager closes the HTTP response; the timeout keeps a dead
# connection from hanging the notebook indefinitely.
with urllib.request.urlopen(emotions_url, timeout=30) as response:
    raw_labels = response.read().decode('utf8')
# splitlines() plus the blank filter avoids an empty class name if the file
# ends with a newline.
classes = [line.strip() for line in raw_labels.splitlines() if line.strip()]
num_classes = len(classes)
print(f"Number of Classes: {num_classes}")
print(classes)

Number of Classes: 28
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


# Loading the Dataset

In [3]:
# Load the three GoEmotions CSV files
goemotions_1 = pd.read_csv(datasets_folder / "goemotions_1.csv")
goemotions_2 = pd.read_csv(datasets_folder / "goemotions_2.csv")
goemotions_3 = pd.read_csv(datasets_folder / "goemotions_3.csv")
for shard_number, shard in enumerate((goemotions_1, goemotions_2, goemotions_3), start=1):
    print(f"GoEmotions {shard_number}: ", shard.shape)

# Concatenate all of the datasets.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every file keeps its own row labels, so the same label appears once per file
# and label-based lookups (e.g. .loc[0]) return several rows.
goemotions = pd.concat([goemotions_1, goemotions_2, goemotions_3], ignore_index=True)
print("- - - - - - - - - - - - - - - - - - -\nGoEmotions Concatenated: ", goemotions.shape)

GoEmotions 1:  (70000, 37)
GoEmotions 2:  (70000, 37)
GoEmotions 3:  (71225, 37)
- - - - - - - - - - - - - - - - - - -
GoEmotions Concatenated:  (211225, 37)


In [4]:
# Preview the first five rows of the combined dataset
goemotions.head(n=5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


# Instantiate EmoRoBERTa Tokenizer, Model, and Pipeline

In [5]:
# Get the EmoRoBERTa from HuggingFace
EMOROBERTA_CHECKPOINT = "arpanghoshal/EmoRoBERTa"
tokenizer = RobertaTokenizerFast.from_pretrained(EMOROBERTA_CHECKPOINT)
model = TFRobertaForSequenceClassification.from_pretrained(EMOROBERTA_CHECKPOINT)
# Hand the already-loaded model and tokenizer to the pipeline; passing the
# checkpoint name instead makes the pipeline load the weights a second time.
# top_k=None asks the pipeline to return a score for every label.
emotion = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, top_k=None)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


# Try the Pretrained Model EmoRoBERTa

In [6]:
import re
from collections import defaultdict

# Sample multi-paragraph input (a love letter) used to try out predict_emotions below
texts = """
My Dearest [Partner's Name],

As I sit down to write this letter, my heart is overflowing with the love that I hold for you. Words alone cannot express the depth of my feelings, but I hope that these humble words can convey a fraction of the love that resides within me.

From the moment I met you, my life has been filled with an abundance of joy and happiness. Your presence brings light into my world, and your laughter is like music to my ears. Every moment spent with you is a treasure, and I am grateful for each second we share together.

Your kindness, compassion, and unwavering support have touched my soul in ways I never thought possible. You are my rock, my confidant, and my greatest source of strength. In your embrace, I find solace and comfort, knowing that I am loved unconditionally.

With each passing day, my love for you grows stronger and deeper. You are the one I want to share my hopes, dreams, and aspirations with. You are the one I want to build a future with, filled with laughter, adventure, and endless love.

As I pen these words, I want you to know that you are cherished beyond measure. You are the most beautiful soul I have ever known, and I am endlessly grateful to have you in my life.

I love you more than words can say, and I will spend the rest of my days showing you just how much you mean to me.

Forever and always,
[Your Name]


"""

def predict_emotions(texts):
    """Sum the per-label emotion scores over every segment of ``texts``.

    The input is split on ``.``, ``!``, ``?``, ``;`` and newlines. Each
    non-blank segment is scored on its own with the notebook-level ``emotion``
    pipeline, and the scores of each label are added up across segments, so a
    total can exceed 1.0.

    Args:
        texts: A string holding one or more sentences.

    Returns:
        ``{"predictions": [{"label": ..., "score": ...}, ...]}`` ordered by
        summed score, highest first. The list is empty when ``texts`` has no
        non-blank segment.
    """
    # Split the huge chunk of text into stripped, non-empty segments
    stripped_parts = (part.strip() for part in re.split(r'[.!?;\n]', texts))
    segments = [part for part in stripped_parts if part]

    # Aggregate the score of each label across all segments
    totals = defaultdict(float)
    for segment in segments:
        # truncation=True cuts an over-long segment down to the tokenizer's
        # maximum length instead of letting the model fail on it; segments
        # that already fit are scored exactly as before.
        for entry in emotion(segment, truncation=True)[0]:
            totals[entry['label']] += entry['score']

    # Highest total first; ties keep the order in which labels were first seen
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    return {"predictions": [{'label': label, 'score': score} for label, score in ranked]}

# Score the sample letter; per-label scores are summed over all of its segments
predict_emotions(texts)


{'predictions': [{'label': 'love', 'score': 6.231944722414482},
  {'label': 'admiration', 'score': 2.246449695812771},
  {'label': 'joy', 'score': 2.210572298936313},
  {'label': 'neutral', 'score': 2.0135865128822843},
  {'label': 'desire', 'score': 1.2290927699887106},
  {'label': 'optimism', 'score': 0.9187107055731758},
  {'label': 'gratitude', 'score': 0.833778717849782},
  {'label': 'approval', 'score': 0.4793453911697725},
  {'label': 'caring', 'score': 0.24826980605575955},
  {'label': 'amusement', 'score': 0.15367175083520124},
  {'label': 'pride', 'score': 0.14663361136626918},
  {'label': 'grief', 'score': 0.062015252020501066},
  {'label': 'embarrassment', 'score': 0.04561865358846262},
  {'label': 'disgust', 'score': 0.024121032478433335},
  {'label': 'remorse', 'score': 0.022114704694104148},
  {'label': 'anger', 'score': 0.021033011355029885},
  {'label': 'disapproval', 'score': 0.019088391250988934},
  {'label': 'relief', 'score': 0.017980166965571698},
  {'label': 'exc

In [7]:
def predict_emotions(text):
    """Report the highest-scoring emotion for ``text`` as a display string.

    NOTE: this reuses the name ``predict_emotions`` and therefore replaces the
    aggregate version defined in the earlier cell.

    Args:
        text: The text passed to the notebook-level ``emotion`` pipeline.

    Returns:
        ``{"prediction": "Emotion: <label> (<percent>)%"}`` where ``percent``
        is the top score times 100, rounded to one decimal place.
    """
    # The pipeline result is indexed with [0] to get the list of label/score dicts
    ranked = list(emotion(text)[0])
    ranked.sort(key=lambda entry: entry['score'], reverse=True)
    top = ranked[0]
    predicted_emotion = top['label']
    probability = round(top['score'] * 100, 1)
    return {"prediction":f"Emotion: {predicted_emotion} ({probability})%"}

# Quick single-sentence check of the top-emotion variant defined above
predict_emotions("I am sorry for your loss")

{'prediction': 'Emotion: grief (76.9)%'}

In [8]:
# Probe the input-length limit with 1020 repeated characters.
# String repetition replaces the character-by-character concatenation loop.
teks = "d" * 1020
print(teks)
predict_emotions(teks)

dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd

{'prediction': 'Emotion: neutral (89.8)%'}

#### Based on this, the limit is 512 tokens rather than a fixed number of characters: 1020 repeated "d" characters still fit, while 1021 produce 513 tokens and fail

In [9]:
# One character more than the 1020-character probe: 1021 repeated characters.
# String repetition replaces the character-by-character concatenation loop.
teks = "d" * 1021
print(teks)
# This input is expected to be rejected by the model. The failure is the point
# of the cell, so it is caught and printed instead of halting a Run All.
try:
    print(predict_emotions(teks))
except Exception as exc:
    print(f"{type(exc).__name__}: {exc}")

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd

InvalidArgumentError: Exception encountered when calling layer 'embeddings' (type TFRobertaEmbeddings).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[0,512] = 514 is not in [0, 514) [Op:ResourceGather] name: 

Call arguments received by layer 'embeddings' (type TFRobertaEmbeddings):
  • input_ids=tf.Tensor(shape=(1, 513), dtype=int32)
  • position_ids=None
  • token_type_ids=tf.Tensor(shape=(1, 513), dtype=int32)
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False