In [1]:
import pandas as pd
from pathlib import Path
import re

# ML Libraries
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast, TFRobertaForSequenceClassification, pipeline

# Datasets Folder
# Relative path (resolved against the notebook's working directory) to the
# directory from which the GoEmotions CSV parts are read below.
datasets_folder = Path("../datasets/goemotions")

## Get the Classes

In [2]:
# Emotion label names. They are used further down both to select the label
# columns of the dataset and as the candidate values for the `emotion` column,
# so the order here is significant.
classes = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral',
]

## Load the Dataset

In [3]:
# Read the three GoEmotions CSV parts and report the shape of each one.
goemotions_parts = [
    pd.read_csv(datasets_folder / f"goemotions_{part_number}.csv")
    for part_number in (1, 2, 3)
]
# Keep the per-part names bound for any cell that still refers to them.
goemotions_1, goemotions_2, goemotions_3 = goemotions_parts
for part_number, part in enumerate(goemotions_parts, start=1):
    print(f"GoEmotions {part_number}: ", part.shape)

# Concatenate all of the datasets.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every part keeps its own row labels, so the same label occurs once per part
# and label-based lookups/alignment on `goemotions` become ambiguous.
goemotions = pd.concat(goemotions_parts, ignore_index=True)
print("- - - - - - - - - - - - - - - - - - -\nGoEmotions Concatenated: ", goemotions.shape)

GoEmotions 1:  (70000, 37)
GoEmotions 2:  (70000, 37)
GoEmotions 3:  (71225, 37)
- - - - - - - - - - - - - - - - - - -
GoEmotions Concatenated:  (211225, 37)


## Extract the Test Data with a random_state of 23 for Keras Model

In [4]:
# Split parameters are fixed so the held-out rows are reproducible
# (the section heading ties random_state 23 to the Keras model's split).
TEST_FRACTION = 0.20
SPLIT_SEED = 23

# Inputs are the raw text column; targets are the per-emotion label columns.
text, labels = goemotions['text'], goemotions[classes]

text_train, text_test, labels_train, labels_test = train_test_split(
    text,
    labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [5]:
# Build the evaluation frame: one row per held-out sentence, with a single
# `emotion` label taken as the name of the label column holding the row maximum.
# NOTE(review): idxmax reports the first column that reaches the maximum, so a
# row with several labels set (or none at all) resolves to whichever of those
# columns comes first in `classes` -- confirm that this is the intended labelling.
test_data = (
    text_test.to_frame(name='text')
    .assign(emotion=labels_test[classes].idxmax(axis=1))
    .reset_index(drop=True)  # discard the original row labels, renumber from 0
)

In [6]:
# Matches any single character that is NOT an ASCII letter, a digit, whitespace,
# or one of the punctuation marks . , ? ! ' " -
_UNKNOWN_SYMBOLS = re.compile(r'[^A-Za-z0-9\s.,?!\'"-]')


def remove_unknown_symbols(text):
    """Return `text` with every character outside the allowed set deleted.

    Allowed characters are ASCII letters, digits, whitespace and the
    punctuation marks ``. , ? ! ' " -``; everything else (accented letters,
    emoji, other symbols) is dropped without a replacement character.
    """
    return _UNKNOWN_SYMBOLS.sub('', text)

In [7]:
# Strip the disallowed characters from every test sentence, then preview the
# first rows (the trailing expression is what the cell displays).
cleaned_text = test_data['text'].map(remove_unknown_symbols)
test_data['text'] = cleaned_text
test_data.head()

Unnamed: 0,text,emotion
0,"It's delicate, so it's clear for those flesh t...",neutral
1,I wanna die!,anger
2,Man I love how venezuela is a dictatorship whe...,admiration
3,Thank you. Ive been doing more research about ...,gratitude
4,I mean it changes that game at least because i...,excitement


## Load EmoRoBERTa

In [8]:
# Hub identifier of the checkpoint used for both the tokenizer and the model.
EMOROBERTA_CHECKPOINT = "arpanghoshal/EmoRoBERTa"

# Load the tokenizer and the model once ...
tokenizer = RobertaTokenizerFast.from_pretrained(EMOROBERTA_CHECKPOINT)
model = TFRobertaForSequenceClassification.from_pretrained(EMOROBERTA_CHECKPOINT)

# ... and hand those same objects to the pipeline. Passing the checkpoint name
# here instead makes the pipeline load the model a second time and use a
# tokenizer instance separate from `tokenizer`.
# return_all_scores=True makes the pipeline report a score for every label
# rather than only the best one.
emotion = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


## Define Model Testing Functions

In [9]:
def predictTopEmotionEmo(text, max_length=512):
    """Return the emotion label that the `emotion` pipeline scores highest for `text`.

    Args:
        text: The sentence to classify.
        max_length: Maximum number of tokens fed to the model; longer inputs
            are truncated by the tokenizer inside the pipeline call.

    Returns:
        The `label` of the highest-scoring entry returned by the pipeline.
    """
    # Truncate by *tokens* inside the pipeline. Slicing the string to a number
    # of characters does not bound the token count, and tokenizing the whole
    # input first only to measure its length is wasted work on long inputs.
    scores = emotion(text, truncation=True, max_length=max_length)[0]
    # The [0] above takes the score list for the single input that was passed.
    # max() returns the first entry with the highest score, which is the same
    # entry a stable descending sort would put first.
    return max(scores, key=lambda entry: entry['score'])['label']

In [10]:
# Quick sanity check of the prediction helper on a short sentence; as the
# cell's last expression, the returned label is displayed as the cell output.
predictTopEmotionEmo("I am sad")

'sadness'

# Functional Testing

### Accuracy Test for the EmoRoBERTa Model

In [None]:
# Classify every test sentence (in row order) and count the predictions that
# equal the reference label stored in the `emotion` column.
total = len(test_data)
correct = sum(
    predictTopEmotionEmo(sample_text) == expected_label
    for sample_text, expected_label in zip(test_data['text'], test_data['emotion'])
)

# Share of exact matches, reported as a percentage with two decimals.
accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 47.39%


### Input Limit Test

In [22]:
import random
import string

def generate_random_sentence(length):
    """Return a string of `length` characters picked independently at random.

    Each character is drawn with `random.choice` from ASCII letters, digits,
    punctuation and the space character, so the result depends on the state
    of the module-level `random` generator.
    """
    alphabet = "".join((string.ascii_letters, string.digits, string.punctuation, ' '))

    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))

    return ''.join(picked)

In [31]:
# Random inputs of 10, 100, 1,000, 10,000, 100,000 and 1,000,000 characters
# (10**1 through 10**6), generated from the shortest to the longest.
(
    ten_char,
    hundred_char,
    thousand_char,
    ten_thousand_char,
    hundred_thousand_char,
    million_char,
) = (generate_random_sentence(10 ** exponent) for exponent in range(1, 7))

In [32]:
# Run the model on inputs of increasing size and report, per size, whether a
# prediction came back. The helper either returns a label or raises, so an
# exception is what a failure looks like here: catch it per case and report
# 'Fail' instead of letting the first error abort the remaining sizes.
limit_cases = [
    ("Ten Characters", ten_char),
    ("Hundred Characters", hundred_char),
    ("Thousand Characters", thousand_char),
    ("Ten Thousand Characters", ten_thousand_char),
    ("Hundred Thousand Characters", hundred_thousand_char),
    ("Million Characters", million_char),
]

for case_name, case_text in limit_cases:
    try:
        outcome = 'Success' if predictTopEmotionEmo(case_text) else 'Fail'
    except Exception as error:
        # Name the exception type so the report says why the size failed.
        outcome = f'Fail ({type(error).__name__})'
    # Pad the name to 29 columns so the colons line up across all rows.
    print(f"{case_name:<29}:", outcome)

Ten Characters               : Success
Hundred Characters           : Success
Thousand Characters          : Success
Ten Thousand Characters      : Success
Hundred Thousand Characters  : Success
Million Characters           : Success
