In [2]:
import pandas as pd
from pathlib import Path
import joblib
import re

# ML Libraries
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split

# Paths
datasets_folder = Path("../datasets/goemotions")
models_folder = Path("../models")


## Get the Classes

In [3]:
classes = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

## Load the Dataset

In [4]:
goemotions_1 = pd.read_csv(datasets_folder / "goemotions_1.csv")
goemotions_2 = pd.read_csv(datasets_folder / "goemotions_2.csv")
goemotions_3 = pd.read_csv(datasets_folder / "goemotions_3.csv")
print("GoEmotions 1: ", goemotions_1.shape)
print("GoEmotions 2: ", goemotions_2.shape)
print("GoEmotions 3: ", goemotions_3.shape)

# Concatenate all of the datasets
goemotions = pd.concat([goemotions_1, goemotions_2, goemotions_3])
print("- - - - - - - - - - - - - - - - - - -\nGoEmotions Concatenated: ", goemotions.shape)

GoEmotions 1:  (70000, 37)
GoEmotions 2:  (70000, 37)
GoEmotions 3:  (71225, 37)
- - - - - - - - - - - - - - - - - - -
GoEmotions Concatenated:  (211225, 37)


In [5]:
goemotions.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


## Extract the Test Data with a random_state of 23 for Keras Model

In [6]:
text = goemotions['text']
labels = goemotions[classes]

text_train, text_test, labels_train, labels_test = train_test_split(
    text, labels, test_size=0.20, random_state=23
)

In [7]:
test_data = pd.DataFrame()
test_data['text'] = text_test
test_data['emotion'] = labels_test[classes].idxmax(axis=1)
test_data = test_data.reset_index()
test_data = test_data.drop(columns=['index'])

In [8]:
# Function to remove unknown symbols from text
def remove_unknown_symbols(text):
    # Define a regular expression pattern to match characters that are not within the range of alphanumeric, space, and common punctuation characters
    pattern = re.compile(r'[^A-Za-z0-9\s.,?!\'"-]')
    # Replace unknown symbols with an empty string
    return re.sub(pattern, '', text)

In [9]:
test_data['text'] = test_data['text'].apply(remove_unknown_symbols)
test_data.head()

Unnamed: 0,text,emotion
0,"It's delicate, so it's clear for those flesh t...",neutral
1,I wanna die!,anger
2,Man I love how venezuela is a dictatorship whe...,admiration
3,Thank you. Ive been doing more research about ...,gratitude
4,I mean it changes that game at least because i...,excitement


## Load the Models

In [11]:
keras_model = keras.models.load_model(str(models_folder / 'emo_modelV2.keras'), compile=True)




In [12]:
with open(models_folder / 'emotion_classifier_pipe_lr.pkl', 'rb') as f:
    lr_model = joblib.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Define Model Testing Functions

In [13]:
def predictTopEmotionKeras(text):
  scores = keras_model(tf.constant([text]))['dense_1'][0]
  emotions = [{'label': label, 'score': score} for label, score in zip(classes, scores.numpy())]
  result = sorted(emotions, key=lambda x: x['score'], reverse=True)
  return result[0]['label']

In [14]:
predictTopEmotionKeras("I am sad")

'sadness'

In [15]:
def predictTopEmotionLR(text):
  emotions = [{'label': label, 'score': score} for label, score in zip(lr_model.classes_, lr_model.predict_proba([text])[0])]
  result = sorted(emotions, key=lambda x: x['score'], reverse=True)
  return result[0]['label']

In [16]:
predictTopEmotionLR("I am sad")

'sadness'

# Functional Testing

### Accuracy Test for the Keras Model

In [15]:
correct = 0
total = len(test_data)
for index, row in test_data.iterrows():
    predicted_emotion = predictTopEmotionKeras(row['text'])
    actual_emotion = row['emotion']
    if predicted_emotion == actual_emotion:
        correct += 1

accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 40.76%


### Accuracy Test for the LR Model

In [16]:
correct = 0
total = len(test_data)
for index, row in test_data.iterrows():
    predicted_emotion = predictTopEmotionLR(row['text'])
    actual_emotion = row['emotion']
    if predicted_emotion == actual_emotion:
        correct += 1

accuracy = correct / total
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 39.99%


## Input Limit Test

In [17]:
import random
import string

def generate_random_sentence(length):
    # Define the pool of characters to choose from
    pool = string.ascii_letters + string.digits + string.punctuation + ' '

    # Generate random characters to form the sentence
    sentence = ''.join(random.choice(pool) for _ in range(length))

    return sentence

In [22]:
ten_char = generate_random_sentence(10)
hundred_char = generate_random_sentence(100)
thousand_char = generate_random_sentence(1000)
ten_thousand_char = generate_random_sentence(10000)
hundred_thousand_char = generate_random_sentence(100000)
million_char = generate_random_sentence(1000000)

### Input Limit Test for the Keras Model

In [23]:
print("Ten Characters               :", f"{'Success' if predictTopEmotionKeras(ten_char) else 'Fail'}")
print("Hundred Characters           :", f"{'Success' if predictTopEmotionKeras(hundred_char) else 'Fail'}")
print("Thousand Characters          :", f"{'Success' if predictTopEmotionKeras(thousand_char) else 'Fail'}")
print("Ten Thousand Characters      :", f"{'Success' if predictTopEmotionKeras(ten_thousand_char) else 'Fail'}")
print("Hundred Thousand Characters  :", f"{'Success' if predictTopEmotionKeras(hundred_thousand_char) else 'Fail'}")
print("Million Characters           :", f"{'Success' if predictTopEmotionKeras(million_char) else 'Fail'}")

Ten Characters               : Success
Hundred Characters           : Success
Thousand Characters          : Success
Ten Thousand Characters      : Success
Hundred Thousand Characters  : Success
Million Characters           : Success


### Input Limit Test for the LR Model

In [24]:
print("Ten Characters               :", f"{'Success' if predictTopEmotionLR(ten_char) else 'Fail'}")
print("Hundred Characters           :", f"{'Success' if predictTopEmotionLR(hundred_char) else 'Fail'}")
print("Thousand Characters          :", f"{'Success' if predictTopEmotionLR(thousand_char) else 'Fail'}")
print("Ten Thousand Characters      :", f"{'Success' if predictTopEmotionLR(ten_thousand_char) else 'Fail'}")
print("Hundred Thousand Characters  :", f"{'Success' if predictTopEmotionLR(hundred_thousand_char) else 'Fail'}")
print("Million Characters           :", f"{'Success' if predictTopEmotionLR(million_char) else 'Fail'}")

Ten Characters               : Success
Hundred Characters           : Success
Thousand Characters          : Success
Ten Thousand Characters      : Success
Hundred Thousand Characters  : Success
Million Characters           : Success
