# EDA: GoEmotions data sets (merged ones)

In [1]:
from pathlib import Path
from typing import *

In [2]:
import sys

# Parent of the current working directory; presumably the repository root, so
# that the project-local packages imported further down can be resolved.
PROJ_ROOT = Path.cwd().parent

# Register the root at most once, so re-running this cell never piles up duplicates.
sys.path.extend([] if str(PROJ_ROOT) in sys.path else [str(PROJ_ROOT)])

In [3]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [4]:
import altair as alt


def adhoc_theme():
    """Build the Altair theme used throughout this notebook.

    Returns:
        dict: a freshly built ``{"config": {...}}`` mapping on every call. It sets
        the default view size (800 x 400), the font sizes of title, axes, header
        and legend, and the ``category10`` scheme for the categorical range.
    """

    def axis_fonts():
        # A new dict per use, so no two config sections share one object.
        return {"labelFontSize": 14, "titleFontSize": 16}

    config = {}
    config["view"] = {"height": 400, "width": 800}
    # NOTE(review): "titleAlign" is reproduced exactly as written; confirm that it
    # is a key the title config actually honours.
    config["title"] = {"fontSize": 24, "fontWeight": "normal", "titleAlign": "center"}
    config["axisLeft"] = axis_fonts()
    config["axisRight"] = axis_fonts()
    config["header"] = {**axis_fonts(), "titleAlign": "left"}
    config["axisBottom"] = axis_fonts()
    config["legend"] = {"labelFontSize": 12, "titleFontSize": 14}
    config["range"] = {"category": {"scheme": "category10"}}
    return {"config": config}


# Register the theme factory under a name and enable it, so that charts
# rendered after this cell pick up the config returned by adhoc_theme().
alt.themes.register("adhoc_theme", adhoc_theme)
alt.themes.enable("adhoc_theme")

# Enable the "default" data transformer with max_rows=30000.
alt.data_transformers.enable("default", max_rows=30000)
## needed for large data set

DataTransformerRegistry.enable('default')

## Data preprocessing

In [5]:
from lib.preprocessing import Preprocessor

# Project-local helper object used by the EDA and data-loading cells below.
# NOTE(review): the keyword is spelled `with_lemmtatization` (sic); presumably it
# mirrors the parameter name declared in lib.preprocessing - confirm before renaming.
preprocessor = Preprocessor(with_lemmtatization=False)

In [6]:
# Show three rows of the training frame (fixed seed, so the same rows on every
# run) with the column-width limit lifted so that long texts are not truncated.
with pd.option_context("display.max_colwidth", None):
    display(preprocessor.df_train.sample(n=3, random_state=1))

LOADED: 43410 x 29


Unnamed: 0_level_0,text,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
efh3oek,I know the feeling.,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
eczx96b,You seem slightly sensitive... I was actually thinking we had been infiltrated by the normies,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
eekor8l,"Instant fucking turn off, ugh. Fuck us for wanting equality across the spectrum",0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Table with the columns n_label, n_text and pct_text (see the recorded output);
# presumably the number of texts per count of assigned labels.
preprocessor.count_emotions()

Unnamed: 0,n_label,n_text,pct_text
0,1,36308,83.639714
1,2,6541,15.067957
2,3,532,1.225524
3,4,28,0.064501
4,5,1,0.002304


In [8]:
# Judging by the method name, a bar chart of the share of each label
# (the chart output is not part of this export).
preprocessor.bar_chart_label_proportion()

In [9]:
%%time

# Timed because it is comparatively slow: about 4 s wall time in the recorded run.
preprocessor.bar_chart_count_docs_by_length()

CPU times: user 4.24 s, sys: 7.65 ms, total: 4.24 s
Wall time: 4.24 s


In [10]:
# Judging by the method name, a heatmap of the correlation between labels
# (the chart output is not part of this export).
preprocessor.heatmap_label_correlation()

### Signal words

Here, a signal word is a text token whose presence increases the proportion of a specific label. In other words, it is a token with a high conditional probability `P(label|token)`.

In [11]:
%%time

# Disabled on purpose: the author notes that this call takes around 3 minutes.
# Uncomment the next line to run it.
# preprocessor.histogram_positive_rate()  ## around 3 min

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.34 µs


In [12]:
%%time

# Disabled in the recorded run; the reason is not stated (presumably it is slow,
# like the disabled cell before it - TODO confirm). Uncomment the next line to run it.
# preprocessor.bar_chart_of_top5_signal_words()

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11.4 µs


## Baseline model with TFIDF

In [13]:
# Training inputs and targets. In the recorded run X_train has shape (43410,)
# and Y_train has shape (43410, 28), i.e. one target column per label.
X_train, Y_train = preprocessor.get_train_X_and_Y()
X_train.shape, Y_train.shape

((43410,), (43410, 28))

In [14]:
from emo_classifier.model import TfidfClassifier

# The classifier reuses the preprocessor's tokenizer.
# NOTE(review): min_df=43 is about 0.1% of the 43410 training texts; presumably a
# minimum document frequency chosen for that reason - confirm and name the constant.
tfidf_classifier = TfidfClassifier(tokenizer=preprocessor.tokenizer, min_df=43)

In [15]:
from sklearn.linear_model import LogisticRegression

# Candidate values for the logistic-regression parameter C; the recorded fit
# output reports one row of scores per value.
param_grid = {"C": [0.1, 1.0, 10, 100]}
# Attach the estimator under the name "plr" together with its parameter grid.
# The estimator is seeded (random_state=0), uses the liblinear solver and fits
# no intercept.
tfidf_classifier.set_model(
    "plr", LogisticRegression(random_state=0, solver="liblinear", fit_intercept=False), param_grid
)

In [16]:
%%time

# Fit on the training data. In the recorded run this took about 42 s wall time
# and produced a table of test/train scores per value of C (best rank: C=10).
tfidf_classifier.fit(X_train, Y_train)

CPU times: user 6.7 s, sys: 192 ms, total: 6.89 s
Wall time: 42.5 s


Unnamed: 0,rank_test_score,mean_test_score,std_test_score,mean_train_score,mean_fit_time,param_estimator__C
2,1,0.287604,0.005908,0.320848,14.692958,10.0
3,2,0.285071,0.004004,0.338429,18.27831,100.0
1,3,0.26602,0.005101,0.282883,6.686909,1.0
0,4,0.200151,0.002492,0.205471,3.75731,0.1


In [17]:
%%time

# Predictions for the training texts themselves; shape (43410, 28) in the
# recorded run, i.e. one column per label.
Y_hat = tfidf_classifier.predict_proba(X_train)
Y_hat.shape

CPU times: user 2.65 s, sys: 3.75 ms, total: 2.66 s
Wall time: 2.65 s


(43410, 28)

In [18]:
idx2label = preprocessor.emotions

# Small-multiple histograms of the predictions on the training set, one facet per
# emotion. Only 5000 rows are plotted; the sample is seeded so that the figure is
# identical on every Restart & Run All (same seed as the sample shown earlier).
(
    alt.Chart(
        pd.DataFrame(Y_hat, columns=idx2label, index=preprocessor.df_train.index).sample(
            n=5000, random_state=1
        )
    )
    # Wide -> long: each label column becomes (emotion, probability) rows.
    .transform_fold(fold=idx2label, as_=["emotion", "probability"])
    .mark_bar()
    .encode(
        x=alt.X("probability:Q", bin=alt.Bin(maxbins=30), axis=alt.Axis(format="%"), title=None),
        y=alt.Y("count():Q", title=None),
        facet=alt.Facet("emotion:N", columns=6),
    )
    .properties(width=120, height=100, title="Histogram of predictions")
)

In [19]:
# Persist the fitted classifier; the recorded run wrote TfidfClassifier.joblib
# into the emo_classifier/data directory.
tfidf_classifier.save()

SAVED: /home/stdiff/Entwicklung/Python/68-emo-classifier/emo_classifier/data/TfidfClassifier.joblib


## Evaluation of the model on the dev set

In [20]:
# Rebind the name to a loaded instance; presumably the artefact saved above, so
# that the evaluation below exercises the persisted model - confirm against load().
tfidf_classifier = TfidfClassifier.load()

LOADED: TfidfClassifier instance


In [21]:
%%time

# Dev-set inputs/targets and the model's predictions for them. The recorded run
# printed "LOADED: 5426 x 29" while executing this cell.
X_dev, Y_dev = preprocessor.get_dev_X_and_Y()
Y_hat_dev = tfidf_classifier.predict_proba(X_dev)

LOADED: 5426 x 29
CPU times: user 832 ms, sys: 0 ns, total: 832 ms
Wall time: 840 ms


In [22]:
from lib.evaluation import PredictionOnDevSetEvaluator

# Evaluator over the dev set: true labels, predictions and the raw texts.
evaluator_dev = PredictionOnDevSetEvaluator(Y_true=Y_dev, Y_prob=Y_hat_dev, X_text=preprocessor.df_dev["text"])
# Thresholds obtained from the dev-set evaluator. They are saved (the recorded
# run wrote Thresholds.json) and passed to the test-set evaluator further down.
thresholds = evaluator_dev.thresholds()
thresholds.save()

SAVED: /home/stdiff/Entwicklung/Python/68-emo-classifier/emo_classifier/data/Thresholds.json


In [23]:
# Macro F1 score on the dev set (about 0.335 in the recorded run).
evaluator_dev.macro_f1_score()

0.3348767073014497

In [24]:
# Per-label threshold table (threshold, precision, recall, f1_score, ...),
# ordered from the best to the worst F1 score.
evaluator_dev.best_thresholds.sort_values(by="f1_score", ascending=False)

Unnamed: 0,label,threshold,precision,recall,f1_score,positive_rate,actual_positive_rate
15,gratitude,0.543539,0.930599,0.824022,0.874074,0.058238,0.065979
1,amusement,0.506422,0.772926,0.584158,0.665414,0.04202,0.055842
20,neutral,0.303547,0.511548,0.840317,0.635955,0.01843,0.038518
18,love,0.536678,0.718593,0.56746,0.634146,0.036491,0.046443
0,admiration,0.534125,0.732026,0.459016,0.564232,0.056211,0.089937
14,fear,0.530732,0.6875,0.366667,0.478261,0.008662,0.016587
21,optimism,0.502713,0.722772,0.349282,0.470968,0.000553,0.002764
25,remorse,0.53524,0.709677,0.323529,0.444444,0.012532,0.026355
22,pride,0.573901,1.0,0.266667,0.421053,0.003686,0.023406
26,sadness,0.527744,0.608696,0.293706,0.396226,0.009031,0.023774


In [25]:
# Metrics scatter plot of the dev-set evaluator (chart output not part of this export).
evaluator_dev.metrics_scatter_plot()

In [26]:
# Positive-rate scatter plot of the dev-set evaluator (chart output not part of this export).
evaluator_dev.positive_rate_scatter_plot()

In [27]:
# False-positive examples ordered by label, shown with untruncated text.
# NOTE(review): head(18) presumably corresponds to 3 examples for each of the
# first 6 labels (the recorded output lists 3 rows per label) - confirm.
with pd.option_context("display.max_colwidth", None):
    display(evaluator_dev.false_positive_by_label().sort_values(by="label").head(18))

Unnamed: 0_level_0,probability,label,text,true_labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ef3huhl,0.994957,admiration,"Georgetown - great food, great shops/ the ""exorcist"" steps are located there, too. the exorcist steps",neutral
eepgcla,0.997405,admiration,That really is fantastic,excitement
ee6qx9u,0.999323,admiration,This is beautiful and heartbreaking :(,sadness
eczvsr7,0.999426,amusement,"No, this sub adores puig lol",neutral
ef1l5oh,0.999999,amusement,"It’s neither. It’s an acrostic, lol.",neutral
ef0v8ga,0.998464,amusement,Lol! Exactly. ☹,approval
edp0uwr,0.961293,anger,I'd rather be bitter and angry than miserable.,neutral
edu6hca,0.978501,anger,For fuck sake... just go and remind them it's the quiet train!,annoyance
ed5bb5g,0.955163,anger,Oil barons hate him!,"annoyance,disapproval"
eevz90x,0.904581,annoyance,This is full of bullshit,anger


In [28]:
# False-negative examples ordered by label, shown with untruncated text.
# NOTE(review): head(18) presumably corresponds to 3 examples for each of the
# first 6 labels (the recorded output lists 3 rows per label) - confirm.
with pd.option_context("display.max_colwidth", None):
    display(evaluator_dev.false_negative_by_label().sort_values(by="label").head(18))

Unnamed: 0_level_0,probability,label,text,true_labels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
eed6tft,0.002895,admiration,Nukeville sounds like a nicer neighborhood. Like no broken and burnt houses and like children playing in the street and people having barbecues,admiration
ed412mo,0.002601,admiration,"Yeah people that take money from their children don't just look at their Bank account and think ""Wow a 4000 dollar deposit magically happened.""",admiration
eda2ojj,0.005235,admiration,Sick burn. I like it.,admiration
eewao3i,0.002276,amusement,A thousand times this! I just found some of my old childhood stuff from the 80s with a 2 bar and it is fucking awesome.,"amusement,joy"
edcqbz0,0.00192,amusement,so true cuz i’m pretty miserable but my instagram life looks amazing,amusement
eej85k7,0.00172,amusement,I love it when people barge into my house and demand to know why I don't like them.,amusement
edrhoxh,0.002255,anger,"The FDA has plenty to criticize. But like here, it's usually criticized horribly off base. It needs to grow some balls and actually enforce things.","anger,annoyance"
ef8scux,0.002446,anger,I'm honestly so pissed off for you. That whole situation is insane. Good luck moving on. No one deserves to be treated like that.,"anger,gratitude"
ee1cn14,0.001603,anger,"Ha, who cares about them supporters...the most important thing for society is that justice is served and the man jailed! Scream all you want.","anger,disapproval"
eda8pj6,0.002231,annoyance,You cant love this girl if she already has a GF whom she sounds like she loves. You'll just ruin a relationship,annoyance


## Model evaluation on the test set

In [29]:
%%time

# Test-set inputs/targets and the model's predictions for them. The recorded run
# printed "LOADED: 5427 x 29" while executing this cell.
X_test, Y_test = preprocessor.get_test_X_and_Y()
Y_hat_test = tfidf_classifier.predict_proba(X_test)

LOADED: 5427 x 29
CPU times: user 776 ms, sys: 0 ns, total: 776 ms
Wall time: 771 ms


In [30]:
from lib.evaluation import PredictionOnTestSetEvaluator

# Evaluator over the test set. It receives the `thresholds` object obtained from
# the dev-set evaluator, so no threshold is chosen on the test data in this cell.
evaluator_test = PredictionOnTestSetEvaluator(Y_true=Y_test, Y_prob=Y_hat_test, thresholds=thresholds)

In [31]:
# Macro F1 score on the test set (about 0.296 in the recorded run).
# `evaluator_test` is the evaluator built in the previous cell; the import and the
# construction are not repeated here because they were an exact copy of that cell.
# NOTE(review): the recorded run emitted an sklearn `_warn_prf` warning at this
# point; presumably at least one label has an undefined precision/recall on the
# test set (e.g. no positive predictions) - worth checking per label.
evaluator_test.macro_f1_score()

  _warn_prf(average, modifier, msg_start, len(result))


0.2961275045626222

In [32]:
# Metrics scatter plot of the test-set evaluator (chart output not part of this export).
evaluator_test.metrics_scatter_plot()

In [33]:
# Positive-rate scatter plot of the test-set evaluator (chart output not part of this export).
evaluator_test.positive_rate_scatter_plot()

## Environment

In [34]:
# Record interpreter, machine and package versions for reproducibility.
# NOTE(review): matplotlib and seaborn are reported as "not installed" in the
# recorded output and are not imported in the cells shown here; consider
# dropping them from the -p list.
%load_ext watermark
%watermark -v -n -m -p numpy,scipy,sklearn,pandas,matplotlib,seaborn,altair,torch

Python implementation: CPython
Python version       : 3.9.1
IPython version      : 7.30.0

numpy     : 1.21.4
scipy     : 1.7.3
sklearn   : 0.24.2
pandas    : 1.3.4
matplotlib: not installed
seaborn   : not installed
altair    : 4.1.0
torch     : 1.9.1

Compiler    : GCC 7.3.0
OS          : Linux
Release     : 5.3.18-59.34-default
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit

