### Multi-Class Text Classification for Emotions using BERT

In [None]:
# ! pip install numpy pandas scikit-learn ipykernel jupyter matplotlib seaborn evaluate 'transformers[torch]' tqdm datasets huggingface_hub ipywidgets torch xformers plotnine

In [None]:
import numpy as np
import pandas as pd
import random

# We need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from preprocessing.preprocessors import *
from training.bert_func import *

from sklearn.metrics import classification_report

#### Get the data

In [None]:
# Read the raw GoEmotions CSV, then pass it through the project's cleaning helper.
df = pd.read_csv(filepath_or_buffer="../data/GoEmotions.csv")
df_clean = clean_df(df)  # clean_df comes from preprocessing.preprocessors

In [None]:
# Report the dimensions of the cleaned frame and preview its first three rows.
r, c = df_clean.shape[0], df_clean.shape[1]
print(f"The data has {r} row and {c} columns")
df_clean.head(n=3)

In [None]:
# Reshape the cleaned data with the project helper create_pivoted_df
# (presumably one row per text/label pair -- TODO confirm in preprocessing.preprocessors).
pivoted_df = create_pivoted_df(df_clean)
# Attach the coarser label columns; later cells read level0..level3 and plutchik
# from the resulting frame.
hierarchical_df = add_hierarchical_levels(pivoted_df)

In [None]:
# Report the dimensions of the frame with hierarchy columns and preview three rows.
r, c = hierarchical_df.shape[0], hierarchical_df.shape[1]
print(f"The data has {r} row and {c} columns")
hierarchical_df.head(n=3)

In [None]:
# Texts labelled by several raters can carry more than one label; reduce them to a
# single gold-standard label per text via the project's majority-vote helper.
majority_vote_df = majority_voted_df(hierarchical_df)
r, c = majority_vote_df.shape[0], majority_vote_df.shape[1]
print(f"The majority voted data has {r} row and {c} columns")

# Inner join: keep only the hierarchical rows whose (id, level0) pair won the vote.
clustered_df = pd.merge(
    hierarchical_df,
    majority_vote_df,
    how='inner',
    on=['id', 'level0'],
)

majority_vote_df.head(n=3)

In [None]:
# Report the dimensions of the joined frame and preview three rows.
r, c = clustered_df.shape[0], clustered_df.shape[1]
print(f"The data has {r} row and {c} columns")
clustered_df.head(n=3)

In [None]:
# One row per text: for every 'id' keep its first occurrence and drop the repeats.
distinct_df = clustered_df[~clustered_df.duplicated(subset='id', keep='first')]
r, c = distinct_df.shape[0], distinct_df.shape[1]
print(f"The data has {r} row and {c} columns")
distinct_df

In [None]:
# create a sample for tests
# distinct_df = distinct_df.sample(n=1000, replace=False, random_state=123)

In [None]:
"""
#Classifier
from huggingface_hub import notebook_login
notebook_login()
"""

In [None]:
# Output locations handed to get_bert in every section below.
results_dir = "../results/bert_base_cased/"
models_dir = "../models/bert_base_cased/"
# Checkpoint name handed to get_bert as the model to use.
bert = "bert-base-cased"

### BERT for level 0 -> 27 + 1 emotions
Following: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
# Show full cell contents and up to 50 rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 50

# Pick one random id whose level0 label is "sadness" and display its row.
sadness_example = random.sample(
    list(distinct_df.loc[distinct_df["level0"] == "sadness", "id"]),
    k=1,
)
distinct_df.query('id==@sadness_example')

In [None]:
# Count the rows per level0 label to see how balanced the classes are.
classCounts_0 = distinct_df["level0"].value_counts()
print(classCounts_0)
# -> the classes are not balanced

In [None]:
# Number of distinct texts available for the level0 model.
numberOfDocuments_0 = distinct_df.shape[0]
numberOfDocuments_0

In [None]:
# Run the project helper get_bert (training.bert_func) with "level0" as the label column.
# Presumably this trains/evaluates the classifier and writes under models_dir and
# results_dir -- TODO confirm in training.bert_func. The five returned objects are
# consumed by the evaluation cells below.
dataset_0, results_0, tokenized_testing_data_0, testing_data_0, label2id_0 = get_bert(distinct_df, "level0", bert, models_dir, results_dir)

In [None]:
# Count the rows per level0 label in the test split returned by get_bert.
classCounts_0 = pd.DataFrame(testing_data_0)["level0"].value_counts()
print(classCounts_0)
# -> the classes are not balanced

#### Evaluation

In [None]:
# Turn the get_bert results into a frame and attach the ids of dataset_0 by position:
# resetting the index lines both frames up on 0..n-1 before the column assignment.
df_results_0 = pd.DataFrame.from_dict(results_0)
df_id_0 = pd.DataFrame(dataset_0["id"]).reset_index()
df_results_0["id"] = df_id_0["id"]
df_results_0

In [None]:
# Left-join the predictions onto dataset_0 by id, so each row holds the gold standard
# next to the model output. 'label_y' is the suffixed right-hand column of the join
# (presumably the predicted label from df_results_0 -- verify against get_bert).
data_classifies_0 = dataset_0.merge(df_results_0, how='left', on='id').rename(
    columns={'label_y': 'LABEL_pred'}
)
# Numeric id of each predicted label, looked up in label2id_0.
data_classifies_0["LABEL_pred_num"] = data_classifies_0["LABEL_pred"].map(label2id_0.get)
data_classifies_0.to_pickle(results_dir + "data_classified_level0.pkl")
data_classifies_0

In [None]:
# Keep only the classified rows whose id belongs to the test split.
# Series.isin replaces the former f-string query, which pasted the repr of the whole
# id collection into the expression: that only parses when the repr is a valid list
# literal and breaks on ids containing quotes.
test_data_0 = data_classifies_0[
    data_classifies_0["id"].isin(list(tokenized_testing_data_0["id"]))
]
test_data_0

In [None]:
# Dimensions of the level0 test split returned by get_bert.
r, c = pd.DataFrame(data=testing_data_0).shape
print(f"The test data has {r} row and {c} columns")

In [None]:
# Per-class precision/recall/F1 on the test rows:
# level0 is the gold standard, LABEL_pred the model prediction.
report_0 = pd.DataFrame(
    classification_report(
        test_data_0["level0"],
        test_data_0["LABEL_pred"],
        output_dict=True,
    )
).T
report_0.to_csv(results_dir + "model_level0_report.csv")
print(report_0)


In [None]:
# Final classification / viz: share of each predicted label among the test rows.
final_0 = pd.DataFrame(
    test_data_0['LABEL_pred']
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # counts -> ratios
)
final_0.to_csv(results_dir + "model_level0_testdata_frequency.csv")
print(final_0.shape)
final_0

### BERT for level 1 -> 17 + 1 emotions

In [None]:
# Show full cell contents and up to 50 rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 50

# Pick one random id whose level1 label is "dis_sad" and display its row.
dis_sad_example = random.sample(
    list(distinct_df.loc[distinct_df["level1"] == "dis_sad", "id"]),
    k=1,
)
distinct_df.query('id==@dis_sad_example')


In [None]:
# Count the rows per level1 label to see how balanced the classes are.
classCounts_1 = distinct_df["level1"].value_counts()
print(classCounts_1)
# -> the classes are not balanced

In [None]:
# Number of distinct texts available for the level1 model.
numberOfDocuments_1 = distinct_df.shape[0]
numberOfDocuments_1

In [None]:
# Run the project helper get_bert (training.bert_func) with "level1" as the label column.
# Presumably this trains/evaluates the classifier and writes under models_dir and
# results_dir -- TODO confirm in training.bert_func. The five returned objects are
# consumed by the evaluation cells below.
dataset_1, results_1, tokenized_testing_data_1, testing_data_1, label2id_1 = get_bert(distinct_df, "level1", bert, models_dir, results_dir)

#### Evaluation

In [None]:
# Turn the get_bert results into a frame and attach the ids of dataset_1 by position:
# resetting the index lines both frames up on 0..n-1 before the column assignment.
df_results_1 = pd.DataFrame.from_dict(results_1)
df_id_1 = pd.DataFrame(dataset_1["id"]).reset_index()
df_results_1["id"] = df_id_1["id"]
df_results_1

In [None]:
# Left-join the predictions onto dataset_1 by id, so each row holds the gold standard
# next to the model output. 'label_y' is the suffixed right-hand column of the join
# (presumably the predicted label from df_results_1 -- verify against get_bert).
data_classifies_1 = dataset_1.merge(df_results_1, how='left', on='id').rename(
    columns={'label_y': 'LABEL_pred'}
)
# Numeric id of each predicted label, looked up in label2id_1.
data_classifies_1["LABEL_pred_num"] = data_classifies_1["LABEL_pred"].map(label2id_1.get)
data_classifies_1.to_pickle(results_dir + "data_classified_level1.pkl")
data_classifies_1

In [None]:
# Keep only the classified rows whose id belongs to the test split.
# Series.isin replaces the former f-string query, which pasted the repr of the whole
# id collection into the expression: that only parses when the repr is a valid list
# literal and breaks on ids containing quotes.
test_data_1 = data_classifies_1[
    data_classifies_1["id"].isin(list(tokenized_testing_data_1["id"]))
]
test_data_1

In [None]:
# Dimensions of the level1 test split returned by get_bert.
r, c = pd.DataFrame(data=testing_data_1).shape
print(f"The test data has {r} row and {c} columns")

In [None]:
# Per-class precision/recall/F1 on the test rows:
# level1 is the gold standard, LABEL_pred the model prediction.
report_1 = pd.DataFrame(
    classification_report(
        test_data_1["level1"],
        test_data_1["LABEL_pred"],
        output_dict=True,
    )
).T
report_1.to_csv(results_dir + "model_level1_report.csv")
print(report_1)

In [None]:
# Final classification / viz: share of each predicted label among the test rows.
final_1 = pd.DataFrame(
    test_data_1['LABEL_pred']
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # counts -> ratios
)
final_1.to_csv(results_dir + "model_level1_testdata_frequency.csv")
print(final_1.shape)
final_1

### BERT for level 2 -> 11 + 1 emotions

In [None]:
# Show full cell contents and up to 50 rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 50

# Pick one random id whose level2 label is "dis_sad_gri" and display its row.
dis_sad_gri_example = random.sample(
    list(distinct_df.loc[distinct_df["level2"] == "dis_sad_gri", "id"]),
    k=1,
)
distinct_df.query('id==@dis_sad_gri_example')

In [None]:
# Count the rows per level2 label to see how balanced the classes are.
classCounts_2 = distinct_df["level2"].value_counts()
print(classCounts_2)
# -> the classes are not balanced

In [None]:
# Number of distinct texts available for the level2 model.
numberOfDocuments_2 = distinct_df.shape[0]
numberOfDocuments_2

In [None]:
# Run the project helper get_bert (training.bert_func) with "level2" as the label column.
# Presumably this trains/evaluates the classifier and writes under models_dir and
# results_dir -- TODO confirm in training.bert_func. The five returned objects are
# consumed by the evaluation cells below.
dataset_2, results_2, tokenized_testing_data_2, testing_data_2, label2id_2 = get_bert(distinct_df, "level2", bert, models_dir, results_dir)

#### Evaluation

In [None]:
# Turn the get_bert results into a frame and attach the ids of dataset_2 by position:
# resetting the index lines both frames up on 0..n-1 before the column assignment.
df_results_2 = pd.DataFrame.from_dict(results_2)
df_id_2 = pd.DataFrame(dataset_2["id"]).reset_index()
df_results_2["id"] = df_id_2["id"]
df_results_2

In [None]:
# Left-join the predictions onto dataset_2 by id, so each row holds the gold standard
# next to the model output. 'label_y' is the suffixed right-hand column of the join
# (presumably the predicted label from df_results_2 -- verify against get_bert).
data_classifies_2 = dataset_2.merge(df_results_2, how='left', on='id').rename(
    columns={'label_y': 'LABEL_pred'}
)
# Numeric id of each predicted label, looked up in label2id_2.
data_classifies_2["LABEL_pred_num"] = data_classifies_2["LABEL_pred"].map(label2id_2.get)
data_classifies_2.to_pickle(results_dir + "data_classified_level2.pkl")
data_classifies_2

In [None]:
# Keep only the classified rows whose id belongs to the test split.
# Series.isin replaces the former f-string query, which pasted the repr of the whole
# id collection into the expression: that only parses when the repr is a valid list
# literal and breaks on ids containing quotes.
test_data_2 = data_classifies_2[
    data_classifies_2["id"].isin(list(tokenized_testing_data_2["id"]))
]
test_data_2

In [None]:
# Dimensions of the level2 test split returned by get_bert.
r, c = pd.DataFrame(data=testing_data_2).shape
print(f"The test data has {r} row and {c} columns")

In [None]:
# Per-class precision/recall/F1 on the test rows:
# level2 is the gold standard, LABEL_pred the model prediction.
report_2 = pd.DataFrame(
    classification_report(
        test_data_2["level2"],
        test_data_2["LABEL_pred"],
        output_dict=True,
    )
).T
report_2.to_csv(results_dir + "model_level2_report.csv")
print(report_2)

In [None]:
# Final classification / viz: share of each predicted label among the test rows.
final_2 = pd.DataFrame(
    test_data_2['LABEL_pred']
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # counts -> ratios
)
final_2.to_csv(results_dir + "model_level2_testdata_frequency.csv")
print(final_2.shape)
final_2

### BERT for level 3 -> 7 + 1 emotions

In [None]:
# Show full cell contents and up to 50 rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 50

# Pick one random id whose level3 label is "rem_emb_dis_sad_gri" and display its row.
rem_emb_dis_sad_gri_example = random.sample(
    list(distinct_df.loc[distinct_df["level3"] == "rem_emb_dis_sad_gri", "id"]),
    k=1,
)
distinct_df.query('id==@rem_emb_dis_sad_gri_example')

In [None]:
# Count the rows per level3 label to see how balanced the classes are.
classCounts_3 = distinct_df["level3"].value_counts()
print(classCounts_3)
# -> the classes are not balanced

In [None]:
# Number of distinct texts available for the level3 model.
numberOfDocuments_3 = distinct_df.shape[0]
numberOfDocuments_3

In [None]:
# Run the project helper get_bert (training.bert_func) with "level3" as the label column.
# Presumably this trains/evaluates the classifier and writes under models_dir and
# results_dir -- TODO confirm in training.bert_func. The five returned objects are
# consumed by the evaluation cells below.
dataset_3, results_3, tokenized_testing_data_3, testing_data_3, label2id_3 = get_bert(distinct_df, "level3", bert, models_dir, results_dir)

#### Evaluation

In [None]:
# Turn the get_bert results into a frame and attach the ids of dataset_3 by position:
# resetting the index lines both frames up on 0..n-1 before the column assignment.
df_results_3 = pd.DataFrame.from_dict(results_3)
df_id_3 = pd.DataFrame(dataset_3["id"]).reset_index()
df_results_3["id"] = df_id_3["id"]
df_results_3

In [None]:
# Left-join the predictions onto dataset_3 by id, so each row holds the gold standard
# next to the model output. 'label_y' is the suffixed right-hand column of the join
# (presumably the predicted label from df_results_3 -- verify against get_bert).
data_classifies_3 = dataset_3.merge(df_results_3, how='left', on='id').rename(
    columns={'label_y': 'LABEL_pred'}
)
# Numeric id of each predicted label, looked up in label2id_3.
data_classifies_3["LABEL_pred_num"] = data_classifies_3["LABEL_pred"].map(label2id_3.get)
data_classifies_3.to_pickle(results_dir + "data_classified_level3.pkl")
data_classifies_3

In [None]:
# Keep only the classified rows whose id belongs to the test split.
# Series.isin replaces the former f-string query, which pasted the repr of the whole
# id collection into the expression: that only parses when the repr is a valid list
# literal and breaks on ids containing quotes.
test_data_3 = data_classifies_3[
    data_classifies_3["id"].isin(list(tokenized_testing_data_3["id"]))
]
test_data_3

In [None]:
# Dimensions of the level3 test split returned by get_bert.
r, c = pd.DataFrame(data=testing_data_3).shape
print(f"The test data has {r} row and {c} columns")

In [None]:
# Per-class precision/recall/F1 on the test rows:
# level3 is the gold standard, LABEL_pred the model prediction.
report_3 = pd.DataFrame(
    classification_report(
        test_data_3["level3"],
        test_data_3["LABEL_pred"],
        output_dict=True,
    )
).T
report_3.to_csv(results_dir + "model_level3_report.csv")
print(report_3)

In [None]:
# Final classification / viz: share of each predicted label among the test rows.
final_3 = pd.DataFrame(
    test_data_3['LABEL_pred']
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # counts -> ratios
)
final_3.to_csv(results_dir + "model_level3_testdata_frequency.csv")
print(final_3.shape)
final_3

### BERT for plutchik -> 14 + 1 emotions

In [None]:
# Show full cell contents and up to 50 rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 50

# Pick one random id whose plutchik label is "grief" and display its row.
grief_example = random.sample(
    list(distinct_df.loc[distinct_df["plutchik"] == "grief", "id"]),
    k=1,
)
distinct_df.query('id==@grief_example')

In [None]:
# Count the rows per plutchik label to see how balanced the classes are.
classCounts_p = distinct_df["plutchik"].value_counts()
print(classCounts_p)
# -> the classes are not balanced

In [None]:
# Number of distinct texts available for the plutchik model.
numberOfDocuments_p = distinct_df.shape[0]
numberOfDocuments_p

In [None]:
# Run the project helper get_bert (training.bert_func) with "plutchik" as the label column.
# Presumably this trains/evaluates the classifier and writes under models_dir and
# results_dir -- TODO confirm in training.bert_func. The five returned objects are
# consumed by the evaluation cells below.
dataset_p, results_p, tokenized_testing_data_p, testing_data_p, label2id_p = get_bert(distinct_df, "plutchik", bert, models_dir, results_dir)

#### Evaluation

In [None]:
# Turn the plutchik get_bert results into a frame and attach the matching ids by position.
df_results_p = pd.DataFrame.from_dict(results_p)
# Fix: take the ids from dataset_p, the frame returned by the plutchik get_bert call.
# The original read dataset_2 (copy-paste from the level2 section), which pairs these
# predictions with the ids of a different run.
df_id_p = pd.DataFrame(dataset_p["id"])
# Reset to a 0..n-1 index so the column assignment below aligns row by row.
df_id_p = df_id_p.reset_index()
df_results_p["id"] = df_id_p["id"]
df_results_p

In [None]:
# Left-join the predictions onto dataset_p by id, so each row holds the gold standard
# next to the model output. 'label_y' is the suffixed right-hand column of the join
# (presumably the predicted label from df_results_p -- verify against get_bert).
data_classifies_p = dataset_p.merge(df_results_p, how='left', on='id').rename(
    columns={'label_y': 'LABEL_pred'}
)
# Numeric id of each predicted label, looked up in label2id_p.
data_classifies_p["LABEL_pred_num"] = data_classifies_p["LABEL_pred"].map(label2id_p.get)
data_classifies_p.to_pickle(results_dir + "data_classified_plutchik.pkl")
data_classifies_p

In [None]:
# Keep only the classified rows whose id belongs to the test split.
# Series.isin replaces the former f-string query, which pasted the repr of the whole
# id collection into the expression: that only parses when the repr is a valid list
# literal and breaks on ids containing quotes.
test_data_p = data_classifies_p[
    data_classifies_p["id"].isin(list(tokenized_testing_data_p["id"]))
]
test_data_p

In [None]:
# Dimensions of the plutchik test split returned by get_bert.
r, c = pd.DataFrame(data=testing_data_p).shape
print(f"The test data has {r} row and {c} columns")

In [None]:
# Per-class precision/recall/F1 on the test rows:
# plutchik is the gold standard, LABEL_pred the model prediction.
report_p = pd.DataFrame(
    classification_report(
        test_data_p["plutchik"],
        test_data_p["LABEL_pred"],
        output_dict=True,
    )
).T
report_p.to_csv(results_dir + "model_plutchik_report.csv")
print(report_p)

In [None]:
# Final classification / viz: share of each predicted label among the test rows.
final_p = pd.DataFrame(
    test_data_p['LABEL_pred']
    .value_counts()
    .pipe(lambda counts: counts / counts.sum())  # counts -> ratios
)
final_p.to_csv(results_dir + "model_plutchik_testdata_frequency.csv")
print(final_p.shape)
final_p