In [1]:
from datasets import load_dataset
import pandas as pd
from pathlib import Path
from collections import Counter
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load two configurations of the "go_emotions" dataset: the one selected with
# name="raw" (its "train" split carries per-row `rater_id` values, read below)
# and the builder's default configuration (which exposes the
# train / validation / test splits read below).
dataset = load_dataset("go_emotions", name="raw")
simplified = load_dataset("go_emotions")

In [5]:
# Pull the `id` and `rater_id` columns out of the raw train split.
ids, user_ids = (dataset["train"][column] for column in ("id", "rater_id"))

In [6]:
# Number of distinct `rater_id` values, then number of distinct `id` values.
tuple(len(set(values)) for values in (user_ids, ids))

(82, 58011)

In [7]:
# `id` column of each split of the default configuration.
tr_ids, val_ids, te_ids = (
    simplified[split]["id"] for split in ("train", "validation", "test")
)

In [8]:
# Row counts of the train / validation / test id columns, in that order.
tuple(map(len, (tr_ids, val_ids, te_ids)))

(43410, 5426, 5427)

In [9]:
# Keep every raw id (repeats included, original order preserved) that also
# appears in the simplified train split.
# The lookup set is built once: testing `i in tr_ids` directly rescans the
# whole id sequence for every raw id, which is quadratic in the two lengths.
train_id_set = set(tr_ids)
raw_train_ids = [i for i in ids if i in train_id_set]

In [10]:
# Total number of matched raw ids versus the number of distinct ids among them.
tuple(map(len, (raw_train_ids, set(raw_train_ids))))

(155502, 43410)

In [11]:
# Materialise the raw train split and each split of the default configuration
# as pandas DataFrames.
df_raw = pd.DataFrame(dataset["train"])
df_train_simply, df_val_simply, df_test_simply = (
    pd.DataFrame(simplified[split]) for split in ("train", "validation", "test")
)

In [12]:
# df_raw.shape
# Display the column labels of the raw frame (last expression of the cell, so
# the notebook renders it as the cell output).
df_raw.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [4]:
# Directory holding every GoEmotions CSV used by this notebook (relative path).
data_path = Path("../data/goemotions/")

# Export targets for the full raw frame and the three simplified splits.
raw_path = data_path.joinpath("raw.csv")
train_simple = data_path.joinpath("train_simple.csv")
val_simple = data_path.joinpath("val_simple.csv")
test_simple = data_path.joinpath("test_simple.csv")

# Per-split "*_raw.csv" locations.
train_path_raw = data_path.joinpath("train_raw.csv")
val_path_raw = data_path.joinpath("val_raw.csv")
test_path_raw = data_path.joinpath("test_raw.csv")

# Per-split CSVs that the notebook reads back with pd.read_csv.
train_path = data_path.joinpath("train.csv")
val_path = data_path.joinpath("val.csv")
test_path = data_path.joinpath("test.csv")

In [5]:
# df_raw.to_csv(raw_path, index=False)
# df_train_simply.to_csv(train_simple, index=False)
# df_val_simply.to_csv(val_simple, index=False)
# df_test_simply.to_csv(test_simple, index=False)

In [6]:
# df_train = df_raw[df_raw["id"].isin(df_train_simply["id"])]
# df_val = df_raw[df_raw["id"].isin(df_val_simply["id"])]
# df_test = df_raw[df_raw["id"].isin(df_test_simply["id"])]
# Read the three per-split CSVs back into DataFrames.
# NOTE(review): no visible cell writes train.csv / val.csv / test.csv, so they
# must already exist on disk for this cell to run - confirm how they were
# produced before relying on Restart & Run All.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)


In [7]:
# (rows, columns) of each split, in train / val / test order.
tuple(frame.shape for frame in (df_train, df_val, df_test))

((155501, 37), (19440, 37), (19470, 37))

In [8]:
# Distinct rater ids present in each split.
trainers, valers, testers = (
    set(frame["rater_id"].unique()) for frame in (df_train, df_val, df_test)
)
# Raters missing from train but present in val / test, then raters present in
# train but missing from val / test. Four empty sets mean all splits share the
# same raters.
(
    valers.difference(trainers),
    testers.difference(trainers),
    trainers.difference(valers),
    trainers.difference(testers),
)

(set(), set(), set(), set())

In [9]:
# df_train_v2 = df_train[df_train.rater_id != 68]
# # df_train[df_train.id == "efcldxm"]
# df_train_v2.shape, df_train.shape

In [14]:
def analyze_raters_split(df):
    """Summarise how many rows each rater contributed to ``df``.

    Prints the number of distinct ``rater_id`` values as a side effect.

    Args:
        df: DataFrame with a ``rater_id`` column.

    Returns:
        A tuple ``(top_5, worst_5)`` of ``(rater_id, row_count)`` lists taken
        from the descending-frequency ranking: its first five entries and its
        last five entries. With fewer than ten raters the two lists overlap.
    """
    counts = Counter(df["rater_id"].tolist())
    # One descending ranking serves both ends of the distribution.
    ranking = counts.most_common()
    print(len(counts))
    return ranking[:5], ranking[-5:]

In [15]:
def check_duplicates(df1, df2, column):
    """Print overlap statistics for ``column`` between two DataFrames.

    Four lines are printed, each a label followed by a count:

    * ``duplicates`` - rows of ``df1`` whose ``column`` value occurs in ``df2``.
    * ``unique_dup`` - distinct values among those rows.
    * ``unique1``    - distinct values of ``df1[column]`` minus ``unique_dup``.
    * ``unique2``    - distinct values of ``df2[column]`` minus ``unique_dup``.

    Args:
        df1: First DataFrame; the overlapping rows are taken from this frame.
        df2: Second DataFrame, used as the lookup side.
        column: Name of the column compared in both frames.

    Returns:
        None. The statistics are only printed.
    """
    left_values = df1[column]
    right_values = df2[column]
    shared = left_values[left_values.isin(right_values)]
    n_shared_unique = len(shared.unique())
    report = (
        ("duplicates", len(shared)),
        ("unique_dup", n_shared_unique),
        ("unique1", len(left_values.unique()) - n_shared_unique),
        ("unique2", len(right_values.unique()) - n_shared_unique),
    )
    for label, value in report:
        print(label, value)

In [16]:
# print("train-val")
# check_duplicates(df_train_v2, df_val, "rater_id")
# check_duplicates(df_train_v2, df_val, "id")
# print("train-test")
# check_duplicates(df_train_v2, df_test, "rater_id")
# check_duplicates(df_train_v2, df_test, "id")
# print("val-test")
# check_duplicates(df_val, df_test, "rater_id")
# check_duplicates(df_val, df_test, "id")


In [17]:
# Print the rater summary for every split: the split label, then the rater
# count emitted by analyze_raters_split, then its (top_5, worst_5) result.
for split_name, frame in (("train", df_train), ("val", df_val), ("test", df_test)):
    print(split_name)
    print(analyze_raters_split(frame))


train
81
([(4, 7899), (61, 7434), (37, 6879), (2, 4355), (62, 4125)], [(69, 63), (45, 51), (53, 45), (0, 31), (47, 3)])
val
81
([(4, 948), (61, 922), (37, 820), (2, 532), (52, 530)], [(45, 7), (53, 6), (80, 6), (0, 2), (47, 2)])
test
81
([(4, 1010), (61, 919), (37, 878), (2, 553), (52, 544)], [(80, 8), (65, 8), (0, 7), (53, 6), (47, 2)])


In [23]:
# df_train_v2.to_csv(train_path_raw, index=False)
# df_val.to_csv(val_path_raw, index=False)
# df_test.to_csv(test_path_raw, index=False)

In [24]:
# Column names for the final export format (not referenced by any other
# visible cell).
FINAL_COLUMNS = [
    "text",
    "rater_id",
    "emotions",
]

# Emotion label columns, in the same order as they appear in df_raw.columns.
EMOTIONS = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

In [25]:
# def extract_emotions(sample):
#     emo_cols = sample[EMOTIONS]
#     mask = emo_cols.to_numpy(dtype=bool)
#     emos = " ".join(np.array(EMOTIONS)[mask].tolist())
#     return emos

# def change_row(sample):
#     id = sample["rater_id"]
#     text = sample["text"]
#     emotions = extract_emotions(sample)

In [26]:
# sample = df_train_v2.iloc[21]
# out = extract_emotions(sample)
# out

'somethin something else'