# Dataset Preparation
---

In [1]:
from pathlib import Path
# Absolute path of the current working directory; asset paths are built from it.
path = Path.cwd()

## Load original dataset
---

In [2]:
import re

# One entry per input line: the list of fields obtained by splitting on ";".
data = []

# Decode explicitly instead of relying on the platform's default encoding:
# the label column contains zero-width spaces (U+200B), which only survive
# a correct decode. NOTE(review): the file is assumed to be UTF-8 - confirm.
with open(path / "assets/original.txt", encoding="utf-8") as file:
    for line in file.read().splitlines():
        # Drop a single trailing ";" first so the split does not produce an
        # empty last field, then split the remainder on ";".
        # A line without any ";" separator yields a one-element list.
        data.append(
            re.sub(";$", "", line)
            .split(";")
        )

## Create dataframe
---

In [3]:
import pandas as pd

# Build the working frame from the parsed rows; rows that are shorter than
# the column list are padded with missing values by the constructor.
df = pd.DataFrame(
    data=data,
    columns=["text", "target_emotion"],
)

In [4]:
# Print row count, per-column non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            18000 non-null  object
 1   target_emotion  17771 non-null  object
dtypes: object(2)
memory usage: 281.4+ KB


In [5]:
# Replace missing values with empty strings so that the string operations
# applied to the labels later work on every row.
df = df.fillna(value="")

In [6]:
# Tally how often each distinct raw label string occurs.
df["target_emotion"].value_counts()

kesedihan                               5478
kegembiraan                             5373
kemarahan                               2310
ketakutan                               2113
cinta                                   1440
kejutan                                  623
                                         229
 kesedihan                               211
 sukacita                                 62
 kemarahan                                62
kegembiraanjo                             28
ketakutanfe                                7
kemarahanan                                6
kegembiraan jo                             4
 ​​kesedihan                               4
 kesedihanad                               3
 ​​cinta                                   3
kejutansur                                 3
 ​​takut                                   3
sukacita                                   3
 ​​sukacita                                2
 terkejut                                  2
kegembiraa

### Normalize classes
---

In [7]:
# Maps each canonical class name to the raw label variants that should be
# rewritten to it. Variants are compared after zero-width spaces and
# surrounding whitespace have been removed from the raw label.
# Each variant is listed once; repeated entries add nothing to a membership test.
true_labels = {
    "cinta": [
        "cinta kesayangannya",
        "sayang",
    ],
    "gembira": [
        "kegembiraan",
        "kegembiraan jo",
        "kegembiraan now",
        "kegembiraan saat ini",
        "kegembiraan saya",
        "kegembiraan them",
        "kegembiraan upcoming",
        "kegembiraan world",
        "kegembiraan yang glamor",
        "kegembiraanjo",
        "senang",
    ],
    "kaget": [
        "kejutan",
        "kejutansur",
        "kejutanyang luar biasa dan menyentuh",
        "terkejut",
    ],
    "marah": [
        "amarahmu",
        "kemarahan",
        "kemarahanan",
    ],
    "sedih": [
        "kesedihan",
        "kesedihanad",
        "kesedihanini.",
        "kesedihanlonely",
        "kesedihanlove",
        "kesedihans",
        "kesedihanyang diperlukan",
    ],
    "sukacita": [
        "bersukacita",
        "sukacitajo",
    ],
    "takut": [
        "ketakutan intricate",
        "ketakutan",
        "ketakutanfe",
    ],
}

In [8]:
# Flatten {canonical: [variants]} into {variant: canonical} so each label is
# resolved with a single dictionary lookup.
variant_to_label = {
    variant: true_label
    for true_label, labels in true_labels.items()
    for variant in labels
}

# Clean the labels column-wise instead of addressing rows by position:
# this does not depend on the index being a gap-free 0..n-1 range, so the
# cell can be re-run safely even after rows have been dropped.
cleaned_labels = (
    df["target_emotion"]
    .str.replace("\u200b", "", regex=False)  # remove zero-width spaces
    .str.strip()                             # remove surrounding whitespace
)

# Known variants become their canonical class; anything else is kept as-is.
df["target_emotion"] = cleaned_labels.map(
    lambda label: variant_to_label.get(label, label)
)

df["target_emotion"].value_counts()

sedih           5701
gembira         5419
marah           2383
takut           2124
cinta           1445
kaget            629
                 229
sukacita          69
mengherankan       1
Name: target_emotion, dtype: int64

### Delete rows with defective or low-frequency target emotions
---

In [9]:
# Select the rows whose label is empty or belongs to one of the classes
# being discarded, then remove them; the remaining rows keep their
# original index labels.
is_unwanted = df["target_emotion"].isin(['', "sukacita", "mengherankan"])
df = df.drop(index=df.index[is_unwanted])

df["target_emotion"].value_counts()

sedih      5701
gembira    5419
marah      2383
takut      2124
cinta      1445
kaget       629
Name: target_emotion, dtype: int64

## Export clean dataset
---

In [10]:
# Print row count, per-column non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17701 entries, 0 to 17998
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            17701 non-null  object
 1   target_emotion  17701 non-null  object
dtypes: object(2)
memory usage: 414.9+ KB


In [11]:
# Write the frame as a ";"-separated file, omitting the index column.
df.to_csv(
    path_or_buf=path / "assets/dataset.csv",
    sep=";",
    index=False,
)