**Install project requirements**

In [1]:
# Uncomment to install the project requirements into the running kernel's environment:
# %pip install -r requirements.txt

**Import libraries**

In [2]:
import pandas as pd

import os

**Define input data parameters**

In [3]:
# Build the full file location first, then derive its parent directory from it.
input_file = os.path.join("data", "raw", "SetFit_emotion.json")
input_path = os.path.dirname(input_file)
input_file

'data\\raw\\SetFit_emotion.json'

**Load input data**

In [4]:
# lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json(input_file, orient="records", lines=True)
df

Unnamed: 0,text,label,label_text,split
0,i didnt feel humiliated,0,sadness,train
1,i can go from feeling so hopeless to so damned...,0,sadness,train
2,im grabbing a minute to post i feel greedy wrong,3,anger,train
3,i am ever feeling nostalgic about the fireplac...,2,love,train
4,i am feeling grouchy,3,anger,train
...,...,...,...,...
19995,im having ssa examination tomorrow in the morn...,0,sadness,validation
19996,i constantly worry about their fight against n...,1,joy,validation
19997,i feel its important to share this info for th...,1,joy,validation
19998,i truly feel that if you are passionate enough...,1,joy,validation


**Check label class balance**

In [5]:
# normalize=True reports each label's share of the rows instead of raw counts.
df["label_text"].value_counts(normalize=True)

joy         0.33805
sadness     0.28985
anger       0.13545
fear        0.11865
love        0.08205
surprise    0.03595
Name: label_text, dtype: float64

**Check the proportion of each subset split**

In [6]:
# normalize=True reports each split's share of the rows instead of raw counts.
df["split"].value_counts(normalize=True)

train         0.8
test          0.1
validation    0.1
Name: split, dtype: float64

**Define a custom function to convert the pandas DataFrame into the data format accepted by fastText**

In [7]:
def save_model_data(df: pd.DataFrame, text: str, label: str, split: str) -> None:
    """
    Save pandas DataFrame subsets into fastText accepted data format.

    One file per distinct value of the split column is written to
    ``data/processed/<subset>.txt``. Every row becomes exactly one line of
    the form ``__label__<label> <text>``. Files are always encoded as UTF-8,
    and line breaks inside a text value are replaced by single spaces so
    that a record can never spill over several lines.

    Parameters
    ----------
    df : pd.DataFrame
        input dataframe
    text : str
        text column
    label : str
        label column
    split : str
        split column

    Returns
    -------
    None
    """
    output_path = os.path.join("data", "processed")
    os.makedirs(name=output_path, exist_ok=True)

    for subset in df[split].unique():
        df_subset = df.loc[df[split] == subset, [label, text]]
        output_file = os.path.join(output_path, f"{subset}.txt")
        # Pin the encoding: the platform default (e.g. cp1252 on Windows) can
        # fail on, or mis-encode, non-ASCII characters.
        with open(file=output_file, mode="w", encoding="utf-8") as file:
            for lbl, txt in zip(df_subset[label], df_subset[text]):
                # Keep one example per line: an embedded line break would
                # otherwise start a new, unlabeled record.
                single_line = " ".join(str(txt).splitlines())
                file.write(f"__label__{lbl} {single_line}\n")
        print(f"The file {output_file} has been saved.")

**Call the custom function to save the data**

In [8]:
# Writes one fastText-formatted .txt file per value of the "split" column.
save_model_data(df=df, text="text", label="label_text", split="split")

The file data\processed\train.txt has been saved.
The file data\processed\test.txt has been saved.
The file data\processed\validation.txt has been saved.
