# emotions

In [39]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset

# Paths and Variables

In [40]:
# Short name of the dataset; later cells use it to build the output directory
# and the names of the saved files.
dataset_name = "emotions"

In [41]:
# NOTE(review): `input_dir` is not referenced by any later cell visible in this
# notebook (the data is loaded from the Hugging Face Hub instead).
input_dir = './data'
# Processed files are written two directory levels up, under processed/<dataset_name>/.
output_dir = f'./../../processed/{dataset_name}/'
# Create the output directory (and any missing parents); no error if it already exists.
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): path of an uncompressed CSV; not referenced by the later cells
# visible here, which write zip archives via save_df_to_zipped_csv.
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')

# Get data from Hugging Face Datasets

In [42]:
# Load the "split" configuration of the dair-ai/emotion dataset. The cells
# below access its 'train', 'validation' and 'test' splits.
data = load_dataset("dair-ai/emotion", "split")

In [43]:
# Number of examples per split, in train / validation / test order.
tuple(len(data[split]['text']) for split in ('train', 'validation', 'test'))

(16000, 2000, 2000)

## Combine train and validation data

In [44]:
# Pull the text and label columns out of every split as plain Python lists.
# Wrapping in list(...) matters: depending on the installed `datasets` version,
# indexing a split by column name may return a lazy column object instead of a
# list, and such an object has no .extend(). Copying also guarantees that the
# in-place extends below only ever touch our own lists.
train_data = list(data['train']['text'])
train_labels = list(data['train']['label'])
val_data = list(data['validation']['text'])
val_labels = list(data['validation']['label'])
test_data = list(data['test']['text'])
test_labels = list(data['test']['label'])

# Fold the validation examples into the training set (texts and labels in the
# same order so they stay aligned).
train_data.extend(val_data)
train_labels.extend(val_labels)

# Sanity check: every text must still have exactly one label.
assert len(train_data) == len(train_labels)
assert len(test_data) == len(test_labels)

In [45]:
# Column names used when building the train/test DataFrames in the cells below.
id_col = "id"
target_col = "label"
text_col = "text"

In [46]:
# Assemble the training frame column by column. Ids are simply the row
# positions 0..n-1 of the combined train + validation lists.
train_columns = {
    id_col: range(len(train_data)),
    text_col: train_data,
    target_col: train_labels,
}
train_data = pd.DataFrame(train_columns)
train_data.head()

Unnamed: 0,id,text,label
0,0,i didnt feel humiliated,0
1,1,i can go from feeling so hopeless to so damned...,0
2,2,im grabbing a minute to post i feel greedy wrong,3
3,3,i am ever feeling nostalgic about the fireplac...,2
4,4,i am feeling grouchy,3


In [47]:
# Assemble the test frame the same way. Its ids are also row positions
# starting at 0, independent of the training ids.
test_columns = {
    id_col: range(len(test_data)),
    text_col: test_data,
    target_col: test_labels,
}
test_data = pd.DataFrame(test_columns)
test_data.head()

Unnamed: 0,id,text,label
0,0,im feeling rather rotten so im not very ambiti...,0
1,1,im updating my blog because i feel shitty,0
2,2,i never make her separate from me because i do...,0
3,3,i left with my bouquet of red and yellow tulip...,1
4,4,i was feeling a little vain when i did this one,0


In [48]:
# Stack the training frame (which already contains the validation rows) on top
# of the test frame; this combined frame is what gets saved as the main data file.
# This rebinds `data`, which until now held the loaded dataset, so the earlier
# cells that index data['train'] etc. cannot be re-run after this one without
# reloading the dataset first.
# NOTE(review): both frames number their ids from 0, so `id` values repeat in
# the combined frame, and the original row indexes are kept as well — confirm
# that non-unique ids in the main file are intended.
data = pd.concat([train_data, test_data], axis=0)

In [49]:
# Keep only the first row for each id value in the train and test frames.
# NOTE(review): the ids were generated as consecutive row numbers, so this
# looks like a no-op — confirm whether de-duplicating on the text column was
# what was actually intended.
train_data = train_data.drop_duplicates(subset=[id_col], keep='first')
test_data = test_data.drop_duplicates(subset=[id_col], keep='first')

# Shuffle Data

In [50]:
# Shuffle the training rows: sample(frac=1) returns every row in random order,
# and the fixed random_state makes that order reproducible across runs.
# The original index labels are kept, so the preview is no longer in 0..n order.
# Only the training frame is shuffled in this cell; the test frame is left as is.
train_data = train_data.sample(frac=1, random_state=42)
train_data.head()

Unnamed: 0,id,text,label
2574,2574,i figure that if i do enough radio appearances...,0
7496,7496,when a boy tried to fool me so he would be ok ...,3
9210,9210,im not feeling too hot this week so it has bee...,2
5456,5456,i feel rejected like my peers dont really unde...,0
736,736,i feel privileged to have narrated erik prince...,1


In [51]:
# Answer key for the test set: id plus label, copied so it is independent of
# the test frame.
key_columns = [id_col, target_col]
test_key = test_data[key_columns].copy()

# The test frame itself keeps every column except the label.
test_data = test_data.drop(columns=[target_col])

# Utility to save a DataFrame as a zipped CSV file

In [52]:
def save_df_to_zipped_csv(df, ftype=None):
    """Write ``df`` as a single CSV file inside a zip archive in ``output_dir``.

    The archive is named ``<dataset_name>.zip`` and the CSV inside it
    ``<dataset_name>.csv``. When ``ftype`` is given, ``_<ftype>`` is appended
    to both base names (e.g. ``<dataset_name>_train.zip``).

    Relies on the notebook-level ``dataset_name`` and ``output_dir`` variables.

    Args:
        df: DataFrame to save; its index is not written.
        ftype: Optional suffix identifying the file (for example ``"train"``).
    """
    stem = dataset_name if ftype is None else f'{dataset_name}_{ftype}'
    zip_path = os.path.join(output_dir, f'{stem}.zip')
    df.to_csv(
        zip_path,
        index=False,
        compression={'method': 'zip', 'archive_name': f'{stem}.csv'},
    )

# Save Main Data File

In [53]:
# Save the combined train + test frame. With no ftype suffix the archive is
# named <dataset_name>.zip and contains <dataset_name>.csv.
save_df_to_zipped_csv(data)

In [54]:
# Write one zip archive per frame: the shuffled training set, the unlabeled
# test set, and the test answer key.
for frame, split_name in (
    (train_data, "train"),
    (test_data, "test"),
    (test_key, "test_key"),
):
    save_df_to_zipped_csv(frame, split_name)