In [1]:
import pandas as pd

# Parquet file of each split, relative to the dataset root used below.
splits = {
    'train': 'split/train-00000-of-00001.parquet',
    'validation': 'split/validation-00000-of-00001.parquet',
    'test': 'split/test-00000-of-00001.parquet',
}

# Read the three splits in a fixed order (train, validation, test) and bind
# each one to its own DataFrame.
df_train, df_val, df_test = (
    pd.read_parquet("hf://datasets/dair-ai/emotion/" + splits[split_name])
    for split_name in ("train", "validation", "test")
)

In [None]:
def print_emotion_distribution(df, dataset_name):
    """Print a per-emotion count and percentage table for one dataset.

    Args:
        df: DataFrame with a ``label`` column holding the emotion label ids.
        dataset_name: Name shown in the table title, e.g. ``"Training"``.

    Rows are printed in ascending label order. Percentages are relative to
    ``len(df)`` and rounded to two decimals. A label id that is not in the
    0-5 mapping is reported as ``unknown`` rather than raising ``KeyError``,
    so unexpected data stays visible in the report instead of aborting it.
    """
    label_to_emotion = {
        0: 'sadness',
        1: 'joy',
        2: 'love',
        3: 'anger',
        4: 'fear',
        5: 'surprise'
    }

    total = len(df)
    distribution = df['label'].value_counts()
    percentages = (distribution / total * 100).round(2)

    print(f"\n{dataset_name} Dataset Distribution:")
    print("-" * 50)
    print(f"{'Emotion':<10} {'Label':<8} {'Count':<8} {'Percentage':<10}")
    print("-" * 50)

    for label in sorted(distribution.index):
        count = distribution[label]
        percentage = percentages[label]
        # Fall back to a placeholder name so one stray label id cannot crash the report.
        emotion = label_to_emotion.get(label, 'unknown')
        print(f"{emotion:<10} {label:<8} {count:<8} {percentage:>6.2f}%")

    print(f"\nTotal samples: {total}")

# Report the label distribution of every split, in train / validation / test order
for split_title, split_frame in (
    ("Training", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print_emotion_distribution(split_frame, split_title)


Training Dataset Distribution:
--------------------------------------------------
Emotion    Label    Count    Percentage
--------------------------------------------------
sadness    0        4666      29.16%
joy        1        5362      33.51%
love       2        1304       8.15%
anger      3        2159      13.49%
fear       4        1937      12.11%
surprise   5        572        3.58%

Total samples: 16000

Validation Dataset Distribution:
--------------------------------------------------
Emotion    Label    Count    Percentage
--------------------------------------------------
sadness    0        550       27.50%
joy        1        704       35.20%
love       2        178        8.90%
anger      3        275       13.75%
fear       4        212       10.60%
surprise   5        81         4.05%

Total samples: 2000

Test Dataset Distribution:
--------------------------------------------------
Emotion    Label    Count    Percentage
--------------------------------------------

In [2]:
# Show the training split; as the cell's last expression the frame is rendered as the cell output.
df_train

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3


In [3]:
# Show the validation split; as the cell's last expression the frame is rendered as the cell output.
df_val

Unnamed: 0,text,label
0,im feeling quite sad and sorry for myself but ...,0
1,i feel like i am still looking at a blank canv...,0
2,i feel like a faithful servant,2
3,i am just feeling cranky and blue,3
4,i can have for a treat or if i am feeling festive,1
...,...,...
1995,im having ssa examination tomorrow in the morn...,0
1996,i constantly worry about their fight against n...,1
1997,i feel its important to share this info for th...,1
1998,i truly feel that if you are passionate enough...,1


In [4]:
# Show the test split; as the cell's last expression the frame is rendered as the cell output.
df_test

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0
...,...,...
1995,i just keep feeling like someone is being unki...,3
1996,im feeling a little cranky negative after this...,3
1997,i feel that i am useful to my people and that ...,1
1998,im feeling more comfortable with derby i feel ...,1


In [None]:
import csv
import json

csv_file_path = 'test.csv'
jsonl_file_path = 'test.jsonl'

# Keys are strings because csv.reader yields every field as text.
label_to_emotion = {
    '0': 'sadness',
    '1': 'joy',
    '2': 'love',
    '3': 'anger',
    '4': 'fear',
    '5': 'surprise'
}

# NOTE(review): this literal starts with a newline and carries the source
# indentation and trailing spaces, all of which are written verbatim into every
# record. It is kept byte-for-byte here because previously generated files may
# depend on the exact text -- confirm before normalising it.
system_prompt = """
            You are an empathetic mental health support chatbot. 
            Analyze the user's message to understand their core emotional state. 
            Identify and state the single primary emotion. 
            Only output one of these emotions as a response: sadness, joy, love, anger, fear, or surprise. 
            Do not output any other text.
            """


def convert_csv_to_jsonl(csv_path, jsonl_path, label_map=None, prompt=None):
    """Convert a two-column ``text,label`` CSV into chat-style JSONL.

    Every valid row becomes one JSON line containing a ``messages`` list with
    a system, a user and an assistant entry; the assistant content is the
    emotion name looked up from the row's label.

    Args:
        csv_path: Input CSV; its first row is treated as the header.
        jsonl_path: Output file, overwritten if it already exists.
        label_map: Maps the label text found in the CSV to an emotion name.
            Defaults to the module-level ``label_to_emotion``.
        prompt: Content of the system message. Defaults to the module-level
            ``system_prompt``.

    Returns:
        ``(written, skipped)``: number of JSON lines written and number of
        rows skipped (wrong column count, blank text, or unknown label).

    Raises:
        FileNotFoundError: If ``csv_path`` cannot be opened.
    """
    if label_map is None:
        label_map = label_to_emotion
    if prompt is None:
        prompt = system_prompt

    written = 0
    skipped = 0

    # newline='' is what the csv module expects for files it reads, so that
    # line breaks inside quoted fields are interpreted by the reader itself.
    with open(csv_path, 'r', encoding='utf-8', newline='') as infile, \
         open(jsonl_path, 'w', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        # A default avoids an uncaught StopIteration on a completely empty file.
        header = next(reader, None)
        if header is None:
            print(f"Warning: '{csv_path}' is empty")
            return written, skipped

        if header != ['text', 'label']:
            print(f"Warning: Expected header ['text', 'label'], found {header}")

        for row in reader:
            try:
                if len(row) != 2:
                    skipped += 1
                    continue

                # Strip before validating so whitespace-only text is rejected
                # instead of being written as an empty user message.
                text = row[0].strip()
                emotion_name = label_map.get(row[1].strip())

                if not text or emotion_name is None:
                    skipped += 1
                    continue

                json_object = {
                    "messages": [
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": text},
                        {"role": "assistant", "content": emotion_name}
                    ]
                }

                outfile.write(json.dumps(json_object) + '\n')
                written += 1

            except Exception as e:
                # Best-effort conversion: report the offending row and keep going.
                print(f"Error processing row: {row} - {e}")
                skipped += 1

    return written, skipped


try:
    line_count, error_count = convert_csv_to_jsonl(csv_file_path, jsonl_file_path)
    print(f"Wrote {line_count} lines to {jsonl_file_path}")
    if error_count > 0:
        print(f"Skipped {error_count} rows due to errors")

except FileNotFoundError:
    print(f"File '{csv_file_path}' not found")


Wrote 2000 lines to test.jsonl


In [None]:
import random

def trim_jsonl(input_file, output_file, num_records, seed=None):
    """Write a random sample of at most ``num_records`` lines to a new file.

    Args:
        input_file: Path of the JSONL file to sample from.
        output_file: Path of the file to write; may be the same as
            ``input_file`` because the input is fully read before the output
            is opened.
        num_records: Maximum number of lines to keep. If the input has fewer
            non-blank lines, all of them are written.
        seed: Optional seed for a private random generator, making the sample
            reproducible. When ``None`` the shared ``random`` module state is
            used, as before.

    Raises:
        ValueError: If ``num_records`` is negative.
    """
    if num_records < 0:
        raise ValueError("num_records must be non-negative")

    # Read everything first: opening the output for writing while the input is
    # still unread would truncate the data when both paths are the same file.
    with open(input_file, 'r', encoding='utf-8') as infile:
        # Skip blank lines and force a single trailing newline on every record,
        # so a final line without one cannot fuse with its neighbour once the
        # order is randomised.
        lines = [line.rstrip('\n') + '\n' for line in infile if line.strip()]

    rng = random if seed is None else random.Random(seed)
    trimmed_lines = rng.sample(lines, min(num_records, len(lines)))

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(trimmed_lines)

    # Report what was actually written, which can be less than requested.
    print(f"Trimmed {len(trimmed_lines)} records from {input_file} and saved to {output_file}")

# Draw one small random subset from each split
for source_path, target_path, sample_size in (
    ('train.jsonl', 'train_100.jsonl', 100),
    ('val.jsonl', 'val_50.jsonl', 50),
    ('test.jsonl', 'test_10.jsonl', 10),
):
    trim_jsonl(source_path, target_path, sample_size)

Trimmed 100 records from train.jsonl and saved to train_100.jsonl
Trimmed 50 records from val.jsonl and saved to val_50.jsonl
Trimmed 10 records from test.jsonl and saved to test_10.jsonl


In [None]:
# Build training subsets of 1000, 2000, ..., 10000 records; each one is sampled
# independently from the full training file.
for train_size in range(1000, 10001, 1000):
    trim_jsonl('train.jsonl', f'train_{train_size}.jsonl', train_size)

# Build validation subsets of 100, 200, ..., 1000 records in the same way.
for val_size in range(100, 1001, 100):
    trim_jsonl('val.jsonl', f'val_{val_size}.jsonl', val_size)


Trimmed 1000 records from train.jsonl and saved to train_1000.jsonl
Trimmed 2000 records from train.jsonl and saved to train_2000.jsonl
Trimmed 3000 records from train.jsonl and saved to train_3000.jsonl
Trimmed 4000 records from train.jsonl and saved to train_4000.jsonl
Trimmed 5000 records from train.jsonl and saved to train_5000.jsonl
Trimmed 6000 records from train.jsonl and saved to train_6000.jsonl
Trimmed 7000 records from train.jsonl and saved to train_7000.jsonl
Trimmed 8000 records from train.jsonl and saved to train_8000.jsonl
Trimmed 9000 records from train.jsonl and saved to train_9000.jsonl
Trimmed 10000 records from train.jsonl and saved to train_10000.jsonl
Trimmed 100 records from val.jsonl and saved to val_100.jsonl
Trimmed 200 records from val.jsonl and saved to val_200.jsonl
Trimmed 300 records from val.jsonl and saved to val_300.jsonl
Trimmed 400 records from val.jsonl and saved to val_400.jsonl
Trimmed 500 records from val.jsonl and saved to val_500.jsonl
Trimmed 

In [None]:
# Do label-emotion mapping in CSV format
import csv

csv_file_path = 'test.csv'
output_csv_file_path = 'test_with_emotion.csv'

# Keys are strings because csv.reader yields every field as text.
label_to_emotion = {
    '0': 'sadness',
    '1': 'joy',
    '2': 'love',
    '3': 'anger',
    '4': 'fear',
    '5': 'surprise'
}


def add_emotion_column(csv_path, output_path, label_map=None):
    """Copy a ``text,label`` CSV, appending an ``emotion`` column.

    Args:
        csv_path: Input CSV; its first row is treated as the header.
        output_path: Output CSV, overwritten if it already exists.
        label_map: Maps the label text found in the CSV to an emotion name.
            Defaults to the module-level ``label_to_emotion``.

    Returns:
        ``(written, skipped)``: number of data rows written and number of rows
        skipped because they did not have exactly two fields (blank lines
        included).

    Raises:
        FileNotFoundError: If ``csv_path`` cannot be opened.

    A row whose label is not in ``label_map`` is still written; its emotion
    field is left empty because csv.writer renders ``None`` as ``''``.
    """
    if label_map is None:
        label_map = label_to_emotion

    written = 0
    skipped = 0

    # newline='' on both files, as the csv module expects for reading and writing.
    with open(csv_path, 'r', encoding='utf-8', newline='') as infile, \
         open(output_path, 'w', encoding='utf-8', newline='') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # A default avoids an uncaught StopIteration on a completely empty file.
        header = next(reader, None)
        if header is None:
            print(f"Warning: '{csv_path}' is empty")
            return written, skipped
        writer.writerow(header + ['emotion'])

        for row in reader:
            # Unpacking a blank or malformed row would raise ValueError and
            # leave a half-written output file, so such rows are skipped.
            if len(row) != 2:
                skipped += 1
                continue
            emotion_name = label_map.get(row[1])
            writer.writerow(row + [emotion_name])
            written += 1

    return written, skipped


try:
    row_count, skipped_count = add_emotion_column(csv_file_path, output_csv_file_path)
    print(f"Wrote to {output_csv_file_path}")
    if skipped_count > 0:
        print(f"Skipped {skipped_count} malformed rows")

except FileNotFoundError:
    print(f"File '{csv_file_path}' not found")



Wrote to test_with_emotion.csv
