**standardizes and balances MELD text data into a RAVDESS-compatible format through systematic preprocessing and stratified sampling.**

#RAVDESS Labels

| Code | Emotion   |
| ---- | --------- |
| 01   | Neutral   |
| 02   | Calm      |
| 03   | Happy     |
| 04   | Sad       |
| 05   | Angry     |
| 06   | Fearful   |
| 07   | Disgust   |
| 08   | Surprised |


#MELD Labels

| Label ID | Emotion  |
| -------- | -------- |
| 0        | Neutral  |
| 1        | Joy      |
| 2        | Surprise |
| 3        | Sadness  |
| 4        | Anger    |
| 5        | Disgust  |
| 6        | Fear     |


| Unified Code (RAVDESS-style) | Emotion Name | RAVDESS   | MELD            |
| ---------------------------- | ------------ | --------- | --------------- |
| 01                           | Neutral      | Neutral   | Neutral         |
| 02                           | Calm         | Calm      | — (not present) |
| 03                           | Happy        | Happy     | Joy             |
| 04                           | Sad          | Sad       | Sadness         |
| 05                           | Angry        | Angry     | Anger           |
| 06                           | Fearful      | Fearful   | Fear            |
| 07                           | Disgust      | Disgust   | Disgust         |
| 08                           | Surprised    | Surprised | Surprise        |


In [1]:
#connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import os


In [8]:
# -------- paths --------
INPUT_DIR  = "/content/drive/MyDrive/Dissertion/Data/RAW_MELD_DATA"   # folder containing multiple CSVs

OUTPUT_CSV = "/content/drive/MyDrive/Dissertion/Data/Only_text/MELD.csv"


TEXT_COL = "Utterance"   # or "utterance"
EMO_COL  = "Emotion"     # or "emotion"

MELD_TO_RAVDESS = {
    "neutral": 1,     # neutral
    "joy": 3,         # happy
    "sadness": 4,     # sad
    "anger": 5,       # angry
    "fear": 6,        # fearful
    "disgust": 7,     # disgust
    "surprise": 8     # surprised
}



In [9]:
all_rows = []

for file in os.listdir(INPUT_DIR):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(INPUT_DIR, file)
    print(f"Processing: {file}")

    df = pd.read_csv(file_path)

    # normalize emotion text
    df[EMO_COL] = df[EMO_COL].str.lower().str.strip()

    # map MELD -> RAVDESS
    df["label"] = df[EMO_COL].map(MELD_TO_RAVDESS)

    # drop unmapped / missing labels
    df = df.dropna(subset=["label"])
    df["label"] = df["label"].astype(int)

    # keep only text + label
    temp_df = df[[TEXT_COL, "label"]]
    temp_df.columns = ["text", "label"]

    all_rows.append(temp_df)




Processing: 2_text_file.csv
Processing: 1_text_file.csv


In [10]:
# -------- merge all files --------
merged_df = pd.concat(all_rows, ignore_index=True)

# -------- class-balanced sampling --------
TARGET_SIZE = 3000
labels = sorted(merged_df["label"].unique())

num_classes = len(labels)
samples_per_class = TARGET_SIZE // num_classes
remainder = TARGET_SIZE % num_classes

balanced_parts = []

for i, label in enumerate(labels):
    class_df = merged_df[merged_df["label"] == label]

    n = samples_per_class + (1 if i < remainder else 0)

    if len(class_df) < n:
        raise ValueError(f"Not enough samples for label {label}")

    balanced_parts.append(
        class_df.sample(n=n, random_state=42)
    )




In [None]:
# -------- combine & shuffle --------
final_df = pd.concat(balanced_parts, ignore_index=True)
final_df = final_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# -------- save --------
final_df.to_csv(OUTPUT_CSV, index=False)

In [12]:
print("Total samples:", len(final_df))
print(final_df["label"].value_counts().sort_index())


Total samples: 3000
label
1    429
3    429
4    429
5    429
6    428
7    428
8    428
Name: count, dtype: int64


In [13]:
#print first 10 rows
final_df.head(10)

Unnamed: 0,text,label
0,Oh my God….What’s he gonna do now?,6
1,"Oh, can I throw up in my diaper genie?",4
2,"Oh no, you’ll have to come.",6
3,Fine.,1
4,What are you crazy?!,7
5,"I don't want to be single, okay? I just... I j...",4
6,Would you mind spending some time on my siadic...,5
7,"Oh! Ok, um, ok, um,",6
8,"Take your shirt off, and let's see what we're ...",1
9,Really?,8
