In [None]:
import pandas as pd
import numpy as np
import re

# Configuration: seed for the sampling steps and the name of the raw-text column.
seed = 42
text_col = "text"

# Load the dataset to preprocess.
df = pd.read_csv("dataset_clean.csv")

# Every column except the text column is used as a label column.
# NOTE(review): assumes all non-text columns are 0/1 label indicators - confirm.
label_cols = [name for name in df.columns if name != text_col]

# Patterns are compiled once instead of being rebuilt on every call.
_URL_RE = re.compile(r"http\S+|www\.\S+")
_MENTION_RE = re.compile(r"@\w+")
_DISALLOWED_RE = re.compile(r"[^a-z0-9!?.,;:'\"()\-\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(t):
    """Normalize one raw text value into lowercase, ASCII-only text.

    Steps, in order:
      1. Missing values (``None`` / float NaN) become the empty string
         instead of the literal tokens ``"none"`` / ``"nan"``.
      2. The value is stringified and lowercased *first*, so that the
         placeholder, entity, URL and mention rules below are all
         case-insensitive (``[Name]``, ``HTTP://...``, ``WWW....``).
      3. ``[name]`` placeholders become ``person``; ``&amp;`` becomes ``and``.
      4. URLs (``http...`` / ``www....``) and ``@mentions`` are removed.
      5. Every character outside ``a-z 0-9 ! ? . , ; : ' " ( ) -`` and
         whitespace is replaced by a space. This already removes emoji and
         all other non-ASCII symbols, so no separate emoji pass is needed.
      6. Whitespace runs are collapsed to one space and the result is stripped.

    Args:
        t: Raw text; any object is accepted and converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # `t != t` is only true for NaN; the isinstance guard keeps the
    # comparison away from objects with unusual __ne__ behaviour.
    if t is None or (isinstance(t, float) and t != t):
        return ""

    t = str(t).lower()
    t = t.replace("[name]", "person")
    t = t.replace("&amp;", "and")
    t = _URL_RE.sub(" ", t)
    t = _MENTION_RE.sub(" ", t)
    t = _DISALLOWED_RE.sub(" ", t)
    return _WHITESPACE_RE.sub(" ", t).strip()

# --- Clean the text ---------------------------------------------------------
# Missing text is treated as empty; a bare .astype(str) would turn NaN into
# the literal string "nan" and keep it as if it were real text.
df["text_clean"] = df[text_col].fillna("").astype(str).apply(clean_text)

# Number of active labels per row.
df["num_labels"] = df[label_cols].sum(axis=1)

# Rows whose cleaned text is empty (e.g. the original was only a URL or emoji)
# carry no signal, and an empty CSV field is read back as NaN by pandas'
# default settings. Exclude them before computing the balancing statistics.
mask_has_text = df["text_clean"].ne("")

# --- Split "pure neutral" rows from everything else -------------------------
# A row is pure neutral when `neutral` is its only active label.
mask_pure_neutral = (df["neutral"] == 1) & (df["num_labels"] == 1)
mask_other = ~mask_pure_neutral

df_neutral = df[mask_pure_neutral & mask_has_text]
df_other = df[mask_other & mask_has_text]

# --- Downsample pure-neutral rows -------------------------------------------
# Keep at most `neutral_ratio` times as many pure-neutral rows as the most
# frequent non-neutral label has among the remaining rows.
neutral_ratio = 1.2
label_sums_other = df_other[label_cols].sum()
max_other = label_sums_other.drop("neutral").max()
target_neutral = int(max_other * neutral_ratio)
# max(..., 1) avoids a division by zero when there are no pure-neutral rows.
keep_frac = min(1.0, target_neutral / max(len(df_neutral), 1))

if len(df_neutral) > 0 and keep_frac < 1.0:
    df_neutral_down = df_neutral.sample(frac=keep_frac, random_state=seed)
else:
    df_neutral_down = df_neutral

# --- Recombine, shuffle and finalize ----------------------------------------
df_bal = pd.concat([df_other, df_neutral_down], axis=0)
df_bal = df_bal.sample(frac=1, random_state=seed).reset_index(drop=True)

df_bal = df_bal.drop(columns=[text_col, "num_labels"])
# Drop rows that have no active label at all.
df_bal = df_bal[df_bal[label_cols].sum(axis=1) > 0].reset_index(drop=True)

cols = ["text_clean"] + label_cols
df_bal = df_bal[cols]

# Invariant: every exported row has non-empty text.
assert df_bal["text_clean"].ne("").all(), "empty text_clean rows leaked into output"

out_path = "dataset_preprocessed_final.csv"
df_bal.to_csv(out_path, index=False)

print("Saved:", out_path, "shape:", df_bal.shape)
print(df_bal[label_cols].sum().sort_values(ascending=False).head(15))


Saved dataset_preprocessed4.csv
Shape: (54497, 31)
Label frequencies:
neutral           8936
approval          5512
admiration        4964
annoyance         4211
disapproval       3528
gratitude         3179
curiosity         2967
realization       2746
optimism          2684
amusement         2680
disappointment    2650
joy               2419
love              2374
anger             2349
confusion         2283
sadness           2048
caring            1867
excitement        1744
surprise          1657
disgust           1618
desire            1176
fear               977
remorse            802
embarrassment      785
nervousness        573
relief             435
pride              428
grief              220
dtype: int64
