In [1]:
import pandas as pd

# === File paths ===
ORIGINAL_FILE = "muse_final_emotion_dataset.csv"       # source of emotion labels
NEW_LYRICS_FILE = "skipped_muse_with_lyrics.csv"          # track,artist,_2,lyrics
OUTPUT_FILE = "skipped_phase2_fetched_with_labels.csv"

# Columns used to match a fetched song to its emotion label.
KEY_COLUMNS = ['track', 'artist']
# Columns kept from the fetched-lyrics file, in output order.
LYRICS_COLUMNS = KEY_COLUMNS + ['lyrics']


def normalize_keys(frame):
    """Return a copy of `frame` whose key columns are stripped and lower-cased.

    Values are cast to str first, so a missing track/artist becomes the
    literal text 'nan' instead of breaking the `.str` accessor. The input
    frame is not modified.
    """
    return frame.assign(**{
        col: frame[col].astype(str).str.strip().str.lower()
        for col in KEY_COLUMNS
    })


# === Load datasets ===
original = pd.read_csv(ORIGINAL_FILE)
new_fetched = pd.read_csv(NEW_LYRICS_FILE)

# === Clean / normalize new fetched dataset ===
# Drop the unnecessary '_2' column if present
new_fetched = new_fetched.drop(columns=['_2'], errors='ignore')

# Pick the columns BY NAME so a different column order cannot mislabel the
# data. Only fall back to positional renaming when the expected names are
# absent and the column count still matches.
if set(LYRICS_COLUMNS).issubset(new_fetched.columns):
    new_fetched = new_fetched[LYRICS_COLUMNS]
elif new_fetched.shape[1] == len(LYRICS_COLUMNS):
    new_fetched.columns = LYRICS_COLUMNS
else:
    raise ValueError(
        f"{NEW_LYRICS_FILE}: expected columns {LYRICS_COLUMNS}, "
        f"got {list(new_fetched.columns)}"
    )

# === Normalize columns for matching ===
original = normalize_keys(original)
new_fetched = normalize_keys(new_fetched)

# === Build a one-row-per-song label table ===
# A (track, artist) pair that appears several times in the label source would
# otherwise duplicate every matching fetched row during the join.
labels = original[KEY_COLUMNS + ['final_emotion']]
label_variants = labels.groupby(KEY_COLUMNS)['final_emotion'].nunique()
conflicting_keys = int((label_variants > 1).sum())
labels = labels.drop_duplicates(subset=KEY_COLUMNS, keep='first')

# === Merge lyrics with emotion labels ===
merged = pd.merge(
    new_fetched,
    labels,
    on=KEY_COLUMNS,
    how='left',               # keep all fetched songs
    validate='many_to_one'    # fail loudly if the label table is not unique per key
)

# === Save result ===
merged.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Merged skipped songs with emotion labels → {OUTPUT_FILE}")
print(f"📊 Total rows: {len(merged)}")
missing_emotion = merged['final_emotion'].isna().sum()
print(f"⚠️ Missing emotion labels: {missing_emotion}")
if conflicting_keys:
    # The first label encountered in ORIGINAL_FILE was kept for these songs.
    print(f"⚠️ Songs with conflicting labels in source (first kept): {conflicting_keys}")


✅ Merged skipped songs with emotion labels → skipped_phase2_fetched_with_labels.csv
📊 Total rows: 3766
⚠️ Missing emotion labels: 0


In [2]:
# Reload the labelled CSV from disk (the same path the merge cell wrote to)
# so the check reflects what was actually saved rather than in-memory state.
df2 = pd.read_csv("skipped_phase2_fetched_with_labels.csv")

# Bare last expression: the notebook displays the column index as cell output.
df2.columns

Index(['track', 'artist', 'lyrics', 'final_emotion'], dtype='object')

In [3]:

# Heading and per-emotion row counts, emitted by a single print call;
# sep="\n" puts the counts on the line after the heading.
print(
    "\n📊 Emotion distribution after first fetching:",
    df2['final_emotion'].value_counts(),
    sep="\n",
)


📊 Emotion distribution after first fetching:
final_emotion
sadness       680
reflective    617
joy           612
calm          512
excitement    480
romantic      441
neutral       323
fear           56
anger          45
Name: count, dtype: int64


In [13]:
import pandas as pd

# === File paths ===
FILE1 = "skipped_phase2_fetched_with_labels.csv"   # track,artist,lyrics,final_emotion
FILE2 = "skipped_lyrics.csv"   # track,artist,final_emotion,reason,lyrics
OUTPUT = "merged_clean.csv"

# Columns shared by both inputs, in the order they are written out.
COMMON_COLUMNS = ["track", "artist", "final_emotion", "lyrics"]
# Columns that identify a song when looking for duplicates.
KEY_COLUMNS = ["track", "artist"]

# === Load both CSVs ===
df1 = pd.read_csv(FILE1)
df2 = pd.read_csv(FILE2)

# === Standardize both to common columns ===
df1_clean = df1[COMMON_COLUMNS]

# df2 may have 'reason' column — ignore it
df2_clean = df2[COMMON_COLUMNS]

# === Merge them together ===
# FILE1 rows come first, so they win when the same song appears in both files.
stacked = pd.concat([df1_clean, df2_clean], ignore_index=True)

# === Drop duplicates based on (track, artist) ===
# Compare on stripped, lower-cased keys: FILE1's keys were normalised that way
# when it was produced, while FILE2's were not, so an exact-text comparison
# would miss songs that differ only in case or surrounding whitespace.
# The stored track/artist text itself is left untouched.
match_keys = stacked[KEY_COLUMNS].apply(
    lambda col: col.astype(str).str.strip().str.lower()
)
merged_df = stacked.loc[~match_keys.duplicated(keep="first")].reset_index(drop=True)

# === Save merged CSV ===
merged_df.to_csv(OUTPUT, index=False)

print(f"✅ Merged CSV saved → {OUTPUT}")
print(f"📊 Total rows: {len(merged_df)}")


✅ Merged CSV saved → merged_clean.csv
📊 Total rows: 3371


In [16]:

# Heading and per-emotion row counts for the merged frame, emitted by a single
# print call; sep="\n" puts the counts on the line after the heading.
print(
    "\n📊 Emotion distribution after first fetching:",
    merged_df['final_emotion'].value_counts(),
    sep="\n",
)


📊 Emotion distribution after first fetching:
final_emotion
sadness       622
joy           531
reflective    508
excitement    491
calm          486
romantic      366
neutral       313
fear           36
anger          18
Name: count, dtype: int64


In [None]:
import pandas as pd

# === File paths ===
FILE1 = "muse_final_emo.csv"   # expected: track,artist,lyrics,final_emotion — TODO confirm
FILE2 = "merged_clean.csv"   # track,artist,final_emotion,lyrics (no 'reason' column)
# NOTE(review): OUTPUT is the same path as FILE2, so this cell replaces its own
# input on disk. FILE2 is loaded into df2 before OUTPUT is written, but the
# earlier contents of merged_clean.csv are gone after this cell runs.
OUTPUT = "merged_clean.csv"

# Columns shared by both inputs, in the order they are written out.
COMMON_COLUMNS = ["track", "artist", "final_emotion", "lyrics"]
# Columns that identify a song when looking for duplicates.
KEY_COLUMNS = ["track", "artist"]

# === Load both CSVs ===
df1 = pd.read_csv(FILE1)
df2 = pd.read_csv(FILE2)

# === Standardize both to common columns ===
# Selecting by name also fixes the column order and drops any extra columns.
df1_clean = df1[COMMON_COLUMNS]
df2_clean = df2[COMMON_COLUMNS]

# === Merge them together ===
# FILE1 rows come first, so they win when the same song appears in both files.
stacked = pd.concat([df1_clean, df2_clean], ignore_index=True)

# === Drop duplicates based on (track, artist) ===
# Compare on stripped, lower-cased keys so the same song spelled with
# different case or stray whitespace in the two files is kept only once.
# The stored track/artist text itself is left untouched.
match_keys = stacked[KEY_COLUMNS].apply(
    lambda col: col.astype(str).str.strip().str.lower()
)
merged_df = stacked.loc[~match_keys.duplicated(keep="first")].reset_index(drop=True)

# === Save merged CSV ===
merged_df.to_csv(OUTPUT, index=False)

print(f"✅ Merged CSV saved → {OUTPUT}")
print(f"📊 Total rows: {len(merged_df)}")
