In [2]:
import csv

# === File paths ===
# All names are relative to the current working directory.

# Input: the full MUSE dataset; only its 'track' and 'artist' columns are read here.
ORIGINAL = "muse_final_emotion_dataset.csv"
# Input: fetched dataset; a row counts as fetched only when its 'lyrics' cell is
# non-empty and is not the 'lyrics_not_found' placeholder.
FETCHED = "muse_with_lyrics.csv"
# Input: track/artist pairs whose lyrics fetch failed.
FAILED = "og_failed_ones.csv"
# Output: pairs from ORIGINAL that are neither fetched nor failed
# (opened in 'w' mode, so it is overwritten on every run).
SKIPPED_OUT = "skipped_phase2.csv"

def make_pair(track: str, artist: str) -> str:
    """Build the case-insensitive lookup key for a song.

    Both values are stripped of surrounding whitespace and lower-cased, then
    joined as "track|artist".
    """
    normalized_parts = (part.strip().lower() for part in (track, artist))
    return "|".join(normalized_parts)

# --- Helpers ---
def _has_lyrics(row: dict) -> bool:
    """Return True when `row` carries real lyrics text.

    A missing 'lyrics' column, a short row (csv.DictReader fills absent cells
    with None), a blank cell and the 'lyrics_not_found' placeholder all count
    as "no lyrics".
    """
    # `or ''` covers the None that DictReader yields for short rows; the
    # default argument of dict.get only applies when the key is absent.
    lyrics = (row.get('lyrics') or '').strip()
    return bool(lyrics) and lyrics.lower() != 'lyrics_not_found'


def _load_pairs(path: str, keep=None) -> dict:
    """Read a UTF-8 CSV with 'track' and 'artist' columns.

    Args:
        path: CSV file to read.
        keep: optional predicate taking the row dict; rows for which it
            returns False are ignored.

    Returns:
        Dict mapping each make_pair() key to its normalized
        (track, artist) tuple. Keeping the components next to the key means
        the key never has to be split apart again, which would be ambiguous
        for names that contain '|'.

    Raises:
        KeyError: if a data row has no 'track' or 'artist' column.
    """
    pairs = {}
    with open(path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            if keep is not None and not keep(row):
                continue
            track, artist = row['track'], row['artist']
            pairs[make_pair(track, artist)] = (
                track.strip().lower(),
                artist.strip().lower(),
            )
    return pairs


# --- Load original ---
print("🔄 Loading original dataset...")
original_lookup = _load_pairs(ORIGINAL)
original_pairs = set(original_lookup)
print(f"📊 Original total: {len(original_pairs)}")

# --- Load fetched ---
print("🔄 Loading fetched dataset...")
fetched_pairs = set(_load_pairs(FETCHED, keep=_has_lyrics))
print(f"✅ Fetched total: {len(fetched_pairs)}")

# --- Load failed ---
print("🔄 Loading failed dataset...")
failed_pairs = set(_load_pairs(FAILED))
print(f"❌ Failed total: {len(failed_pairs)}")

# --- Subtract to get skipped ---
unfetched_pairs = original_pairs - fetched_pairs
skipped_pairs = unfetched_pairs - failed_pairs

print(f"🚀 Unfetched total: {len(unfetched_pairs)}")
print(f"📝 Skipped (untouched) total: {len(skipped_pairs)}")

# --- Write skipped pairs back to CSV ---
with open(SKIPPED_OUT, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['track', 'artist'])
    # skipped_pairs is a subset of original_pairs, so every key is present in
    # original_lookup; looking the components up (instead of splitting the key
    # on '|') keeps names that contain '|' intact.
    writer.writerows(original_lookup[key] for key in sorted(skipped_pairs))

print(f"💾 Skipped list written to: {SKIPPED_OUT}")


🔄 Loading original dataset...
📊 Original total: 90001
🔄 Loading fetched dataset...
✅ Fetched total: 42322
🔄 Loading failed dataset...
❌ Failed total: 34645
🚀 Unfetched total: 47679
📝 Skipped (untouched) total: 13043
💾 Skipped list written to: skipped_phase2.csv


In [None]:
import pandas as pd

def _normalize_keys(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `frame` with 'track' and 'artist' stripped and lower-cased.

    The input frame is left untouched, so re-running the cell is idempotent.
    """
    return frame.assign(
        track=frame['track'].astype(str).str.strip().str.lower(),
        artist=frame['artist'].astype(str).str.strip().str.lower(),
    )


# Original dataset with emotion labels
original = _normalize_keys(pd.read_csv("muse_final_emotion_dataset.csv"))

# Skipped songs with newly fetched lyrics
# NOTE(review): "akipped_..." looks like a typo for "skipped_..." — confirm the
# file name on disk before running.
skipped_fetched = _normalize_keys(pd.read_csv("akipped_muse_with_lyrics.csv"))

# One label row per (track, artist). Without this, pairs that occur more than
# once in the original dataset (after normalization) would duplicate the
# matching lyrics rows in the left join below.
label_columns = ['track', 'artist', 'final_emotion']  # add other columns if needed
labels = original[label_columns].drop_duplicates(subset=['track', 'artist'], keep='first')
n_duplicate_labels = len(original) - len(labels)
if n_duplicate_labels:
    print(f"⚠️ Ignored {n_duplicate_labels} duplicate (track, artist) rows in the original dataset")

# Merge lyrics with emotion labels
merged = pd.merge(
    skipped_fetched,
    labels,
    on=['track', 'artist'],
    how='left',              # left join keeps all fetched songs
    validate='many_to_one',  # fail loudly if label keys are not unique
    indicator=True,          # temporary '_merge' column to count unmatched rows
)
n_unlabeled = int((merged['_merge'] == 'left_only').sum())
merged = merged.drop(columns='_merge')

# A many-to-one left join must preserve the number of fetched rows.
assert len(merged) == len(skipped_fetched)
if n_unlabeled:
    print(f"⚠️ {n_unlabeled} fetched songs have no matching emotion label")

merged.to_csv("skipped_phase2_fetched_with_labels.csv", index=False)
print("✅ Merged skipped songs with emotion labels")
