In [1]:
import csv
import json
from pathlib import Path

# ---- CONFIG ----
ORIGINAL_CSV = "muse_final_emotion_dataset.csv"       # your 90k dataset
PROGRESS_JSON = "progress_smh.json"         # your big progress file
ORIGINAL_PAIRS_CSV = "original_pairs.csv"
PROGRESS_PAIRS_JSON = "progress_pairs2.json"
SKIPPED_CSV = "skipped_clean2.csv"


def make_pair(track: str, artist: str) -> str:
    """Normalize track-artist pair key for matching."""
    return f"{track.strip().lower()}|{artist.strip().lower()}"


# --- STEP 1: Build original_pairs.csv ---
print("🔄 Loading original dataset...")
original_pairs = set()
with open(ORIGINAL_CSV, newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        track = row['track']
        artist = row['artist']
        pair = make_pair(track, artist)
        original_pairs.add(pair)

# Save original_pairs for reference
with open(ORIGINAL_PAIRS_CSV, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['pair'])
    for p in sorted(original_pairs):
        writer.writerow([p])

print(f"📊 Original unique pairs: {len(original_pairs)}")
print(f"💾 Saved original pairs to: {ORIGINAL_PAIRS_CSV}")


# --- STEP 2: Build processed_pairs.json ---
print("🔄 Loading progress file...")
with open(PROGRESS_JSON, 'r', encoding='utf-8') as f:
    progress_data = json.load(f)

processed_pairs = set()

for key in progress_data.keys():
    # Keys in progress_smh.json are already pair-like but normalize anyway
    processed_pairs.add(key.strip().lower())

with open(PROGRESS_PAIRS_JSON, 'w', encoding='utf-8') as f:
    json.dump(sorted(list(processed_pairs)), f, ensure_ascii=False, indent=2)

print(f"✅ Processed pairs found: {len(processed_pairs)}")
print(f"💾 Saved processed pairs to: {PROGRESS_PAIRS_JSON}")


# --- STEP 3: Subtract ---
print("➖ Subtracting processed from original to find skipped...")
skipped_pairs = original_pairs - processed_pairs
print(f"🚀 Skipped pair count: {len(skipped_pairs)}")


# --- STEP 4: Convert skipped pairs back to track/artist and save ---
with open(SKIPPED_CSV, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['track', 'artist'])
    for p in sorted(skipped_pairs):
        track, artist = p.split('|', 1)
        writer.writerow([track, artist])

print(f"📝 Skipped songs list saved to: {SKIPPED_CSV}")
print("✅ Phase 2 clean skip list generation complete!")


🔄 Loading original dataset...
📊 Original unique pairs: 90001
💾 Saved original pairs to: original_pairs.csv
🔄 Loading progress file...
✅ Processed pairs found: 78338
💾 Saved processed pairs to: progress_pairs2.json
➖ Subtracting processed from original to find skipped...
🚀 Skipped pair count: 90001
📝 Skipped songs list saved to: skipped_clean2.csv
✅ Phase 2 clean skip list generation complete!
