In [3]:
import requests, json, textwrap, os, time

TEST_Q = {
    "term": "Daft Punk Get Lucky",
    "media": "music",
    "entity": "song",
    "limit": 1,
    "country": "US",
}
UA = "Mozilla/5.0 (ConnectivitySmokeTest/1.0)"

try:
    r = requests.get("https://itunes.apple.com/search", params=TEST_Q, headers={"User-Agent": UA}, timeout=10)
    print("HTTP:", r.status_code)
    print("URL :", r.url)
    js = r.json()
    print("resultCount:", js.get("resultCount"))
    if js.get("results"):
        item = js["results"][0]
        print("artistName:", item.get("artistName"))
        print("trackName :", item.get("trackName"))
        print("previewUrl:", item.get("previewUrl"))
    else:
        print("No results payload:", js)
except Exception as e:
    print("❌ Exception:", repr(e))

HTTP: 403
URL : https://itunes.apple.com/search?term=Daft+Punk+Get+Lucky&media=music&entity=song&limit=1&country=US
❌ Exception: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')


In [4]:
# JUPYTER NOTEBOOK CELL — FAST downloader using Deezer previews (no auth), verbose + resume

import os, re, csv, time, sys, random, math
import requests
import pandas as pd
from pathlib import Path
from rapidfuzz import fuzz
from tqdm.auto import tqdm

# ============== CONFIG ==============
DATA_DIR         = "../../data"
RAW_FILE         = "SpotifyFeatures.csv"

OUT_DIR          = "../input"
OUT_CSV_NAME     = "songs_dataset_6k.csv"
WORK_DIR         = "../work"
AUDIO_DIR        = "../output/previews"

TARGET_TOTAL     = 6000                 # previews to collect
QPS              = 10.0                 # throttle (requests/sec)
RETRIES          = 1                    # quick retry
HTTP_TIMEOUT     = 8                    # per request timeout
USER_AGENT       = "Mozilla/5.0 (FastDeezerSampler/1.0)"
RANDOM_SEED      = 42
LOG_EVERY_N_OK   = 50
MIN_SCORE_ACCEPT = 66.0                 # fuzzy match threshold (artist+title)
RESULTS_LIMIT    = 5                    # Deezer top-N to scan

# ============== HELPERS ==============
def clean_text(s: str) -> str:
    s = (s or "")
    s = s.replace("’","'").replace("‘","'").replace("“",'"').replace("”",'"')
    s = re.sub(r"\(.*?\)|\[.*?\]", " ", s)
    s = re.sub(r"\s+(feat|ft)\.?\s+.*$", " ", s, flags=re.IGNORECASE)
    s = re.sub(r"\s+-\s*(radio|single|album|remaster|edit).*$", " ", s, flags=re.IGNORECASE)
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

def score_candidate(q_artist: str, q_track: str, cand_artist: str, cand_title: str) -> float:
    # weight track slightly higher than artist
    sa = fuzz.token_set_ratio(q_artist, clean_text(cand_artist))
    st = fuzz.token_set_ratio(q_track,  clean_text(cand_title))
    return 0.4 * sa + 0.6 * st

def deezer_search_preview(artist: str, track: str, *, session: requests.Session):
    """
    Use Deezer search to find a 30s MP3 preview.
    Query both (artist+track) and (track only) quickly and pick best-scoring.
    Returns: (preview_url, best_score, matched_artist, matched_title) or (None, 0, None, None)
    """
    qA, qT = clean_text(artist), clean_text(track)
    best = (None, 0.0, None, None)

    # Pass 1: artist+track
    q = f'artist:"{artist}" track:"{track}"'
    params = {"q": q, "limit": RESULTS_LIMIT}
    try:
        r = session.get("https://api.deezer.com/search", params=params, timeout=HTTP_TIMEOUT)
        js = r.json()
        for item in js.get("data", []):
            cand_artist = item.get("artist", {}).get("name", "")
            cand_title  = item.get("title", "")
            sc = score_candidate(qA, qT, cand_artist, cand_title)
            if sc > best[1] and item.get("preview"):
                best = (item["preview"], sc, cand_artist, cand_title)
                if best[1] >= MIN_SCORE_ACCEPT:
                    return best
    except Exception:
        pass

    # Pass 2: track-only
    q = f'track:"{track}"'
    params = {"q": q, "limit": RESULTS_LIMIT}
    try:
        r = session.get("https://api.deezer.com/search", params=params, timeout=HTTP_TIMEOUT)
        js = r.json()
        for item in js.get("data", []):
            cand_artist = item.get("artist", {}).get("name", "")
            cand_title  = item.get("title", "")
            sc = score_candidate(qA, qT, cand_artist, cand_title)
            if sc > best[1] and item.get("preview"):
                best = (item["preview"], sc, cand_artist, cand_title)
                if best[1] >= MIN_SCORE_ACCEPT:
                    return best
    except Exception:
        pass

    return best if best[0] else (None, 0.0, None, None)

def slugify(s: str) -> str:
    return re.sub(r'[^a-zA-Z0-9]+', '_', (s or "").lower()).strip('_')[:60]

def download(url: str, dst: Path, sess: requests.Session):
    try:
        with sess.get(url, stream=True, timeout=30) as resp:
            if resp.status_code != 200:
                return False, resp.status_code, 0
            total = 0
            with open(dst, "wb") as f:
                for ch in resp.iter_content(64*1024):
                    if ch:
                        f.write(ch); total += len(ch)
        return True, 200, total
    except Exception:
        return False, 0, 0

# ============== SETUP ==============
data_dir  = Path(DATA_DIR); raw_path = data_dir / RAW_FILE
out_dir   = Path(OUT_DIR);  work_dir = Path(WORK_DIR)
audio_dir = Path(AUDIO_DIR); out_csv = out_dir / OUT_CSV_NAME

out_dir.mkdir(parents=True, exist_ok=True)
work_dir.mkdir(parents=True, exist_ok=True)
audio_dir.mkdir(parents=True, exist_ok=True)

print("Loading:", raw_path)
df = pd.read_csv(raw_path)
if not {"artist_name","track_name"}.issubset(df.columns):
    raise ValueError("CSV must contain 'artist_name' and 'track_name' columns")

# Randomize order (we no longer enforce per-genre quotas)
df = df.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)

# Manifest
manifest_path = work_dir / "deezer_manifest_verbose.csv"
fields = ["row_idx","artist","track","matched_artist","matched_title","score",
          "preview_url","audio_file","status","http_status","bytes","notes"]
mf = open(manifest_path, "w", newline="", encoding="utf-8")
mw = csv.DictWriter(mf, fieldnames=fields); mw.writeheader()

# Resume
existing = {p.name for p in audio_dir.glob("*.mp3")}
selected = []
ok = 0
start = time.time()
sleep = 1.0 / max(0.1, QPS)

# If an earlier CSV exists, load to continue
if out_csv.exists():
    try:
        prev = pd.read_csv(out_csv)
        if {"audio_file"}.issubset(prev.columns):
            already = {str(x) for x in prev["audio_file"].dropna().astype(str)}
            # Only count those that exist on disk
            ok_files = already & existing
            if ok_files:
                keep_rows = prev[prev["audio_file"].isin(list(ok_files))].to_dict("records")
                selected.extend(keep_rows)
                ok = len(keep_rows)
                print(f"Resuming: {ok} rows already on disk & in CSV")
    except Exception:
        pass

try:
    pbar = tqdm(total=TARGET_TOTAL, unit="trk", desc="Downloading previews (Deezer)", leave=True)
    pbar.update(min(ok, TARGET_TOTAL))

    with requests.Session() as sess:
        sess.headers.update({"User-Agent": USER_AGENT, "Accept": "application/json"})

        for i, row in df.iterrows():
            if ok >= TARGET_TOTAL:
                break

            artist = str(row["artist_name"]).strip()
            track  = str(row["track_name"]).strip()
            if not artist or not track:
                mw.writerow({"row_idx": i, "artist": artist, "track": track,
                             "matched_artist":"", "matched_title":"", "score":"",
                             "preview_url":"", "audio_file":"", "status":"skip_missing",
                             "http_status":"", "bytes":"", "notes":""})
                continue

            base   = f"{slugify(artist)}_{slugify(track)}_{i}"
            fname  = f"{base}.mp3"
            fpath  = audio_dir / fname

            # Skip existing file (resume)
            if fname in existing and fpath.exists() and fpath.stat().st_size > 0:
                out_row = row.to_dict()
                out_row["preview_url"] = ""
                out_row["audio_file"]  = fname
                selected.append(out_row)
                ok += 1
                pbar.update(1)
                if ok % LOG_EVERY_N_OK == 0:
                    elapsed = time.time() - start
                    rate = ok / max(1e-6, elapsed)
                    tqdm.write(f"[resume] ok={ok}/{TARGET_TOTAL}  rate={rate:.2f} trk/s  elapsed={elapsed/60:.1f}m")
                continue

            # Lookup on Deezer
            preview_url, score, m_artist, m_title = None, 0.0, None, None
            for _ in range(RETRIES + 1):
                preview_url, score, m_artist, m_title = deezer_search_preview(
                    artist, track, session=sess
                )
                if preview_url:
                    break
                time.sleep(0.2)

            if not preview_url:
                mw.writerow({"row_idx": i, "artist": artist, "track": track,
                             "matched_artist":"", "matched_title":"", "score": round(score,1),
                             "preview_url":"", "audio_file": fname, "status":"not_found",
                             "http_status":"", "bytes":"", "notes":""})
                tqdm.write(f"[miss] {i}  {artist} — {track}  (score={score:.1f})")
                time.sleep(sleep)
                continue

            # Download MP3 preview
            ok_dl, code, nbytes = download(preview_url, fpath, sess)
            if ok_dl:
                out_row = row.to_dict()
                out_row["preview_url"] = preview_url
                out_row["audio_file"]  = fname
                selected.append(out_row)
                mw.writerow({"row_idx": i, "artist": artist, "track": track,
                             "matched_artist": m_artist or "", "matched_title": m_title or "",
                             "score": round(score,1), "preview_url": preview_url, "audio_file": fname,
                             "status":"downloaded", "http_status": code, "bytes": nbytes, "notes":""})
                ok += 1
                pbar.update(1)
                if ok % LOG_EVERY_N_OK == 0:
                    elapsed = time.time() - start
                    rate = ok / max(1e-6, elapsed)
                    tqdm.write(f"[ok] {ok}/{TARGET_TOTAL}  {artist} — {track}  bytes={nbytes}  rate={rate:.2f} trk/s")
            else:
                mw.writerow({"row_idx": i, "artist": artist, "track": track,
                             "matched_artist": m_artist or "", "matched_title": m_title or "",
                             "score": round(score,1), "preview_url": preview_url, "audio_file": fname,
                             "status":"download_failed", "http_status": code, "bytes": nbytes, "notes":""})
                tqdm.write(f"[dl-fail] {i}  {artist} — {track}  http={code}")

            time.sleep(1.0 / max(0.1, QPS))

    pbar.close()
finally:
    mf.close()

# Save final CSV
out_df = pd.DataFrame(selected)
if len(out_df) > TARGET_TOTAL:
    out_df = out_df.sample(n=TARGET_TOTAL, random_state=4242).reset_index(drop=True)
out_dir.mkdir(parents=True, exist_ok=True)
out_df.to_csv(out_csv, index=False)

elapsed = time.time() - start
rate = (len(out_df) / max(1e-6, elapsed))
print("\n======== Summary ========")
print(f"Target total: {TARGET_TOTAL}")
print(f"Selected rows (downloaded/resumed): {len(out_df)}")
print(f"Throughput: {rate:.2f} tracks/sec  |  Elapsed: {elapsed/60:.1f} min")
print(f"✅ Final CSV: {out_csv}")
print(f"🎵 Audio dir: {AUDIO_DIR}")
print(f"📝 Manifest:  {manifest_path}")

Loading: ../../data/SpotifyFeatures.csv


Downloading previews (Deezer):   0%|          | 0/6000 [00:00<?, ?trk/s]

[miss] 23  Randy Edelman — A Steep But Beautiful Climb  (score=0.0)
[ok] 50/6000  Mike Baldwin — Vodka & Cigarettes  bytes=479827  rate=0.82 trk/s
[miss] 57  Kami-sama, I have noticed — 破滅のオレンジ  (score=0.0)
[miss] 63  Chief Keef — I Just Wanna (feat. Mac Miller)  (score=0.0)
[ok] 100/6000  Moby — Sunday (B-Sides)  bytes=479827  rate=0.82 trk/s
[miss] 137  Wolfgang Amadeus Mozart — Requiem in D Minor, K. 626: 1. Introitus: Requiem - Live  (score=0.0)
[miss] 143  James Horner — Iris: Pt. 2 - Instrumental  (score=0.0)
[ok] 150/6000  Cults — Go Outside  bytes=479827  rate=0.82 trk/s
[miss] 156  Chuck Loeb — Love Is All (feat. Kim Waters)  (score=0.0)
[miss] 158  Jim Norton — Clean It off, B---  (score=0.0)
[miss] 203  Hillsong Worship — Thank You Jesus - Alternate Version/Live  (score=0.0)
[ok] 200/6000  NEEDTOBREATHE — Forever On Your Side (with JOHNNYSWIM)  bytes=479827  rate=0.82 trk/s
[ok] 250/6000  Too $hort — Blow the Whistle  bytes=479827  rate=0.84 trk/s
[miss] 261  Trentemøller — 