In [10]:
!pip install dali-dataset
import DALI

Collecting dali-dataset
  Downloading DALI_dataset-1.1-py3-none-any.whl.metadata (751 bytes)
Collecting youtube-dl (from dali-dataset)
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading DALI_dataset-1.1-py3-none-any.whl (15 kB)
Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-dl, dali-dataset
Successfully installed dali-dataset-1.1 youtube-dl-2021.12.17


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
from pathlib import Path

ROOT = Path("/content/drive/MyDrive/DALI_v2.0/annot_tismir")
print("ROOT exists:", ROOT.exists())
print("Top-level items (first 30):")
for p in sorted(ROOT.iterdir())[:30]:
    print(" -", p.name)

ROOT exists: True
Top-level items (first 30):
 - 0003f8140a2d475bb8a1cf960a0c694e.gz
 - 0004fd87ae074354820320acb3816ef9.gz
 - 00070c7c333849e4a3725b906c339042.gz
 - 000b837fdd034df58df3f511f3c49d5e.gz
 - 000cba3f7b854ace8c9bfa162621f581.gz
 - 001560cf57384035b3f841bcf75fcbb9.gz
 - 001651b1987148ea96f964df906186d9.gz
 - 0040a98850534097a4eb6aa024b6e141.gz
 - 00481f42fd4d49d49697cba8e3353c43.gz
 - 004909ba6fcf4a55befec466b290bba8.gz
 - 0054d4d2b1f340a088b9d911bdd31f28.gz
 - 00589ea530a14ff89d5c82c7858d8e09.gz
 - 005fff73788b49a48e8a4769dc6bbdfc.gz
 - 0061f1d7604d43d78d81c9e6372be0b1.gz
 - 00658116343c4502ac99de27aca47192.gz
 - 006ade6dab944b9fbd0362f41cc34566.gz
 - 006b5d1db6a447039c30443310b60c6f.gz
 - 006e98d91bb64c71bb7f1a68890ae358.gz
 - 0070a3bcd0014be19282c286f338a596.gz
 - 0077f871bfb04762b88b28519ca5a9b3.gz
 - 007dea0fa17c441c946ddcbb62344892.gz
 - 0092a10b4f0c402d801a13a4faa30c18.gz
 - 009a7e3ad74e40318fc4e1bb424b831f.gz
 - 00ae0ee86a09429c8aa387c56dbf09d1.gz
 - 00b18defb9864da

In [20]:
!apt-get -qq update
!apt-get -qq install -y ffmpeg
!pip -q install -U yt-dlp librosa soundfile pandas tqdm

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [31]:
from pathlib import Path

# DALI annotations folder (.gz files)
ANNOT_DIR = Path("/content/drive/MyDrive/DALI_v2.0/annot_tismir")  # <-- our path

# Where to save the downloaded 30s wav segments (30-60)
SEG_WAV_DIR = Path("/content/dali_en_wav")
SEG_WAV_DIR.mkdir(parents=True, exist_ok=True)

# Where to save 3s clips
CLIPS_DIR = Path("/content/drive/MyDrive/dali_en_clips_3s")
CLIPS_DIR.mkdir(parents=True, exist_ok=True)

# Output CSVs
OUT_CLIPS_CSV = "/content/drive/MyDrive/dali_english_1500songs_with_lyrics_genre.csv"
OUT_SONGS_CSV = "/content/drive/MyDrive/dali_english_1500songs_songlevel_stats.csv"
OUT_FAIL_CSV  = "/content/drive/MyDrive/dali_english_download_failures.csv"

TARGET_SONGS = 1500
START_SEC = 30.0
END_SEC   = 60.0
TAKE_SEC  = END_SEC - START_SEC
CLIP_SEC  = 3.0
CLIPS_PER_SONG = int(TAKE_SEC / CLIP_SEC)
TARGET_SR = 22050

REQUIRE_GENRE = True

MIN_NONEMPTY_CLIPS_PER_SONG = 3

In [22]:
import gzip, pickle
import pandas as pd
from tqdm.auto import tqdm

gz_files = sorted(ANNOT_DIR.glob("*.gz"))
print("Annotation .gz files:", len(gz_files))

rows = []
for p in tqdm(gz_files, desc="Indexing DALI metadata"):
    dali_id = p.stem
    try:
        with gzip.open(p, "rb") as f:
            ann = pickle.load(f)

        info = ann.info or {}
        md = info.get("metadata", {}) or {}
        audio = info.get("audio", {}) or {}

        lang = (md.get("language") or "").strip().lower()
        genres = md.get("genres") or []
        genres_str = ",".join(genres) if isinstance(genres, list) else str(genres or "")
        genres_str = genres_str.strip()

        url = audio.get("url") or audio.get("youtube") or audio.get("link") or ""
        working = audio.get("working")

        rows.append({
            "dali_id": dali_id,
            "language": lang,
            "genres": genres_str,
            "has_genre": (genres_str != ""),
            "youtube_url": url,
            "working": working
        })
    except Exception as e:
        rows.append({"dali_id": dali_id, "error": str(e)})

df = pd.DataFrame(rows)

df_en = df[(df["language"] == "english") & (df["youtube_url"].astype(str).str.len() > 0)].copy()

if df_en["working"].notna().any():
    df_work = df_en[df_en["working"].astype(str).str.lower().isin(["true","1","yes"])].copy()
    if len(df_work) > 0:
        df_en = df_work

if REQUIRE_GENRE:
    df_en = df_en[df_en["has_genre"]].copy()

df_en = df_en.sample(frac=1.0, random_state=42).reset_index(drop=True)

print("English candidates (with URL):", (df["language"]=="english").sum())
print("English pool used:", len(df_en))
df_en.head()

Annotation .gz files: 7756


Indexing DALI metadata:   0%|          | 0/7756 [00:00<?, ?it/s]

English candidates (with URL): 5913
English pool used: 4451


Unnamed: 0,dali_id,language,genres,has_genre,youtube_url,working
0,2e004e76948c4bc9963c80d9d6467b21,english,"Pop,Pop internationale",True,yQAV7bYQH9Y,False
1,dc0943f1c6da488f81007f5f8adac2a0,english,"Pop,Rock",True,q95ZKqE5dfU,False
2,8b92f9a050e44ad2826b7967a3de45b3,english,"Electro,Dance,Pop,Pop internationale,Variété I...",True,AaaHcb_SIrE,False
3,f212b38426d141b99b8f2d0fd6024aa5,english,Pop,True,Qc5Ezjgo3wM,False
4,4833f3c690d246d287d3eda64290de7c,english,Rock,True,0AvuweztG4Q,False


In [23]:
import os, sys, subprocess

def download_30to60_wav(dali_id: str, url: str, out_dir: Path) -> Path | None:
    """
    Downloads only [30,60] seconds to WAV: out_dir/{dali_id}.wav
    """
    out_tmpl = str(out_dir / f"{dali_id}.%(ext)s")
    section = f"*{int(START_SEC)}-{int(END_SEC)}"

    cmd = [
        sys.executable, "-m", "yt_dlp",
        "--no-playlist",
        "--download-sections", section,
        "--force-keyframes-at-cuts",
        "-x", "--audio-format", "wav", "--audio-quality", "0",
        "-o", out_tmpl,
        url
    ]

    p = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    wav_path = out_dir / f"{dali_id}.wav"
    return wav_path if (p.returncode == 0 and wav_path.exists()) else None

In [24]:
import numpy as np
import librosa
import soundfile as sf

def load_dali_segments(dali_id: str):
    """
    Returns list of (t0, t1, text) using aligned lines; fallback to words.
    """
    p = ANNOT_DIR / f"{dali_id}.gz"
    with gzip.open(p, "rb") as f:
        ann = pickle.load(f)

    annot = None
    try:
        annot = ann.annotations.get("annot", ann.annotations)
    except Exception:
        return []

    if not isinstance(annot, dict):
        return []

    segs = []
    for level in ["lines", "words"]:
        if level in annot:
            for x in annot[level]:
                try:
                    t0, t1 = float(x["time"][0]), float(x["time"][1])
                    txt = (x.get("text") or x.get("word") or "").strip()
                    if txt and t1 >= t0:
                        segs.append((t0, t1, txt))
                except Exception:
                    pass
            if segs:
                break
    return segs

def text_in_window(segs, t0, t1):
    parts = []
    for a, b, txt in segs:
        if b >= t0 and a <= t1:
            parts.append(txt)
    return " ".join(parts).strip()

def split_30s_wav_to_10clips(wav_path: Path, out_song_dir: Path, target_sr=22050):
    """
    Takes the downloaded 30s segment, forces exactly 30s (pad/truncate),
    writes 10 clips of 3s each. Returns clip paths.
    """
    y, sr = librosa.load(str(wav_path), sr=target_sr, mono=True)
    y = y.astype(np.float32)

    full_len = int(TAKE_SEC * target_sr)
    clip_len = int(CLIP_SEC * target_sr)

    if len(y) < full_len:
        y = librosa.util.fix_length(y, size=full_len)
    else:
        y = y[:full_len]

    out_song_dir.mkdir(parents=True, exist_ok=True)
    clip_paths = []
    for i in range(CLIPS_PER_SONG):
        s = i * clip_len
        e = s + clip_len
        clip = y[s:e]
        out_path = out_song_dir / f"clip_{i:02d}.wav"
        sf.write(str(out_path), clip, target_sr, subtype="PCM_16")
        clip_paths.append(out_path)

    return clip_paths

In [32]:
import pandas as pd
from tqdm.auto import tqdm

already_done = set()
if os.path.exists(OUT_CLIPS_CSV):
    try:
        prev = pd.read_csv(OUT_CLIPS_CSV)
        already_done = set(prev["dali_id"].astype(str).unique())
        print("Resume: already completed songs:", len(already_done))
    except Exception:
        pass

clip_rows = []
song_rows = []
fail_rows = []

kept_songs = 0

for _, r in tqdm(df_en.iterrows(), total=len(df_en), desc="Build English dataset (30-60)"):
    if kept_songs >= TARGET_SONGS:
        break

    dali_id = str(r["dali_id"])
    url = r["youtube_url"]
    genres = r["genres"]

    if dali_id in already_done:
        kept_songs += 1
        continue

    segs = []
    try:
        segs = load_dali_segments(dali_id)
    except Exception:
        segs = []

    seg_wav = SEG_WAV_DIR / f"{dali_id}.wav"
    if not seg_wav.exists():
        seg_wav = download_30to60_wav(dali_id, url, SEG_WAV_DIR)

    if seg_wav is None or not Path(seg_wav).exists():
        fail_rows.append({"dali_id": dali_id, "url": url})
        continue

    song_dir = CLIPS_DIR / dali_id
    try:
        clip_paths = split_30s_wav_to_10clips(Path(seg_wav), song_dir, target_sr=TARGET_SR)
    except Exception as e:
        fail_rows.append({"dali_id": dali_id, "url": url, "error": str(e)})
        continue

    nonempty = 0
    for i, cp in enumerate(clip_paths):
        abs_t0 = START_SEC + i * CLIP_SEC
        abs_t1 = abs_t0 + CLIP_SEC

        lyr = text_in_window(segs, abs_t0, abs_t1) if segs else ""
        if lyr:
            nonempty += 1

        clip_rows.append({
            "dali_id": dali_id,
            "clip_idx": i,
            "abs_t0": round(abs_t0, 3),
            "abs_t1": round(abs_t1, 3),
            "rel_t0": round(i * CLIP_SEC, 3),
            "rel_t1": round((i+1) * CLIP_SEC, 3),
            "clip_path": str(cp),
            "language": "english",
            "genres": genres,
            "lyrics_text": lyr,
            "lyrics_len": len(lyr),
        })

    if nonempty < MIN_NONEMPTY_CLIPS_PER_SONG:
        clip_rows = clip_rows[:-CLIPS_PER_SONG]
        continue

    song_rows.append({
        "dali_id": dali_id,
        "genres": genres,
        "nonempty_lyric_clips_in_30to60": nonempty,
        "segment_wav_path": str(seg_wav),
        "clips_dir": str(song_dir)
    })

    kept_songs += 1

df_clips = pd.DataFrame(clip_rows)
df_songs = pd.DataFrame(song_rows)
df_fail  = pd.DataFrame(fail_rows)

if os.path.exists(OUT_CLIPS_CSV):
    try:
        old = pd.read_csv(OUT_CLIPS_CSV)
        df_clips = pd.concat([old, df_clips], ignore_index=True)
    except Exception:
        pass

df_clips.to_csv(OUT_CLIPS_CSV, index=False)
df_songs.to_csv(OUT_SONGS_CSV, index=False)
df_fail.to_csv(OUT_FAIL_CSV, index=False)

print("Saved ->", OUT_CLIPS_CSV)
print("Saved ->", OUT_SONGS_CSV)
print("Saved ->", OUT_FAIL_CSV)

print("\nFinal clip rows:", len(df_clips))
print("Unique songs:", df_clips["dali_id"].nunique())
print("Non-empty lyric clips:", (df_clips["lyrics_len"] > 0).sum(), "/", len(df_clips))
print("Failures:", len(df_fail))

Build English dataset (30-60):   0%|          | 0/4451 [00:00<?, ?it/s]

Saved -> /content/drive/MyDrive/dali_english_1500songs_with_lyrics_genre.csv
Saved -> /content/drive/MyDrive/dali_english_1500songs_songlevel_stats.csv
Saved -> /content/drive/MyDrive/dali_english_download_failures.csv

Final clip rows: 15000
Unique songs: 1500
Non-empty lyric clips: 14102 / 15000
Failures: 0


In [5]:
import kagglehub
from pathlib import Path

banglabeats_root = Path(kagglehub.dataset_download("thisisjibon/banglabeats3sec"))

print("BanglaBeats root:", banglabeats_root)

Using Colab cache for faster access to the 'banglabeats3sec' dataset.
BanglaBeats root: /kaggle/input/banglabeats3sec


In [2]:
!apt-get -qq update
!apt-get -qq install -y ffmpeg
!pip -q install -U faster-whisper librosa soundfile pandas tqdm

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m157.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m72.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m137.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.0 MB/s[0

In [13]:
from pathlib import Path

BANGLA_ROOT = Path("/kaggle/input/banglabeats3sec/wavs3sec")

OUT_30S_DIR = Path("/content/banglabeats_30s")
OUT_30S_DIR.mkdir(parents=True, exist_ok=True)

OUT_CSV = "/content/drive/MyDrive/banglabeats_1500songs_with_lyrics_genre.csv"
OUT_SONGS_CSV = "/content/drive/MyDrive/banglabeats_1500songs_songlevel_transcripts.csv"

TARGET_SONGS = 1500
SR = 22050
START_SEC = 0.0
END_SEC   = 30.0
TAKE_SEC  = END_SEC - START_SEC
CLIP_SEC  = 3.0
CLIPS_PER_SONG = int(TAKE_SEC / CLIP_SEC)
LANGUAGE_LABEL = "bengali"

In [14]:
import random
from collections import defaultdict

AUDIO_EXTS = {".wav", ".mp3", ".flac", ".ogg", ".m4a", ".aac", ".au"}

def numeric_stem(p: Path):
    s = p.stem.strip()
    return int(s) if s.isdigit() else None

all_groups = []

genre_dirs = [d for d in BANGLA_ROOT.iterdir() if d.is_dir()]
print("Genre folders:", len(genre_dirs))

for gdir in sorted(genre_dirs):
    files = [p for p in gdir.iterdir() if p.is_file() and p.suffix.lower() in AUDIO_EXTS]
    nums = []
    for p in files:
        n = numeric_stem(p)
        if n is not None:
            nums.append((n, p))
    if not nums:
        continue

    groups = defaultdict(dict)
    for n, p in nums:
        gid = (n - 1) // CLIPS_PER_SONG
        idx = (n - 1) % CLIPS_PER_SONG
        groups[gid][idx] = p

    for gid, d in groups.items():
        if len(d) == CLIPS_PER_SONG:
            segs = [d[i] for i in range(CLIPS_PER_SONG)]
            song_id = f"{gdir.name}__{gid:06d}"
            all_groups.append({"song_id": song_id, "genre": gdir.name, "gid": gid, "seg_paths": segs})

print("Complete 30s groups found:", len(all_groups))

if len(all_groups) < TARGET_SONGS:
    raise ValueError(f"Not enough complete groups. Need {TARGET_SONGS}, found {len(all_groups)}")

random.seed(42)
selected_groups = random.sample(all_groups, TARGET_SONGS)
print("Selected groups:", len(selected_groups))

Genre folders: 8
Complete 30s groups found: 1617
Selected groups: 1500


In [15]:
import os
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from tqdm.auto import tqdm
from faster_whisper import WhisperModel

device = "cuda" if os.path.exists("/proc/driver/nvidia/version") else "cpu"
compute_type = "float16" if device == "cuda" else "int8"
print("ASR device:", device, "compute_type:", compute_type)

model = WhisperModel("small", device=device, compute_type=compute_type)

def load_exact_3s(path: Path, sr=22050):
    y, _ = librosa.load(str(path), sr=sr, mono=True)
    y = y.astype(np.float32)
    need = int(CLIP_SEC * sr)
    if len(y) < need:
        y = librosa.util.fix_length(y, size=need)
    else:
        y = y[:need]
    return y

def reconstruct_30s_wav(seg_paths, out_path: Path, sr=22050):
    clips = [load_exact_3s(p, sr=sr) for p in seg_paths]
    y30 = np.concatenate(clips, axis=0)
    sf.write(str(out_path), y30, sr, subtype="PCM_16")
    return out_path

def transcribe_segments(wav_path: Path, language="bn"):
    segments, info = model.transcribe(
        str(wav_path),
        language=language,
        beam_size=5,
        vad_filter=True
    )
    out = []
    for s in segments:
        txt = (s.text or "").strip()
        if txt:
            out.append((float(s.start), float(s.end), txt))
    return out

def text_in_window(segs, t0, t1):
    parts = []
    for a, b, txt in segs:
        if b >= t0 and a <= t1:
            parts.append(txt)
    return " ".join(parts).strip()

ASR device: cuda compute_type: float16


In [17]:
already_done = set()
if Path(OUT_CSV).exists():
    prev = pd.read_csv(OUT_CSV)
    already_done = set(prev["song_id"].astype(str).unique())
    print("Resume: already completed songs:", len(already_done))

clip_rows = []
song_rows = []

kept = 0
for item in tqdm(selected_groups, desc="BanglaBeats: reconstruct 30s -> ASR -> assign"):
    song_id = item["song_id"]
    genre = item["genre"]
    seg_paths = item["seg_paths"]

    if song_id in already_done:
        kept += 1
        continue

    wav30_path = OUT_30S_DIR / f"{song_id}.wav"
    if not wav30_path.exists():
        reconstruct_30s_wav(seg_paths, wav30_path, sr=SR)

    segs = transcribe_segments(wav30_path, language="bn")
    full_text = " ".join([t for _, _, t in segs]).strip()

    song_rows.append({
        "song_id": song_id,
        "genres": genre,
        "wav30_path": str(wav30_path),
        "transcript_text": full_text,
        "transcript_len": len(full_text),
        "num_asr_segments": len(segs)
    })

    for clip_idx, clip_path in enumerate(seg_paths):
        abs_t0 = START_SEC + clip_idx * CLIP_SEC
        abs_t1 = abs_t0 + CLIP_SEC

        lyr = text_in_window(segs, abs_t0, abs_t1) if segs else ""
        clip_rows.append({
            "song_id": song_id,
            "clip_idx": clip_idx,
            "abs_t0": round(abs_t0, 3),
            "abs_t1": round(abs_t1, 3),
            "rel_t0": round(clip_idx * CLIP_SEC, 3),
            "rel_t1": round((clip_idx + 1) * CLIP_SEC, 3),
            "clip_path": str(clip_path),
            "language": LANGUAGE_LABEL,
            "genres": genre,
            "lyrics_text": lyr,
            "lyrics_len": len(lyr)
        })

    kept += 1

df_new_clips = pd.DataFrame(clip_rows)
df_new_songs = pd.DataFrame(song_rows)

if Path(OUT_CSV).exists():
    old = pd.read_csv(OUT_CSV)
    df_new_clips = pd.concat([old, df_new_clips], ignore_index=True)

df_new_clips.to_csv(OUT_CSV, index=False)
df_new_songs.to_csv(OUT_SONGS_CSV, index=False)

print("Saved ->", OUT_CSV)
print("Saved ->", OUT_SONGS_CSV)
print("Final clip rows:", len(df_new_clips))
print("Unique songs:", df_new_clips['song_id'].nunique())
print("Non-empty lyric clips:", (df_new_clips["lyrics_len"] > 0).sum(), "/", len(df_new_clips))

Saved -> /content/drive/MyDrive/banglabeats_1500songs_with_lyrics_genre.csv
Saved -> /content/drive/MyDrive/banglabeats_1500songs_songlevel_transcripts.csv
Final clip rows: 15000
Unique songs: 1500
Non-empty lyric clips: 4331 / 15000


In [18]:
DALI_CSV = "/content/drive/MyDrive/dali_english_1500songs_with_lyrics_genre.csv"
BN_CSV   = "/content/drive/MyDrive/banglabeats_1500songs_with_lyrics_genre.csv"

OUT_DIR = "/content/drive/MyDrive/hybrid_dataset"
OUT_DIR

'/content/drive/MyDrive/hybrid_dataset'

In [19]:
from pathlib import Path
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

OUT_MANIFEST_CSV = str(Path(OUT_DIR) / "hybrid_en_bn_3sec_30000_manifest.csv")
OUT_MANIFEST_PARQUET = str(Path(OUT_DIR) / "hybrid_en_bn_3sec_30000_manifest.parquet")

In [22]:
import pandas as pd

df_dali = pd.read_csv(DALI_CSV)
df_bn   = pd.read_csv(BN_CSV)

print("DALI:", df_dali.shape)
print("Bangla:", df_bn.shape)
df_dali.head(), df_bn.head()

DALI: (15000, 11)
Bangla: (15000, 11)


(                            dali_id  clip_idx  abs_t0  abs_t1  rel_t0  rel_t1  \
 0  2e004e76948c4bc9963c80d9d6467b21         0    30.0    33.0     0.0     3.0   
 1  2e004e76948c4bc9963c80d9d6467b21         1    33.0    36.0     3.0     6.0   
 2  2e004e76948c4bc9963c80d9d6467b21         2    36.0    39.0     6.0     9.0   
 3  2e004e76948c4bc9963c80d9d6467b21         3    39.0    42.0     9.0    12.0   
 4  2e004e76948c4bc9963c80d9d6467b21         4    42.0    45.0    12.0    15.0   
 
                                            clip_path language  \
 0  /content/drive/MyDrive/dali_en_clips_3s/c3d765...  english   
 1  /content/drive/MyDrive/dali_en_clips_3s/c3d765...  english   
 2  /content/drive/MyDrive/dali_en_clips_3s/c3d765...  english   
 3  /content/drive/MyDrive/dali_en_clips_3s/c3d765...  english   
 4  /content/drive/MyDrive/dali_en_clips_3s/c3d765...  english   
 
                    genres                                        lyrics_text  \
 0  Pop,Pop internationale 

In [23]:
import numpy as np

# ---- Standardize DALI ----
dali = df_dali.copy()

# Make a shared song_id column
if "song_id" not in dali.columns:
    if "dali_id" in dali.columns:
        dali["song_id"] = dali["dali_id"].astype(str)
    else:
        raise ValueError("DALI CSV must contain either song_id or dali_id.")

# Ensure required columns exist
for col in ["clip_idx","abs_t0","abs_t1","rel_t0","rel_t1","clip_path","genres","lyrics_text","lyrics_len","language"]:
    if col not in dali.columns:
        # Fill missing with sensible defaults
        if col in ["clip_idx"]:
            dali[col] = 0
        elif col in ["abs_t0","abs_t1","rel_t0","rel_t1","lyrics_len"]:
            dali[col] = 0.0
        else:
            dali[col] = ""

dali["source_dataset"] = "dali"
dali["language"] = "english"
dali["genres"] = dali["genres"].fillna("").astype(str)
dali["lyrics_text"] = dali["lyrics_text"].fillna("").astype(str)
dali["lyrics_len"] = dali["lyrics_len"].fillna(0).astype(int)
dali["clip_idx"] = dali["clip_idx"].astype(int)

# ---- Standardize BanglaBeats ----
bn = df_bn.copy()

# Bangla file uses song_id already (from grouping). If not, derive from parent folder.
if "song_id" not in bn.columns:
    bn["song_id"] = bn["clip_path"].astype(str).apply(lambda p: Path(p).parent.name)

for col in ["clip_idx","abs_t0","abs_t1","rel_t0","rel_t1","clip_path","genres","lyrics_text","lyrics_len","language"]:
    if col not in bn.columns:
        if col in ["clip_idx"]:
            bn[col] = 0
        elif col in ["abs_t0","abs_t1","rel_t0","rel_t1","lyrics_len"]:
            bn[col] = 0.0
        else:
            bn[col] = ""

bn["source_dataset"] = "banglabeats"
bn["language"] = "bengali"
bn["genres"] = bn["genres"].fillna("").astype(str)
bn["lyrics_text"] = bn["lyrics_text"].fillna("").astype(str)
bn["lyrics_len"] = bn["lyrics_len"].fillna(0).astype(int)
bn["clip_idx"] = bn["clip_idx"].astype(int)

# ---- Select common columns ----
cols = [
    "song_id","clip_idx",
    "abs_t0","abs_t1","rel_t0","rel_t1",
    "clip_path","language","source_dataset","genres",
    "lyrics_text","lyrics_len"
]
dali = dali[cols].copy()
bn   = bn[cols].copy()

# ---- Balance (15k each, but robust if counts differ) ----
SEED = 42
target_per_lang = 15000
n = min(target_per_lang, len(dali), len(bn))

dali = dali.sample(frac=1.0, random_state=SEED).head(n).reset_index(drop=True)
bn   = bn.sample(frac=1.0, random_state=SEED).head(n).reset_index(drop=True)

# ---- Add IDs and combine ----
def make_clip_uid(df):
    return (
        df["source_dataset"].astype(str) + "::" +
        df["song_id"].astype(str) + "::" +
        df["clip_idx"].astype(int).astype(str)
    )

dali["clip_uid"] = make_clip_uid(dali)
bn["clip_uid"]   = make_clip_uid(bn)

df_all = pd.concat([dali, bn], ignore_index=True)
df_all = df_all.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

df_all["language_id"] = df_all["language"].map({"english": 0, "bengali": 1}).astype(int)
df_all["has_lyrics"] = df_all["lyrics_len"] > 0

# Extra lightweight text features (good for EDA)
df_all["word_count"] = df_all["lyrics_text"].astype(str).str.split().apply(len)
df_all["char_count"] = df_all["lyrics_text"].astype(str).apply(len)

print("Unified:", df_all.shape)
print(df_all["language"].value_counts())
df_all.head()

Unified: (30000, 17)
language
english    15000
bengali    15000
Name: count, dtype: int64


Unnamed: 0,song_id,clip_idx,abs_t0,abs_t1,rel_t0,rel_t1,clip_path,language,source_dataset,genres,lyrics_text,lyrics_len,clip_uid,language_id,has_lyrics,word_count,char_count
0,8247bc1d70ba4c4a9a9ae8cad93480ff,6,48.0,51.0,18.0,21.0,/content/drive/MyDrive/dali_en_clips_3s/c3d765...,english,dali,Pop,i've searched for the perfect love all my life,46,dali::8247bc1d70ba4c4a9a9ae8cad93480ff::6,0,True,9,46
1,Adhunik__000167,2,6.0,9.0,6.0,9.0,/kaggle/input/banglabeats3sec/wavs3sec/Adhunik...,bengali,banglabeats,Adhunik,,0,banglabeats::Adhunik__000167::2,1,False,0,0
2,Rock__000184,1,3.0,6.0,3.0,6.0,/kaggle/input/banglabeats3sec/wavs3sec/Rock/18...,bengali,banglabeats,Rock,,0,banglabeats::Rock__000184::1,1,False,0,0
3,Folk__000014,2,6.0,9.0,6.0,9.0,/kaggle/input/banglabeats3sec/wavs3sec/Folk/14...,bengali,banglabeats,Folk,,0,banglabeats::Folk__000014::2,1,False,0,0
4,743189d4a1cc4dadbb7312d3b17defaf,3,39.0,42.0,9.0,12.0,/content/drive/MyDrive/dali_en_clips_3s/c3d765...,english,dali,"Jazz,Pop",and children listen to hear sleigh bells in th...,53,dali::743189d4a1cc4dadbb7312d3b17defaf::3,0,True,10,53


In [24]:
# Save unified dataset
df_all.to_csv(OUT_MANIFEST_CSV, index=False)
print("Saved ->", OUT_MANIFEST_CSV)

try:
    df_all.to_parquet(OUT_MANIFEST_PARQUET, index=False)
    print("Saved ->", OUT_MANIFEST_PARQUET)
except Exception as e:
    print("Parquet save skipped:", e)

Saved -> /content/drive/MyDrive/hybrid_dataset/hybrid_en_bn_3sec_30000_manifest.csv
Saved -> /content/drive/MyDrive/hybrid_dataset/hybrid_en_bn_3sec_30000_manifest.parquet
