# Jamendo vs Song Describer: Duration, Genres, Text Lengths

Metadata-only comparison of the JamendoMaxCaps and Song Describer datasets.

In [4]:
import json
from collections import Counter

import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download, list_repo_files


In [6]:
jamendo_repo = "amaai-lab/JamendoMaxCaps"
song_repo = "renumics/song-describer-dataset"

jamendo_records = []
for filename in list_repo_files(jamendo_repo, repo_type="dataset"):
    if not filename.endswith(".jsonl") or not filename[0].isdigit():
        continue
    local_path = hf_hub_download(jamendo_repo, filename=filename, repo_type="dataset")
    with open(local_path, "r", encoding="utf-8") as handle:
        for line in handle:
            if len(jamendo_records) >= 5000:
                break
            item = json.loads(line)
            duration = item.get("duration")
            genres = item.get("musicinfo", {}).get("tags", {}).get("genres", []) or []
            jamendo_records.append({
                "track_id": str(item.get("id")),
                "duration": float(duration) if isinstance(duration, (int, float)) else np.nan,
                "genres": [g for g in genres if isinstance(g, str)],
            })
    if len(jamendo_records) >= 5000:
        break

jamendo_df = pd.DataFrame(jamendo_records)
jamendo_df


Unnamed: 0,track_id,duration,genres
0,121251,254.0,[]
1,120968,219.0,[]
2,121551,178.0,[]
3,121885,216.0,[]
4,117561,281.0,"[rock, electronic, drumnbass]"
...,...,...,...
4995,129234,115.0,[]
4996,129542,266.0,"[indie, shoegaze, rock]"
4997,129584,366.0,"[electronic, breakcore]"
4998,129270,289.0,[]


In [7]:
caption_index = {}
caption_path = hf_hub_download(jamendo_repo, filename="final_caption30sec.jsonl", repo_type="dataset")
with open(caption_path, "r", encoding="utf-8") as handle:
    for line in handle:
        item = json.loads(line)
        track_id = str(item.get("id"))
        text = item.get("caption")
        if track_id and isinstance(text, str):
            caption_index[track_id] = text.strip()

jamendo_df["caption"] = jamendo_df["track_id"].map(caption_index).fillna("")
jamendo_df["caption_chars"] = jamendo_df["caption"].apply(len)
jamendo_df["caption_words"] = jamendo_df["caption"].apply(lambda text: len(text.split()) if text else 0)
jamendo_df[["track_id", "duration", "caption_chars"]].head()


Unnamed: 0,track_id,duration,caption_chars
0,121251,254.0,489
1,120968,219.0,257
2,121551,178.0,484
3,121885,216.0,363
4,117561,281.0,388


In [8]:
song_dataset = load_dataset(song_repo, split="train")
limit = min(len(song_dataset), 5000)
song_slice = song_dataset.select(range(limit))
song_df = song_slice.to_pandas()[['caption', 'duration']].copy()
song_df['caption'] = song_df['caption'].astype(str)
song_df['caption_chars'] = song_df['caption'].apply(len)
song_df['caption_words'] = song_df['caption'].apply(lambda text: len(text.split()) if text else 0)
song_df.head()


Unnamed: 0,caption,duration,caption_chars,caption_words
0,Electronic music that has a constant melody th...,202.5,162,25
1,acoustic guitar solo track with consistent rhy...,140.2,114,16
2,Upbeat fast tempo with a blues rock feel that ...,160.1,59,12
3,A classic chord progression used in a playful ...,160.1,78,15
4,Uplifting English rock and roll song with a po...,160.1,61,10


In [9]:
jamendo_genre_counts = Counter()
for values in jamendo_df["genres"]:
    for genre in values:
        cleaned = genre.strip().lower()
        if cleaned:
            jamendo_genre_counts[cleaned] += 1

jamendo_genre_vocab = set(jamendo_genre_counts.keys())
jamendo_genre_counts.most_common(10)


[('electronic', 1085),
 ('rock', 577),
 ('ambient', 489),
 ('techno', 229),
 ('pop', 164),
 ('filmscore', 148),
 ('experimental', 123),
 ('downtempo', 118),
 ('chillout', 117),
 ('dance', 106)]

In [10]:
song_genre_counts = Counter()
for caption in song_df["caption"]:
    tokens = set(token.strip(",!?()\"'").lower() for token in caption.split())
    for genre in jamendo_genre_vocab:
        if genre in tokens:
            song_genre_counts[genre] += 1

song_genre_counts.most_common(10)


[('rock', 98),
 ('pop', 79),
 ('electronic', 77),
 ('folk', 26),
 ('jazz', 24),
 ('classical', 24),
 ('dance', 23),
 ('indie', 23),
 ('ambient', 21),
 ('intro', 19)]

In [12]:
jamendo_duration_series = jamendo_df["duration"].dropna()
song_duration_series = pd.Series([value for value in song_df["duration"] if isinstance(value, (int, float))])

duration_summary = pd.DataFrame({
    "Jamendo": [
        jamendo_duration_series.count(),
        jamendo_duration_series.mean(),
        jamendo_duration_series.median(),
        jamendo_duration_series.std(),
        jamendo_duration_series.min(),
        jamendo_duration_series.max(),
    ],
    "Song Describer": [
        song_duration_series.count(),
        song_duration_series.mean(),
        song_duration_series.median(),
        song_duration_series.std(),
        song_duration_series.min(),
        song_duration_series.max(),
    ],
}, index=["count", "mean_sec", "median_sec", "std_sec", "min_sec", "max_sec"])

duration_summary


Unnamed: 0,Jamendo,Song Describer
count,5000.0,746.0
mean_sec,261.148,222.401743
median_sec,227.0,216.0
std_sec,218.692799,74.733716
min_sec,5.0,34.0
max_sec,4037.0,666.0


In [13]:
jamendo_text_summary = pd.Series({
    "count": jamendo_df["caption_chars"].count(),
    "mean_chars": jamendo_df["caption_chars"].mean(),
    "median_chars": jamendo_df["caption_chars"].median(),
    "mean_words": jamendo_df["caption_words"].mean(),
    "median_words": jamendo_df["caption_words"].median(),
})

song_text_summary = pd.Series({
    "count": song_df["caption_chars"].count(),
    "mean_chars": song_df["caption_chars"].mean(),
    "median_chars": song_df["caption_chars"].median(),
    "mean_words": song_df["caption_words"].mean(),
    "median_words": song_df["caption_words"].median(),
})

pd.concat([jamendo_text_summary.rename("Jamendo"), song_text_summary.rename("Song Describer")], axis=1)


Unnamed: 0,Jamendo,Song Describer
count,5000.0,746.0
mean_chars,359.7088,100.174263
median_chars,353.0,93.0
mean_words,61.3964,16.821716
median_words,61.0,15.0


In [14]:
jamendo_genre_table = pd.Series(jamendo_genre_counts).sort_values(ascending=False).head(20)
song_genre_table = pd.Series(song_genre_counts).sort_values(ascending=False).head(20)

pd.concat([
    jamendo_genre_table.rename("Jamendo"),
    song_genre_table.rename("Song Describer"),
], axis=1).fillna(0).astype(int)


Unnamed: 0,Jamendo,Song Describer
electronic,1085,77
rock,577,98
ambient,489,21
techno,229,7
pop,164,79
filmscore,148,0
experimental,123,9
downtempo,118,0
chillout,117,0
dance,106,23
