In [2]:
# precompute_songs_data.ipynb

import pandas as pd
import numpy as np
import json
from tqdm import tqdm

# ---- Step 0: Config ----
# Input and output locations, given relative to the current working directory.
INPUT_CSV = "../../data-pipeline/input/songs_dataset_6k.csv"
OUTPUT_JSON = "../../data/songs_data.json"

# ---- Step 1: Load CSV ----
# Read the whole CSV into a DataFrame, then echo its dimensions and the
# first few rows as a quick sanity check.
print("Loading dataset...")
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))
print(df.head())

# ---- Step 2: Choose features ----
# Numeric columns that make up each song's feature vector, in output order.
FEATURES = [
    "danceability",
    "energy",
    "valence",
    "tempo",
    "acousticness",
    "instrumentalness",
    "speechiness",
    "loudness",
]

print("\nChecking missing values:")
missing_per_feature = df[FEATURES].isna().sum()
print(missing_per_feature)

# Keep only the rows in which every selected feature is present, then
# renumber the index from 0 so positions line up with the feature matrix.
has_all_features = df[FEATURES].notna().all(axis=1)
df = df[has_all_features].reset_index(drop=True)
print("After dropna:", df.shape)

# ---- Step 3: Normalize ----
# Z-score every feature column: (value - column mean) / column std.
X = df[FEATURES].to_numpy(dtype=np.float32)
if X.shape[0] == 0:
    # The mean/std of an empty array are NaN, which would end up in every
    # value written to the JSON file. Stop with a clear message instead.
    raise ValueError(
        "No rows left after dropping missing features; nothing to normalize."
    )

means, stds = X.mean(axis=0), X.std(axis=0)

# A feature that is constant over the whole dataset has std == 0, which would
# turn the division below into NaN/inf. Substitute 1.0 so such a column
# normalizes to all zeros. The substituted value is also what gets exported as
# that feature's std, so dividing by the exported stds stays well-defined.
stds[stds == 0] = 1.0
X_norm = (X - means) / stds

print("\nFeature stats:")
for f, m, s in zip(FEATURES, means, stds):
    print(f"{f}: mean={m:.3f}, std={s:.3f}")

# ---- Step 4: Build JSON structure ----
def _text_column(frame, candidates):
    """Return the first column of `frame` named in `candidates` as a list of str.

    Missing values are replaced with "". If none of the candidate columns
    exists, a list of "" with one entry per row is returned.
    """
    for name in candidates:
        if name in frame.columns:
            return frame[name].fillna("").astype(str).tolist()
    return [""] * len(frame)


def _rounded(values, decimals=4):
    """Round `values` to `decimals` places and return plain Python floats.

    The rounding is done in float64 on purpose: rounding a float32 array and
    calling .tolist() widens each value to a Python float that still carries
    the float32 representation error (0.1 is written as 0.10000000149011612),
    so the JSON would not get any smaller.
    """
    return np.round(np.asarray(values, dtype=np.float64), decimals).tolist()


# The loaded CSV names these columns "track_name" / "artist_name"; "title" /
# "artist" are still tried first so a dataset using those names keeps working.
titles = _text_column(df, ("title", "track_name"))
artists = _text_column(df, ("artist", "artist_name"))
vectors = _rounded(X_norm)  # round for smaller JSON

# One record per song; row i of X_norm belongs to row i of df.
songs = [
    {"title": title, "artist": artist, "vec": vec}
    for title, artist, vec in tqdm(
        zip(titles, artists, vectors), total=len(df), desc="Processing songs"
    )
]

# Top-level payload: feature order, normalization statistics, and the songs.
out = {
    "features": FEATURES,
    "means": _rounded(means),
    "stds": _rounded(stds),
    "songs": songs
}

# ---- Step 5: Save ----
# Serialize to a string before opening the file: opening with "w" truncates
# any existing output, so a serialization error raised mid-write would leave
# a partial file behind.
# allow_nan=False makes the json module raise ValueError instead of writing
# NaN/Infinity tokens, which are not valid JSON for strict parsers.
payload = json.dumps(out, indent=2, allow_nan=False)
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    f.write(payload)

print(f"\n✅ Saved {len(songs)} songs to {OUTPUT_JSON}")

Loading dataset...
Shape: (6000, 20)
        genre        artist_name                             track_name  \
0     Country  A Thousand Horses                       My Time's Comin'   
1  Soundtrack  Mark Mothersbaugh                             House Tour   
2      Reggae    Unified Highway  We Can't Fall (Remix) [feat. J. Patz]   
3  Electronic       Stooki Sound                    Endz - Original Mix   
4      Comedy         Bill Hicks                   I Love My Job (Live)   

                 track_id  popularity  acousticness  danceability  \
0  16zol4GvHyTER5irYODUk0          45       0.00192         0.327   
1  6ac5gUfGTckpdGQCyWsdh2          25       0.93200         0.253   
2  09Yz6koF1Y15n1012t1UX6          19       0.03310         0.821   
3  3dzEZARDL4ZwICMKVta7Xn          29       0.00428         0.745   
4  39Z1G5384UgGa5vmW6WyxC          17       0.96500         0.502   

   duration_ms  energy  instrumentalness key  liveness  loudness   mode  \
0       194107  0.8350

Processing songs: 100%|██████████| 6000/6000 [00:00<00:00, 27602.43it/s]



✅ Saved 6000 songs to ../../data/songs_data.json
