In [1]:
import os
import h5py
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
import pickle



In [2]:
# Root directory that is walked recursively for ".h5" track files.
# The path is relative, so it resolves against the kernel's current working directory.
DATASET_PATH = "../data/MillionSongSubset"

In [3]:
def safe_decode(value):
    """Decode byte strings to ``str``; pass every other value through unchanged.

    Parameters
    ----------
    value : object
        A ``bytes`` object (``np.bytes_`` included), a NumPy array whose dtype
        kind is ``'S'`` (byte strings) or ``'O'`` (Python objects), or any
        other value.

    Returns
    -------
    object
        * ``bytes`` -> the UTF-8 decoded ``str``.
        * array of kind ``'S'``/``'O'`` -> its first element, decoded when it
          is ``bytes`` and returned as-is otherwise; an empty array gives ``""``.
        * anything else (numbers, ``str``, numeric arrays) -> ``value`` itself.

    Raises
    ------
    UnicodeDecodeError
        If a byte string is not valid UTF-8.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    if isinstance(value, np.ndarray) and value.dtype.kind in {'S', 'O'}:
        # ravel() makes 0-d and multi-dimensional arrays indexable with [0].
        flat = value.ravel()
        if flat.size == 0:
            # Nothing to decode; indexing [0] would raise IndexError.
            return ""
        # Object arrays may already hold ``str`` (which has no ``.decode``), so
        # hand the element to the scalar path instead of decoding blindly.
        return safe_decode(flat[0])
    return value

def extract_track_info(h5_file):
    """Read one track's metadata and audio-analysis fields from an HDF5 file.

    Only the first row of the ``metadata/songs`` and ``analysis/songs``
    tables is read.

    Parameters
    ----------
    h5_file : str or path-like
        Path of the ``.h5`` file; it is opened read-only and closed on return.

    Returns
    -------
    dict
        Keys ``track_id``, ``artist_name``, ``tempo``, ``loudness``,
        ``duration``, ``key``, ``mode``, ``time_signature`` and ``genre``.
        Text fields go through ``safe_decode``; the remaining values are
        returned exactly as read from the file. ``genre`` is ``"unknown"``
        when the metadata table has no ``genre`` field.

    Raises
    ------
    Exception
        Whatever ``h5py`` or the field lookups raise for a missing or
        malformed file; nothing is caught here.
    """
    with h5py.File(h5_file, "r") as f:
        # Metadata
        meta = f['metadata/songs'][0]
        artist_name = safe_decode(meta['artist_name'])
        # Not every file is guaranteed to carry a genre column, hence the fallback.
        genre = safe_decode(meta['genre']) if 'genre' in meta.dtype.names else "unknown"

        # Analysis features
        analysis = f['analysis/songs'][0]
        track_id = safe_decode(analysis['track_id'])
        tempo = analysis['tempo']
        loudness = analysis['loudness']
        duration = analysis['duration']
        key = analysis['key']
        mode = analysis['mode']
        time_signature = analysis['time_signature']

    return {
        "track_id": track_id,
        "artist_name": artist_name,
        "tempo": tempo,
        "loudness": loudness,
        "duration": duration,
        "key": key,
        "mode": mode,
        "time_signature": time_signature,
        # Previously decoded and then dropped; appended last so the existing
        # keys keep their positions.
        "genre": genre,
    }

In [4]:
# Parse every ".h5" file under DATASET_PATH into one record per track.
tracks = []
for root, dirs, files in os.walk(DATASET_PATH):
    # os.walk yields entries in arbitrary filesystem order. Sorting ``dirs``
    # in place (honoured by the default top-down walk) and ``files`` gives the
    # same row order, and hence the same item indices, on every machine and run.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".h5"):
            file_path = os.path.join(root, file)
            try:
                info = extract_track_info(file_path)
                tracks.append(info)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Failed to read {file_path}: {e}")

if not tracks:
    # os.walk silently yields nothing for a missing directory; fail here with
    # a clear message instead of a KeyError on the empty frame further down.
    raise RuntimeError(f"No tracks could be loaded from {DATASET_PATH!r}")

df_tracks = pd.DataFrame(tracks)
print(f"Loaded {len(df_tracks)} tracks")

Loaded 10000 tracks


In [None]:
# One (track_id, {feature_name: value}) pair per track; the dict keys are the
# feature names that get registered with the LightFM dataset later on.
item_features_list = [
    (
        track.track_id,
        {
            name: getattr(track, name)
            for name in ("tempo", "loudness", "duration", "key", "mode", "time_signature")
        },
    )
    for track in df_tracks.itertuples(index=False)
]

# Union of every feature name that occurs in at least one track.
all_item_features = {
    name
    for _, feature_map in item_features_list
    for name in feature_map
}

print(f"Created {len(all_item_features)} unique features")

Created 23344 unique features


In [None]:
# Artists play the role of LightFM "users" and tracks the role of "items".
users = df_tracks['artist_name'].unique().tolist()  # using artists as users
items = df_tracks['track_id'].tolist()

dataset = Dataset()
dataset.fit(
    users=users,
    items=items,
    # Sorted because set iteration order changes between interpreter runs
    # (string hash randomisation), which would otherwise give the features
    # different internal indices on every kernel restart.
    item_features=sorted(all_item_features)
)

# One (artist, track) interaction per row of df_tracks.
user_item_pairs = list(df_tracks[['artist_name', 'track_id']].itertuples(index=False, name=None))
interactions, weights = dataset.build_interactions(user_item_pairs)
print('created user-item matrix')

created user-item matrix


In [7]:
# Turn the (track_id, {feature_name: value}) pairs in item_features_list into
# the item-feature structure held in ``item_features``.
# NOTE(review): build_item_features is called with its defaults; LightFM's docs
# describe per-row normalisation as enabled by default — confirm that is wanted
# for raw, differently scaled values such as loudness and duration.
item_features = dataset.build_item_features(item_features_list)

In [None]:
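# Model training is currently disabled; uncomment the two lines below to fit a
# LightFM model (BPR loss) on the interactions and item features built above.
# model = LightFM(loss='bpr', no_components=25)
# model.fit(interactions, item_features=item_features, epochs=10, num_threads=1)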