In [1]:
import pandas as pd
import numpy as np
import zipfile
import joblib
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import os

# -------------------------
# PATHS (use YOUR actual paths)
# -------------------------

SONGS_PATH = r"E:\Music Recommendation\Music Info.csv"
HISTORY_ZIP = r"E:\Music Recommendation\User Listening History.zip"

print("Paths set:")
print("SONGS_PATH:", SONGS_PATH)
print("HISTORY_ZIP:", HISTORY_ZIP)


Paths set:
SONGS_PATH: E:\Music Recommendation\Music Info.csv
HISTORY_ZIP: E:\Music Recommendation\User Listening History.zip


In [2]:
df = pd.read_csv(SONGS_PATH)

# Clean key text columns
for col in ["name", "artist", "genre", "tags"]:
    if col not in df.columns:
        df[col] = ""
    df[col] = df[col].fillna("").astype(str)

df = df.reset_index(drop=True)

print("Songs loaded:", df.shape)
df.head()


Songs loaded: (50683, 21)


Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


In [3]:
with zipfile.ZipFile(HISTORY_ZIP, 'r') as z:
    inside_file = z.namelist()[0]
    df_hist = pd.read_csv(z.open(inside_file))

print("History loaded:", df_hist.shape)
df_hist.head()


History loaded: (9711301, 3)


Unnamed: 0,track_id,user_id,playcount
0,TRIRLYL128F42539D1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
1,TRFUPBA128F934F7E1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
2,TRLQPQJ128F42AA94F,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
3,TRTUCUY128F92E1D24,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1
4,TRHDDQG12903CB53EE,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1


In [4]:
possible_user = ["user_id", "userid", "user", "UserID"]
possible_track = ["track_id", "track", "song_id", "spotify_id"]
possible_count = ["play_count", "count", "plays", "listen_count"]

def detect(columns, df):
    for c in columns:
        if c in df.columns:
            return c
    return None

user_col = detect(possible_user, df_hist)
track_col = detect(possible_track, df_hist)
count_col = detect(possible_count, df_hist)

print("Detected:", user_col, track_col, count_col)

# Fallback if no count exists
if count_col is None:
    df_hist["play_count"] = 1
    count_col = "play_count"

df_hist = df_hist[[user_col, track_col, count_col]].rename(
    columns={user_col: "user_id", track_col: "track_id", count_col: "play_count"}
)

df_hist["user_id"] = df_hist["user_id"].astype(str)
df_hist["track_id"] = df_hist["track_id"].astype(str)

df_hist.head()


Detected: user_id track_id None


Unnamed: 0,user_id,track_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRIRLYL128F42539D1,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRFUPBA128F934F7E1,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRLQPQJ128F42AA94F,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRTUCUY128F92E1D24,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRHDDQG12903CB53EE,1


In [5]:
# remove the unmatched records,converting track IDs into internal indices.

df["track_id"] = df["track_id"].astype(str)

track_to_index = {tid: i for i, tid in enumerate(df["track_id"])}

df_hist["song_idx"] = df_hist["track_id"].map(track_to_index)
df_hist = df_hist.dropna(subset=["song_idx"])
df_hist["song_idx"] = df_hist["song_idx"].astype(int)

print("Matched listens:", len(df_hist))
df_hist.head()


Matched listens: 9711301


Unnamed: 0,user_id,track_id,play_count,song_idx
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRIRLYL128F42539D1,1,27758
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRFUPBA128F934F7E1,1,2350
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRLQPQJ128F42AA94F,1,38473
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRTUCUY128F92E1D24,1,5416
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,TRHDDQG12903CB53EE,1,2722


In [6]:
# song stores the set of users who listened

item_users = defaultdict(set)

for _, row in df_hist.iterrows():
    user = row["user_id"]
    item = int(row["song_idx"])
    item_users[item].add(user)

print("Items with user data:", len(item_users))


Items with user data: 30459


In [7]:
# Text for TF-IDF
df["text"] = (
    df["name"] + " " + df["artist"] + " " +
    df["genre"] + " " + df["tags"]
).str.lower()

tfidf = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(df["text"])

numeric_cols = [
    "danceability","energy","loudness","speechiness","acousticness",
    "instrumentalness","liveness","valence","tempo","duration_ms","year"
]

for c in numeric_cols:
    if c not in df.columns:
        df[c] = 0

num_df = df[numeric_cols].fillna(0)

scaler = StandardScaler()
scaled_nums = scaler.fit_transform(num_df)

num_sparse = csr_matrix(scaled_nums)

feature_matrix = hstack([tfidf_matrix, num_sparse])

print("Feature matrix shape:", feature_matrix.shape)


Feature matrix shape: (50683, 5011)


In [8]:
SAVE_DIR = r"E:\Music Recommendation\Hybrid model"

if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

joblib.dump(df, SAVE_DIR + r"\songs_df.joblib")
joblib.dump(tfidf, SAVE_DIR + r"\tfidf_vectorizer.joblib")
joblib.dump(scaler, SAVE_DIR + r"\scaler.joblib")
joblib.dump(numeric_cols, SAVE_DIR + r"\numeric_cols.joblib")
joblib.dump(feature_matrix, SAVE_DIR + r"\feature_matrix.joblib")
joblib.dump(dict(item_users), SAVE_DIR + r"\item_users_dict.joblib")

print("Artifacts saved in:", SAVE_DIR)


Artifacts saved in: E:\Music Recommendation\Hybrid model
