In [3]:
import json
import os
import sqlite3

import numpy as np
import pandas as pd

from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

In [4]:
# Root folder of the Taste Profile data. os.path.join() never expands "~", so
# the home shortcut is resolved once here; every path derived below is then
# usable by consumers that do not expand "~" on their own.
TPS_DIR = os.path.expanduser('~/Desktop/DSCI-441-Project/dataset/taste-profile/')

# http://labrosa.ee.columbia.edu/millionsong/sites/default/files/challenge/train_triplets.txt.zip
TP_file = os.path.join(TPS_DIR, 'train_triplets.txt')

# Metadata: http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/track_metadata.db
md_dbfile = os.path.join(TPS_DIR, 'track_metadata.db')

In [5]:
# Load the raw triplets: a tab-separated file without a header row, so the
# three column names are supplied explicitly.
tp = pd.read_csv(
    TP_file,
    sep='\t',
    header=None,
    names=['uid', 'sid', 'count'],
)
tp.head()

Unnamed: 0,uid,sid,count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [4]:
# NOTE(review): this binds the `pd.read_table` function object itself to `md`;
# nothing is called and no data is loaded. The cell looks unfinished --
# presumably it was meant to load the track metadata (`md_dbfile`); confirm the
# intent or remove the cell.
md = pd.read_table

### Filter the user-song play counts

In [5]:
# Activity thresholds: a user is kept only with at least 20 distinct songs,
# and a song only with at least 50 distinct listeners.
MIN_USER_COUNT = 20
MIN_SONG_COUNT = 50


def filter_tp(tp, min_user_count=MIN_USER_COUNT, min_song_count=MIN_SONG_COUNT):
    """Drop triplets that belong to low-activity users or rarely played songs.

    Both thresholds are evaluated once against the unfiltered ``tp``; the
    counts are not recomputed after rows have been removed.

    Parameters
    ----------
    tp : pd.DataFrame
        Triplets with at least the ``uid`` and ``sid`` columns.
    min_user_count : int
        Minimum number of distinct songs a user must have.
    min_song_count : int
        Minimum number of distinct users a song must have.

    Returns
    -------
    pd.DataFrame
        The rows of ``tp`` whose user and song both meet their threshold.
    """
    songs_per_user = tp.groupby('uid')['sid'].nunique()
    users_per_song = tp.groupby('sid')['uid'].nunique()

    # Ids that clear their respective threshold.
    active_users = songs_per_user.index[(songs_per_user >= min_user_count).to_numpy()]
    popular_songs = users_per_song.index[(users_per_song >= min_song_count).to_numpy()]

    keep = tp['uid'].isin(active_users) & tp['sid'].isin(popular_songs)
    return tp.loc[keep]
# Apply the default thresholds (MIN_USER_COUNT / MIN_SONG_COUNT) to the raw triplets.
df = filter_tp(tp, min_user_count=MIN_USER_COUNT, min_song_count=MIN_SONG_COUNT)
df.head()

Unnamed: 0,uid,sid,count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [6]:
# Distinct songs and users that remain after filtering.
n_songs, n_users = (df[column].nunique() for column in ('sid', 'uid'))
print(
    f'Number of unique songs: {n_songs}',
    f'Number of unique users: {n_users}',
    sep='\n',
)

Number of unique songs: 98485
Number of unique users: 661089


In [7]:
# (rows, columns) of the filtered triplets frame.
df.shape

(40266961, 3)

# Build the matrix factorization (MF) model

In [8]:
def build_mappings(df: pd.DataFrame):
    """Assign consecutive integer indices to the users and songs in ``df``.

    Indices start at 0 and follow the order in which ``Series.unique()``
    returns the values of the ``uid`` and ``sid`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``uid`` and ``sid`` columns.

    Returns
    -------
    tuple of dict
        ``(user_to_idx, idx_to_user, song_to_idx, idx_to_song)``.
    """
    def _index_both_ways(values):
        # value -> position and position -> value for one id column.
        forward = dict(zip(values, range(len(values))))
        backward = dict(enumerate(values))
        return forward, backward

    user_to_idx, idx_to_user = _index_both_ways(df["uid"].unique())
    song_to_idx, idx_to_song = _index_both_ways(df["sid"].unique())
    return user_to_idx, idx_to_user, song_to_idx, idx_to_song

# Build the id <-> index lookup tables from the filtered triplets.
user_to_idx, idx_to_user, song_to_idx, idx_to_song = build_mappings(df)


In [9]:
def build_user_item_matrix(df: pd.DataFrame, user_to_idx, song_to_idx):
    """Build a sparse user x song matrix of log-scaled play counts.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the ``uid``, ``sid`` and ``count`` columns.
    user_to_idx, song_to_idx : mapping
        Map each ``uid`` / ``sid`` value to its row / column index. The matrix
        shape is ``(len(user_to_idx), len(song_to_idx))``.

    Returns
    -------
    scipy.sparse CSR matrix (float32)
        ``log1p(count)`` stored at ``(user index, song index)``. Entries that
        share a coordinate are summed when the COO matrix is converted to CSR.

    Raises
    ------
    ValueError
        If any ``uid`` or ``sid`` in ``df`` has no entry in its mapping.
    """
    rows = df["uid"].map(user_to_idx)
    cols = df["sid"].map(song_to_idx)

    # Series.map() yields NaN for ids missing from the mapping; fail loudly
    # here instead of handing float/NaN indices to the sparse constructor.
    for column, mapped, mapping_name in (
        ("uid", rows, "user_to_idx"),
        ("sid", cols, "song_to_idx"),
    ):
        n_missing = int(mapped.isna().sum())
        if n_missing:
            raise ValueError(
                f"{n_missing} row(s) have a {column} that is missing from {mapping_name}"
            )

    # implicit ALS usually works better with a log transform for play counts
    data = np.log1p(df["count"].to_numpy()).astype(np.float32)

    indices = (rows.to_numpy(dtype=np.int64), cols.to_numpy(dtype=np.int64))
    mat = coo_matrix((data, indices),
                     shape=(len(user_to_idx), len(song_to_idx))).tocsr()
    return mat

# Sparse user x song matrix that the ALS model is fitted on below.
user_item = build_user_item_matrix(df, user_to_idx, song_to_idx)

In [18]:
# The model is fitted directly on the user x item matrix; no transpose is
# applied. After fitting, model.item_factors has one row per song (see the
# shape check further down in this notebook).

# confidence scaling (common trick in implicit feedback)
alpha = 40.0

model = AlternatingLeastSquares(
    factors=64,
    regularization=0.1,
    iterations=20,
    random_state=42,
)
model.fit(user_item * alpha)

  0%|          | 0/20 [00:00<?, ?it/s]

In [19]:
# Sanity check: top-10 recommendations for the first user in the filtered
# data, with items the user already interacted with filtered out.
some_user = df["uid"].iloc[0]
uidx = user_to_idx[some_user]

rec_item_idxs, scores = model.recommend(
    uidx,
    user_item[uidx],  # this user's row only
    N=10,
    filter_already_liked_items=True,
)

# Convert to plain Python numbers so they can be used as dict keys and printed.
for i, s in zip(map(int, rec_item_idxs), map(float, scores)):
    print(">>>", i, idx_to_song[i], s)

>>> 1533 SOSPXWA12AB0181875 1.042104721069336
>>> 1082 SOERYLG12A6701F07F 1.0220763683319092
>>> 2031 SOJSTYO12A8C13F200 1.0158164501190186
>>> 1086 SOKUTUM12A6701D9CD 0.9769822359085083
>>> 1359 SOOABBO12A6701DFDA 0.8984556198120117
>>> 13505 SOUMOMJ12A6701DFDC 0.8798072934150696
>>> 2040 SOVGLTY12AF72A39CD 0.8792482614517212
>>> 7520 SOTHABI12A58A7DACB 0.8738271594047546
>>> 6433 SOMYECL12A6701D9C8 0.8702127933502197
>>> 2662 SOGCDYR12AC961854A 0.869952380657196


In [20]:
# Compare the interaction matrix shape with the learned item-factor shape.
print(f"user_item: {user_item.shape}")
print(f"model item_factors: {model.item_factors.shape}")

user_item: (661089, 98485)
model item_factors: (98485, 64)


In [21]:
# Repeat the lookup for the first user, printing only item indices and song ids.
some_user = df["uid"].iloc[0]
print("???", some_user)

rec_item_idxs, scores = model.recommend(
    user_to_idx[some_user],
    user_item[user_to_idx[some_user]],
    N=10,
    filter_already_liked_items=True,
)

# Plain loop rather than a list comprehension evaluated only for its prints.
for i in rec_item_idxs:
    print(">>>", i, idx_to_song[i])

# Top recommendation once more, with the index converted to a built-in int.
item_idx = int(rec_item_idxs[0])
print(">>>", item_idx, idx_to_song[item_idx])

??? b80344d063b5ccb3212f76538f3d9e43d87dca9e
>>> 1533 SOSPXWA12AB0181875
>>> 1082 SOERYLG12A6701F07F
>>> 2031 SOJSTYO12A8C13F200
>>> 1086 SOKUTUM12A6701D9CD
>>> 1359 SOOABBO12A6701DFDA
>>> 13505 SOUMOMJ12A6701DFDC
>>> 2040 SOVGLTY12AF72A39CD
>>> 7520 SOTHABI12A58A7DACB
>>> 6433 SOMYECL12A6701D9C8
>>> 2662 SOGCDYR12AC961854A
>>> 1533 SOSPXWA12AB0181875


In [22]:
# Every recommended index must be a valid key of idx_to_song; print the sizes
# involved so they can be compared side by side.
print(f"len(idx_to_song) = {len(idx_to_song)}")
print(f"max idx_to_song key = {max(idx_to_song)}")
print(f"user_item #items = {user_item.shape[1]}")
print(f"model #items = {model.item_factors.shape[0]}")
print(f"max recommended idx = {int(rec_item_idxs.max())}")

len(idx_to_song) = 98485
max idx_to_song key = 98484
user_item #items = 98485
model #items = 98485
max recommended idx = 13505


In [23]:
def recommend_for_user(user_id: str, N: int = 10):
    """Return the top-``N`` recommended songs for a known user.

    Relies on the notebook-level ``user_to_idx``, ``user_item``, ``model`` and
    ``idx_to_song`` objects.

    Parameters
    ----------
    user_id : str
        A user id present in ``user_to_idx``.
    N : int
        Number of recommendations requested from the model.

    Returns
    -------
    pd.DataFrame
        Columns ``sid`` (song id) and ``score``, in the order returned by
        ``model.recommend``.

    Raises
    ------
    ValueError
        If ``user_id`` is not a key of ``user_to_idx``.
    """
    if user_id not in user_to_idx:
        raise ValueError("User not found in training data.")

    user_index = user_to_idx[user_id]

    top_items, top_scores = model.recommend(
        user_index,
        user_item[user_index],  # only this user's row of the matrix
        N=N,
        filter_already_liked_items=True,
    )

    # Cast each index to a built-in int before using it as a dict key.
    song_ids = [idx_to_song[int(item)] for item in top_items]
    return pd.DataFrame({"sid": song_ids, "score": top_scores})

# Example: recommendations for the first user of the filtered triplets.
some_user = df["uid"].iloc[0]
print(f"User: {some_user}")
print(recommend_for_user(some_user, N=10))

User: b80344d063b5ccb3212f76538f3d9e43d87dca9e
                  sid     score
0  SOSPXWA12AB0181875  1.042105
1  SOERYLG12A6701F07F  1.022076
2  SOJSTYO12A8C13F200  1.015816
3  SOKUTUM12A6701D9CD  0.976982
4  SOOABBO12A6701DFDA  0.898456
5  SOUMOMJ12A6701DFDC  0.879807
6  SOVGLTY12AF72A39CD  0.879248
7  SOTHABI12A58A7DACB  0.873827
8  SOMYECL12A6701D9C8  0.870213
9  SOGCDYR12AC961854A  0.869952


In [24]:
# Echo the user id used in the examples above.
some_user

'b80344d063b5ccb3212f76538f3d9e43d87dca9e'