In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from collections import OrderedDict
from typing import Dict

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Relative path to the datasets directory. The data file path is built by
# plain string concatenation onto this value, so the trailing slash matters.
reldir = '../datasets/'

In [3]:
# Despite its '.csv' name, the file is read as a zip-compressed pickle.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
dataset_path = reldir + 'clean_df.csv'
df = pd.read_pickle(dataset_path, compression='zip')

In [4]:
# Map each raw user / song identifier to a compact integer category code
for raw_col, code_col in (('user', 'user_id'), ('song', 'song_id')):
    df[code_col] = df[raw_col].astype('category').cat.codes

# Snapshot the frame while it still carries the raw identifier columns
df_orig = df.copy()

# Lookup frame (song_id stored as a string) to recover 'Song - Artist' later
item_lookup = (
    df[['song_id', 'Song - Artist']]
    .drop_duplicates()
    .assign(song_id=lambda frame: frame['song_id'].astype(str))
)

# Only the encoded ids and the remaining columns are kept from here on
df = df.drop(columns=['user', 'song', 'Song - Artist'])

# Sorted unique user ids, sorted unique song ids, and all play counts sorted
users = sorted(df['user_id'].unique())
songs = sorted(df['song_id'].unique())
play_counts = list(np.sort(df['count']))

# Row (user) and column (item) coordinates for the interaction matrix
user_ = df['user_id'].astype(float)
item_ = df['song_id'].astype(float)

### Train/test split

The goal is to check whether the model recommends the songs a user has listened to many times. These songs are most likely positives (well liked by the user), so we can measure the quality of the model by how many of these positive items it recommends.

#### Test split

In [6]:
# A row counts as a "heavy listen" when its play count exceeds this threshold
min_listens = 4

# Per user, the number of rows (songs) whose play count exceeds `min_listens`
tmp_test = (
    df[df['count'] > min_listens]
    .groupby('user_id')['song_id']
    .count()
    .reset_index()
    .rename(columns={'song_id': 'records'})
)
tmp_test.shape

(87253, 2)

In [7]:
# Only users with more than this many heavy-listen rows contribute test rows
min_records = 4
eligible_users = tmp_test.loc[tmp_test['records'] > min_records, 'user_id']

# Heavy-listen rows belonging to eligible users
conditions = df['user_id'].isin(eligible_users) & (df['count'] > min_listens)

# Hold out the first two matching rows per user; the old index is discarded
df_test = df[conditions].groupby('user_id').head(2).reset_index(drop=True)
df_test.shape

(428, 3)

In [8]:
# One row per test user, with the held-out song ids collected into a list
ground_truth_test = (
    df_test
    .groupby('user_id')['song_id']
    .agg(list)
    .reset_index()
)
ground_truth_test.shape

(214, 2)

#### Train split

In [10]:
# Training set = every row of `df` that was not held out in `df_test`.
#
# This is an explicit anti-join on the full row key. The previous
# `pd.concat([df, df_test]).drop_duplicates(keep=False)` removed *every* row
# that occurred more than once in the concatenation, so any rows duplicated
# inside `df` itself were silently dropped from training as well.
key_cols = list(df_test.columns)
held_out_keys = pd.MultiIndex.from_frame(df_test[key_cols])
is_held_out = pd.MultiIndex.from_frame(df[key_cols]).isin(held_out_keys)

# Boolean-mask selection keeps the original index of `df` for surviving rows
df_train = df[~is_held_out]
df_train.shape

(772233, 3)

In [11]:
# One row per training user, listing the songs they played more than
# `min_listens` times. The shared threshold (instead of a literal 4) keeps the
# train ground truth in step with the criterion used to build the test split.
ground_truth_train = (
    df_train[df_train['count'] > min_listens]
    .groupby('user_id')['song_id']
    .agg(list)
    .reset_index()
)
ground_truth_train.shape

(87253, 2)

In [16]:
# Load the saved BPR model without restoring its training configuration.
# compile=False skips re-creating the optimizer/loss on load.
# NOTE(review): the plain load_model(...) call raised
# "AttributeError: 'NoneType' object has no attribute 'get'"; presumably this
# comes from restoring the compile/training config -- confirm on the TF
# version in use. Re-compile explicitly if further training is needed.
model = tf.keras.models.load_model('../models/bpr_e3', compile=False)

AttributeError: 'NoneType' object has no attribute 'get'