In [48]:
import numpy as np
import pandas as pd
import scipy as sp
from typing import Dict

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Flatten, Input, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline

In [43]:
df = pd.read_pickle('../../datasets/clean_df.csv', compression='zip')

In [3]:
# Convert user id and song id to numerical ids
df['user_id'] = df['user'].astype('category').cat.codes
df['song_id'] = df['song'].astype('category').cat.codes

df_orig = df.copy()

# Create lookup frame so we can get the 'Song - Artist' later
item_lookup = df[['song_id','Song - Artist']].drop_duplicates()
item_lookup['song_id'] = item_lookup['song_id'].astype(str)

# Drop 'user' and 'song' and 'Song - Artist'
df = df.drop(['user','song','Song - Artist'], axis=1)

# Create lists of all users, songs, and counts
users = list(np.sort(df['user_id'].unique()))
songs = list(np.sort(df['song_id'].unique()))
play_counts = list(np.sort(df['count']))

# Get the rows and columns for our matrix
user_ = df['user_id'].astype(float)
item_ = df['song_id'].astype(float)

In [4]:
# data_sparse = sp.sparse.csr_matrix((play_counts, (rows, cols)), shape=(len(users), len(songs)))

In [5]:
sp_user_item = sp.sparse.csr_matrix(
    (df['count'].astype(float), (user_, item_)),
    shape=(len(users), len(songs))
)

sp_item_user = sp.sparse.csr_matrix(
    (df['count'].astype(float), (item_, user_)),
    shape=(len(songs), len(users))
)

# Alternating Least Squares

In [6]:
# Hyperparams
factors=30
regularization=0.1
iterations=20
alpha=40

In [7]:
model = implicit.als.AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations)



In [8]:
data = (sp_item_user * alpha).astype('double')
model.fit(data)

  0%|          | 0/20 [00:00<?, ?it/s]

In [9]:
recs_raw = model.recommend(1, sp_user_item, N=5)
print(recs_raw)

[(2736, 0.7730313), (2629, 0.7692152), (848, 0.7180464), (1160, 0.7083988), (691, 0.58708537)]


In [10]:
for song_id , _ in recs_raw:
    print(item_lookup[item_lookup['song_id'] == song_id.astype(str)]['Song - Artist'])

325048    Show You How - The Killers
Name: Song - Artist, dtype: object
631587    I Am The Club (Explicit Album Version) - Plies
Name: Song - Artist, dtype: object
438930    You (featuring Tank) (Amended Album Version) -...
Name: Song - Artist, dtype: object
228969    Everyone's At It - Lily Allen
Name: Song - Artist, dtype: object
272655    One I Love - Coldplay
Name: Song - Artist, dtype: object


# Bayesian Personalized Ranking

https://medium.com/heyjobs-tech/building-recommendation-system-based-bayesian-personalized-ranking-using-tensorflow-2-1-b814d2704130

The missing values in the sparse matrix are **MNAR**. This is because we do not know if the user did not listen to a certain song because the user didn't like the song, was not aware of the song, or for some other reasons unforeseen to us.

In [11]:
model_bpr = implicit.bpr.BayesianPersonalizedRanking(
                factors=factors, 
                regularization=regularization,
                iterations=iterations)

To train a model based on Bayes Personalized Ranking, we need to define the triplets of a user, positive item and negative item.

This means we must define what constitutes either a positive item or negative item in the scope of our dataset.

In [46]:
model_bpr.fit(sp_item_user)

  0%|          | 0/20 [00:00<?, ?it/s]

In [47]:
recs_raw_bpr = model_bpr.recommend(1, sp_user_item, N=5)
for song_id , _ in recs_raw_bpr:
    print(item_lookup[item_lookup['song_id'] == song_id.astype(str)]['Song - Artist'])

763209    Brand New Broken Heart - NewFound Road
Name: Song - Artist, dtype: object
504826    You Need Hands (1993 Digital Remaster) - Malco...
Name: Song - Artist, dtype: object
740634    Lonesome To The Bone - Johnny Cash
Name: Song - Artist, dtype: object
754710    Die Meisten - Samy Deluxe
Name: Song - Artist, dtype: object
701335    The Luck You Got - The High Strung
Name: Song - Artist, dtype: object


## Build triplets

In [14]:
num_records_for_user = pd.DataFrame(
    df.groupby(['user_id'])['song_id'].count().reset_index()
).rename({'song_id' : 'records'}, axis=1)

In [15]:
users_gt_1_record = num_records_for_user[num_records_for_user['records'] > 1]['user_id'].tolist()

In [16]:
half_users_gt_1 = len(users_gt_1_record) / 2

In [49]:
threshold = 2
qualified_users = []
pos_user_item = {}
neg_user_item = {}
processed_users = []

# for each user
for user in users_gt_1_record:
    if ( len(processed_users) % 10_000 == 0 ):
        print(f'{len(processed_users)} users processed. Found {len(qualified_users)} qualified users. {len(users_gt_1_record) - len(processed_users)} users remaining.')
    processed_users.append(user)
    temp = df[df['user_id'] == user]
    positive = None
    negative = None
    for _ , row in temp.iterrows():
        if ( row['count'] > threshold ):
            positive = user, row['song_id']
            # pos_user_item[user] = row['song']
        else : 
            negative = user, row['song_id']
            # neg_user_item[user] = row['song']
        if ( positive != None and negative != None ):
            qualified_users.append(user)
            pos_user_item[positive[0]] = positive[1]
            neg_user_item[negative[0]] = negative[1]
            break

0 users processed. Found 0 qualified users. 172844 users remaining.


KeyError: 'user_id'

In [36]:
triplets = pd.DataFrame(columns=['user_id','positive','negative'])
for user in qualified_users:
    triplets = triplets.append({
        'user_id':user,
        'positive':pos_user_item[user],
        'negative':neg_user_item[user]}, ignore_index=True)
#     triplets['user'] = user
#     triplets['positive'] = pos_user_item[user]
#     triplets['negative'] = neg_user_item[user]

In [37]:
# The assumption is that the recommendations should as many as possible high ranked movies 
# that a specific user has already watched.

ground_truth_train = df_train[df_train.rating > 3].groupby('user_id').movie_id.agg(list).reset_index()

ground_truth_train.head(1)

Unnamed: 0,user_id,positive,negative
0,1,2370,2818
1,4,366,1894
2,5,2988,691
3,12,370,114
4,14,1231,1424
...,...,...,...
68328,418224,1354,2930
68329,418227,1215,114
68330,418229,1231,764
68331,418231,2381,2032


In [41]:
def bpr_triplet_loss(X: dict):
    """
    Calculate triplet loss - as higher the difference between positive interactions
    and negative interactions as better
    :param X: X contains the user input, positive item input, negative item input
    :return:
    """
    positive_item_latent, negative_item_latent, user_latent = X

    positive_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, positive_item_latent), axis=-1, keepdims=True)
    negative_interactions = tf.math.reduce_sum(tf.math.multiply(user_latent, negative_item_latent), axis=-1, keepdims=True)

    return tf.math.subtract(tf.constant(1.0), tf.sigmoid(tf.math.subtract(positive_interactions, negative_interactions)))

In [44]:
def full_auc(model: Model, ground_truth: Dict[int, list], items: list) -> float:
    """
    :param model: 
    :param ground_truth: dictionary of the users and the high ranked movies for the specific user
    :param items: a list of the all available movies
    :return: AUC
    """

    number_of_items = len(items)
    scores = []

    for user_id, true_item_ids in ground_truth:
        predictions = bpr_predict(model, user_id, items)
        grnd = np.zeros(number_of_items, dtype=np.int32)

        for p in true_item_ids:
            index = items.index(p)
            grnd[index] = 1

        if true_item_ids:
            scores.append(roc_auc_score(grnd, predictions))

    return sum(scores) / len(scores)


NameError: name 'Model' is not defined

In [45]:
def mean_average_precision_k(model: Model,  
                             ground_truth: Dict[int, list], 
                             items: list, 
                             k=100) -> float:
    scores = []

    for user, actual in ground_truth:
        predictions = bpr_predict(model, user, items)
        predictions = dict(zip(items, predictions))
        predictions = sorted(predictions.items(), key=lambda kv: kv[1], reverse=True)[:k]
        predictions = list(OrderedDict(predictions).keys())

        score = 0.0
        num_hits = 0.0

        for i, p in enumerate(predictions):
            if p in actual:
                num_hits += 1.0
                score += num_hits / (i + 1.0)

        score = score / min(len(actual), k)
        scores.append(score)

    return np.mean(scores)

NameError: name 'Model' is not defined