In [572]:
# import sys
# !{sys.executable} -m pip install lightfm
import random
import itertools

import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy import sparse
import random
import lightfm
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score
from sklearn.metrics.pairwise import cosine_similarity

from surprise import Dataset
from surprise import Reader
from surprise import SVDpp
from collections import defaultdict

In [573]:
# Fraction of CSV data rows to keep when the dataset is randomly sub-sampled
# at load time (each data row is kept when random.random() <= p).
p = 0.025

In [574]:
# Load a random sample of the CSV: row 0 (the header) is always kept and every
# other row is skipped whenever random.random() > p. Malformed lines are dropped
# silently. `on_bad_lines='skip'` replaces the `error_bad_lines=False,
# warn_bad_lines=False` pair, which pandas deprecated in 1.3 and removed in 2.0.
# NOTE(review): no seed is set for `random` in this notebook as far as visible,
# so the sampled rows presumably differ between runs -- confirm.
df_playlist = pd.read_csv(
    './data/spotify_dataset.csv',
    on_bad_lines='skip',
    skiprows=lambda i: i > 0 and random.random() > p,
)

In [575]:
df_playlist.shape

(321555, 4)

In [576]:
# Normalise the column labels: remove embedded double quotes, the substring
# "name" and all spaces (a label such as ' "artistname"' would become 'artist').
# regex=False treats every pattern as a literal string, independent of the
# default that the installed pandas version uses for `regex`.
df_playlist.columns = (
    df_playlist.columns
    .str.replace('"', '', regex=False)
    .str.replace('name', '', regex=False)
    .str.replace(' ', '', regex=False)
)
df_playlist.head()

Unnamed: 0,user_id,artist,track,playlist
0,9cc0cfd4d7d7885102480dd99e7a90d6,Crowded House,Weather With You,HARD ROCK 2010
1,07f0fc3be95dcd878966b1f9572ff670,Emancipator,Nevergreen (Blockhead Remix),Chill out
2,07f0fc3be95dcd878966b1f9572ff670,Trifonic,Sooner Or Later,Chill out
3,07f0fc3be95dcd878966b1f9572ff670,Symbion Project,Vandalized Lovemap,Chill out
4,07f0fc3be95dcd878966b1f9572ff670,Antonio Vivaldi,The Four Seasons - Spring (Violin Concerto in ...,Classique


In [577]:
# Adding a synthetic test user and seeing what the output will be.
# The user gets 5000 rows, each with an artist drawn at random from the list
# below, so every listed artist ends up with a high frequency for this user.
temp_user_id = '00000000000000000000000000000000'
temp_user_artists = [
    'Eminem', 'Kanye West', '50 Cent', 'Drake', 'Kendrick Lamar', 'Future',
    'Lil Baby', 'Snoop Dogg', 'Post Malone', 'Lil Wayne', 'Kid Cudi']
temp_track_name = 'testtrackname'
temp_playlist_name = 'testplaylist'

new_rows = [
    {
        'user_id': temp_user_id,
        'artist': random.choice(temp_user_artists),
        'track': temp_track_name,
        'playlist': temp_playlist_name,
    }
    for _ in range(5000)
]

temp_df = pd.DataFrame(new_rows)
# Drop any rows of the test user left over from a previous execution of this
# cell, so re-running it does not stack another 5000 rows on top.
df_playlist = df_playlist[df_playlist['user_id'] != temp_user_id]
df_playlist = pd.concat([df_playlist, temp_df], ignore_index=True)
df_last = df_playlist.tail(10)

df_playlist.loc[df_playlist['user_id'] == temp_user_id]

Unnamed: 0,user_id,artist,track,playlist
321555,00000000000000000000000000000000,Eminem,testtrackname,testplaylist
321556,00000000000000000000000000000000,Kanye West,testtrackname,testplaylist
321557,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
321558,00000000000000000000000000000000,Post Malone,testtrackname,testplaylist
321559,00000000000000000000000000000000,Future,testtrackname,testplaylist
...,...,...,...,...
326550,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
326551,00000000000000000000000000000000,Snoop Dogg,testtrackname,testplaylist
326552,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
326553,00000000000000000000000000000000,Drake,testtrackname,testplaylist


In [578]:
# Keeping only artists that appear with a minimum frequency:
# artists with fewer than 50 rows in the dataset are dropped.

In [579]:
# Count the rows of every artist, then keep only the rows whose artist occurs
# at least 50 times. Row order and index labels are left as they were.
artist_counts = df_playlist['artist'].value_counts()
frequent_artists = artist_counts[artist_counts >= 50].index
df_playlist = df_playlist[df_playlist['artist'].isin(frequent_artists)]
df_playlist.loc[df_playlist['user_id'] == temp_user_id]

Unnamed: 0,user_id,artist,track,playlist
321555,00000000000000000000000000000000,Eminem,testtrackname,testplaylist
321556,00000000000000000000000000000000,Kanye West,testtrackname,testplaylist
321557,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
321558,00000000000000000000000000000000,Post Malone,testtrackname,testplaylist
321559,00000000000000000000000000000000,Future,testtrackname,testplaylist
...,...,...,...,...
326550,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
326551,00000000000000000000000000000000,Snoop Dogg,testtrackname,testplaylist
326552,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
326553,00000000000000000000000000000000,Drake,testtrackname,testplaylist


In [580]:
# Keeping users with at least 10 unique artists in their playlists to lessen
# the impact of COLD START PROBLEM

In [581]:
# Number of distinct artists per user, broadcast back onto each of that user's
# rows; only rows of users with at least 10 distinct artists are kept.
artists_per_user = df_playlist.groupby('user_id')['artist'].transform('nunique')
df_playlist = df_playlist[artists_per_user >= 10]
df_playlist.loc[df_playlist['user_id'] == temp_user_id]

Unnamed: 0,user_id,artist,track,playlist
321555,00000000000000000000000000000000,Eminem,testtrackname,testplaylist
321556,00000000000000000000000000000000,Kanye West,testtrackname,testplaylist
321557,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
321558,00000000000000000000000000000000,Post Malone,testtrackname,testplaylist
321559,00000000000000000000000000000000,Future,testtrackname,testplaylist
...,...,...,...,...
326550,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
326551,00000000000000000000000000000000,Snoop Dogg,testtrackname,testplaylist
326552,00000000000000000000000000000000,Kid Cudi,testtrackname,testplaylist
326553,00000000000000000000000000000000,Drake,testtrackname,testplaylist


In [582]:
# Grouping by the frequency count for each users artists (# of times that an
# artist appeared in playlists created by a user)

In [583]:
# Row count per (user_id, artist) pair, stored in a 'freq' column and sorted
# from the most to the least frequent pair. `reset_index(name='freq')` names the
# count column directly instead of renaming an implicit column labelled 0.
df_freq = (
    df_playlist
    .groupby(['user_id', 'artist'])
    .size()
    .reset_index(name='freq')
    .sort_values(['freq'], ascending=False)
)

df_freq.loc[df_freq['user_id'] == temp_user_id]

Unnamed: 0,user_id,artist,freq
9,0,Post Malone,511
7,0,Lil Baby,497
5,0,Kendrick Lamar,483
1,0,Drake,457
8,0,Lil Wayne,450
0,0,50 Cent,444
4,0,Kanye West,444
3,0,Future,443
2,0,Eminem,438
6,0,Kid Cudi,424


In [584]:
# Creating a data frame for artists and artist id
# Artist to artist id mapping

In [585]:
# One row per distinct artist (in order of first appearance in df_freq); the
# positional index is turned into the 'artist_id' column.
unique_artists = df_freq['artist'].unique()
df_artist = (
    pd.DataFrame({'artist': unique_artists})
    .rename_axis('artist_id')
    .reset_index()
)
df_artist.head()

Unnamed: 0,artist_id,artist
0,0,Post Malone
1,1,Lil Baby
2,2,Kendrick Lamar
3,3,Drake
4,4,Lil Wayne


In [586]:
df_artist.shape

(1072, 2)

In [587]:
# Adding artist_id to the frequency data frame by joining on the artist name
df_freq = df_freq.merge(df_artist, on='artist', how='inner')

In [588]:
#########
# Using the lightfm library to run a traditional MF (matrix factorization) model
# since the dataset doesn't include any user or artist features.
# The library also allows building a hybrid model (CF and CBF?).
# examples: https://github.com/lyst/lightfm/blob/master/examples/

In [589]:
## Parameter Tuning
def sample_hyperparameters():
    """
    Endlessly generate randomly drawn hyperparameter dictionaries.

    Every value is drawn from numpy's global random state, in the order the
    keys are listed below. The generator never terminates on its own; callers
    are expected to bound it (e.g. with itertools.islice).

    Yields:
        dict: with keys "no_components", "learning_schedule", "loss",
        "learning_rate", "item_alpha", "user_alpha", "max_sampled" and
        "num_epochs".
    """
    rng = np.random
    schedules = ["adagrad", "adadelta"]
    losses = ["bpr", "warp", "warp-kos"]

    while True:
        # The draws are kept in this exact order so that a seeded run produces
        # the same sequence of candidates.
        no_components = rng.randint(16, 64)
        learning_schedule = rng.choice(schedules)
        loss = rng.choice(losses)
        learning_rate = rng.exponential(0.05)
        item_alpha = rng.exponential(1e-8)
        user_alpha = rng.exponential(1e-8)
        max_sampled = rng.randint(5, 15)
        num_epochs = rng.randint(5, 50)

        yield dict(
            no_components=no_components,
            learning_schedule=learning_schedule,
            loss=loss,
            learning_rate=learning_rate,
            item_alpha=item_alpha,
            user_alpha=user_alpha,
            max_sampled=max_sampled,
            num_epochs=num_epochs,
        )

In [590]:
# Creating an interaction matrix df from transactional type interactions
def create_interaction_matrix(df, user_col, item_col, rating_col, norm=False, threshold=None):
    """
    Pivot transactional rows into a user x item interaction matrix.

    Ratings are summed per (user, item) pair; pairs that never occur are
    filled with 0.

    Args:
        df: DataFrame with one row per interaction.
        user_col: name of the column that becomes the row index.
        item_col: name of the column whose values become the column labels.
        rating_col: name of the numeric column that is summed per pair.
        norm: when True, binarise the matrix (1 where the summed rating is
            strictly greater than ``threshold``, otherwise 0).
        threshold: cut-off used when ``norm`` is True; ignored otherwise.

    Returns:
        DataFrame indexed by ``user_col`` with one column per item.

    Raises:
        ValueError: if ``norm`` is True but no ``threshold`` was given
            (previously this surfaced as an opaque TypeError from comparing
            numbers with None).
    """
    if norm and threshold is None:
        raise ValueError("threshold must be provided when norm=True")

    interactions = (
        df.groupby([user_col, item_col])[rating_col]
        .sum()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(user_col)
    )
    if norm:
        # Vectorised comparison instead of the deprecated element-wise applymap.
        interactions = (interactions > threshold).astype('int64')
    return interactions

In [591]:
# Function to create a user dictionary based on their index and number in
# interaction dataset
def create_user_dict(interactions):
    """
    Map every row label of ``interactions`` to its 0-based row position.

    Args:
        interactions: object exposing an ``index`` of user ids
            (the interaction matrix DataFrame).

    Returns:
        dict: ``{user_id: row_position}``. If a label occurs more than once,
        the position of its last occurrence wins.
    """
    return {label: position for position, label in enumerate(interactions.index)}

In [592]:
# Function to create an item dictionary based on their item_id and item name
def create_item_dict(df, id_col, name_col):
    """
    Build an ``{item_id: item_name}`` lookup from two columns of ``df``.

    The columns are paired row by row, so the result does not depend on the
    DataFrame's index labels. (Looking rows up with ``df.loc[i, ...]`` for
    ``i in range(len(df))`` only works when the index is exactly 0..n-1 and
    raises KeyError, or pairs the wrong rows, otherwise.)

    Args:
        df: DataFrame holding the item table.
        id_col: name of the column with the item ids (dict keys).
        name_col: name of the column with the item names (dict values).

    Returns:
        dict: item id -> item name. If an id occurs more than once, the name
        from its last row wins.
    """
    return dict(zip(df[id_col], df[name_col]))

In [593]:
# Runs the matrix factorization algorithm and returns the trained model
def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30, n_jobs = 4):
    """
    Fit a LightFM model on ``interactions`` and return it.

    Args:
        interactions: interaction matrix passed straight to ``LightFM.fit``.
        n_components: forwarded to LightFM as ``no_components``.
        loss: forwarded to LightFM as ``loss``.
        k: forwarded to LightFM as ``k``.
        epoch: number of training epochs.
        n_jobs: number of threads used for fitting.

    Returns:
        The fitted LightFM instance.
    """
    mf_model = LightFM(
        no_components=n_components,
        loss=loss,
        k=k,
    )
    mf_model.fit(
        interactions,
        epochs=epoch,
        num_threads=n_jobs,
    )
    return mf_model

In [594]:
# Parameter Tuning
def random_search(train, test, num_samples=50, num_threads=4):
    """
    Random hyperparameter search: fit and score one LightFM model per sample.

    Args:
        train: training interactions, used for fitting and for the train AUC.
        test: held-out interactions, used for the test AUC.
        num_samples: how many candidates to draw from sample_hyperparameters().
        num_threads: thread count for fitting and scoring.

    Yields:
        tuple: ``(test_auc, train_auc, hyperparams, model)`` for each
        candidate, where ``hyperparams`` includes "num_epochs" as its last key.
    """
    candidates = itertools.islice(sample_hyperparameters(), num_samples)
    for candidate in candidates:
        # "num_epochs" is an argument of fit(), not of the LightFM constructor,
        # so it is split off before building the model.
        model_kwargs = dict(candidate)
        num_epochs = model_kwargs.pop("num_epochs")

        model = LightFM(**model_kwargs)
        model.fit(train, epochs=num_epochs, num_threads=num_threads)

        test_scores = auc_score(model, test, train_interactions=train, num_threads=num_threads)
        test_auc = test_scores.mean()
        train_scores = auc_score(model, train, num_threads=num_threads)
        train_auc = train_scores.mean()

        hyperparams = {**model_kwargs, "num_epochs": num_epochs}

        yield (test_auc, train_auc, hyperparams, model)

In [595]:
# Produces user recommendations, prints the list of items the user already 
# prefers, prints the list of N recommended items which user will hopefully 
# prefer.
def sample_recommendation_user(model, interactions, user_id, user_dict, item_dict, threshold = 0, nrec_items = 10, show = True):
    """
    Recommend the top-scoring items a user has not interacted with yet.

    Args:
        model: object with a ``predict(user_index, item_indices)`` method
            returning one score per item index (e.g. a fitted LightFM model).
        interactions: user x item DataFrame; rows are labelled by user id and
            columns by item id, in the item order the model was trained on.
        user_id: row label of the user in ``interactions``.
        user_dict: mapping user id -> model row index.
        item_dict: mapping item id -> display name.
        threshold: an item counts as already known when the user's
            interaction value is strictly greater than this.
        nrec_items: maximum number of recommendations to return.
        show: when truthy, print the known items and the recommendations.

    Returns:
        list: ids of the recommended items, best first.

    Raises:
        KeyError: if ``user_id`` is missing from ``user_dict`` or
            ``interactions``, or an item id is missing from ``item_dict``.
    """
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]

    # Score every item for this user and rank the item ids from best to worst.
    scores = pd.Series(model.predict(user_x, np.arange(n_items)))
    scores.index = interactions.columns
    ranked_items = scores.sort_values(ascending=False).index.tolist()

    # Items the user already interacted with, listed by descending item id.
    user_row = interactions.loc[user_id, :]
    known_items = pd.Series(user_row[user_row > threshold].index).sort_values(ascending=False).tolist()

    # Set membership keeps the filter linear in the number of items.
    known_lookup = set(known_items)
    return_score_list = [item for item in ranked_items if item not in known_lookup][:nrec_items]

    if show:
        known_names = [item_dict[item] for item in known_items]
        recommended_names = [item_dict[item] for item in return_score_list]

        # format() stringifies the name, so non-string item names no longer
        # break the output with a TypeError.
        print("Known Likes:")
        for rank, name in enumerate(known_names, start=1):
            print("{}- {}".format(rank, name))

        print("\n Recommended Items:")
        for rank, name in enumerate(recommended_names, start=1):
            print("{}- {}".format(rank, name))
    return return_score_list

In [596]:
# Prepping the model inputs

In [597]:
# Create interaction matrix
# Users as rows, artist ids as columns, summed play frequency as values
# (no binarisation: norm=False).
interactions = create_interaction_matrix(
    df=df_freq,
    user_col="user_id",
    item_col="artist_id",
    rating_col='freq',
    norm=False,
    threshold=None,
)
interactions.head()

artist_id,0,1,2,3,4,5,6,7,8,9,...,1062,1063,1064,1065,1066,1067,1068,1069,1070,1071
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000000000000000000000000000000,511.0,497.0,483.0,457.0,450.0,444.0,444.0,443.0,438.0,424.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000c11a16c89aa4b14b328080f5954ee,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00123e0f544dee3ab006aa7f1e5725a7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0025022960e5f0d7d01af5d840014594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
00287ecf491882a40ff34b0fd75a5b16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [598]:
## NEW
# Prep the data for surprise. 
# surprise_df = df_freq.copy(deep=True)
# for index, row in surprise_df.iterrows():
#     if row['freq'] > 200:
#         surprise_df.at[index, 'freq'] = 200

In [599]:
# surprise_df.head()

In [600]:
# def get_top_n(user_id, predictions, n=10):
#     """Return the top-N recommendation for each user from a set of predictions.

#     Args:
#         predictions(list of Prediction objects): The list of predictions, as
#             returned by the test method of an algorithm.
#         n(int): The number of recommendation to output for each user. Default
#             is 10.

#     Returns:
#     A dict where keys are user (raw) ids and values are lists of tuples:
#         [(raw item id, rating estimation), ...] of size n.
#     """

#     # First map the predictions to each user.
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[user_id].append((iid, est))

#     # Then sort the predictions for each user and retrieve the k highest ones.
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]

#     return top_n

In [601]:
# reader = Reader(rating_scale=(0,200))
# data = Dataset.load_from_df(surprise_df[["user_id", "artist_id", "freq"]], reader)
# trainingSet = data.build_full_trainset()

# algo = SVD()
# algo.fit(trainingSet)

# # Then predict ratings for all pairs (u, i) that are NOT in the training set.
# testset = trainingSet.build_anti_testset()
# predictions = algo.test(testset)

In [602]:
# top_n = get_top_n(predictions, n=10)

# # Print the recommended items for each user

# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

In [603]:
# artists_dict = create_item_dict(df=df_artist, id_col="artist_id", name_col="artist")
# artists_dict

## NEW END

In [604]:
interactions.shape

(3345, 1072)

In [605]:
# temp

In [606]:
# Create user dict
# user_id (row label of the interaction matrix) -> row position
user_dict = create_user_dict(interactions)

In [607]:
# Create item dict
# artist_id -> artist name
artists_dict = create_item_dict(
    df=df_artist,
    id_col="artist_id",
    name_col="artist",
)

In [608]:
# Train-test split
# Convert the dense interaction matrix to CSR form, then hold out 20% of the
# interactions for testing.
# NOTE(review): random_state=None presumably leaves the split unseeded, so
# train/test (and every score below) differ between runs -- confirm.
x = sparse.csr_matrix(interactions.values)
train, test = cross_validation.random_train_test_split(
    x,
    test_percentage=0.2,
    random_state=None,
)

In [609]:
# How does a matrix factorization model work?
# https://developers.google.com/machine-learning/recommendation/collaborative/matrix

In [610]:
# Train the matrix factorization model
# Can do hyper-parameter tuning for better results
# %time
# model = runMF(interactions = train, n_components=30, loss='warp', k=15, epoch=30, n_jobs=4)

In [611]:
## New Code
# Run the random search and keep the candidate with the highest test AUC
# (the first element of every yielded tuple).
best_candidate = max(
    random_search(train, test, num_threads=4),
    key=lambda candidate: candidate[0],
)
test_auc, train_auc, hyperparams, model = best_candidate

print("Best Test AUC Score: {}\nTrain AUC Score: {}\nParams: {}".format(test_auc, train_auc, hyperparams))

## End New Code

Best Test AUC Score: 0.8025776743888855
Train AUC Score: 0.9214357733726501
Params: {'no_components': 35, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.025644415138587746, 'item_alpha': 1.909711769084892e-08, 'user_alpha': 1.563410797457356e-08, 'max_sampled': 9, 'num_epochs': 31}


In [612]:
# Evaluation metrics
# Compute AUC score for train set
# Original
# train_auc = auc_score(model, train, num_threads=4).mean()
# print('Train AUC: %s' % train_auc)

In [613]:
# Compute AUC score test set
# the parameter train_interactions allows you to exclude known positives in
# training set from the predictions and score calculations
# this is to avoid re-recommending the items the user has already interacted with
# Original
# test_auc = auc_score(model, test, train_interactions=train, num_threads=4).mean()
# print('Test AUC: %s' % test_auc)

In [614]:
# precision@10 per user, averaged over all users. For the test score the
# training interactions are passed as train_interactions.
train_precision_per_user = precision_at_k(model, train, k=10)
test_precision_per_user = precision_at_k(model, test, k=10, train_interactions=train)
train_precision = train_precision_per_user.mean()
test_precision = test_precision_per_user.mean()

In [615]:
print('train Precision %.2f, test Precision %.2f.' % (train_precision, test_precision))

train Precision 0.30, test Precision 0.04.


In [616]:
# Draw one random user from the interaction matrix.
# NOTE(review): random_user_id is not used by the call below, which asks for
# recommendations for temp_user_id (the synthetic test user) -- confirm that
# this is intended.
random_cell = interactions.sample()
random_user_id = random_cell.index[0]

rec_list = sample_recommendation_user(
    model=model,
    interactions=interactions,
    user_id=temp_user_id,
    user_dict=user_dict,
    item_dict=artists_dict,
    threshold=0,
    nrec_items=10,
    show=True,
)

Known Likes:
1- Snoop Dogg
2- Kid Cudi
3- Eminem
4- Future
5- Kanye West
6- 50 Cent
7- Lil Wayne
8- Drake
9- Kendrick Lamar
10- Lil Baby
11- Post Malone

 Recommended Items:
1- JAY Z
2- Nas
3- The Notorious B.I.G.
4- 2Pac
5- Ludacris
6- T.I.
7- Childish Gambino
8- OutKast
9- Rick Ross
10- Chris Brown


In [617]:
print(type(test_auc))

<class 'numpy.float32'>
