In [9]:
!pip install scikit-surprise



In [10]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
import pandas as pd
import os
import numpy as np
from collections import defaultdict

In [11]:
ratings_file = "datasets/piki_dataset.csv"
df = pd.read_csv(ratings_file)

In [12]:
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 100))

data = Dataset.load_from_df(df[['user_id', 'song_id', 'spotify_popularity']], reader)

# cross validation , splits data into trainset and testset
trainset, testset = train_test_split(data, test_size=.2)

In [13]:
# kf = KFold(n_splits=3)
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)
# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 7.7473


7.74733216662211

In [14]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [15]:
test = []
for i in range(1,10000):
  tup = (df.loc[i]['user_id'], df.loc[i]['song_id'], df.loc[i]['spotify_popularity'])
  test.append(tup)
predictions = algo.test(test)

top_n = get_top_n(predictions, n=10)

In [17]:
print(top_n)
new = pd.DataFrame.from_dict(top_n, orient='index')
new.to_csv('SVD_response.csv')

defaultdict(<class 'list'>, {3720277: [(4227953, 92.65001621812388), (254, 90.90973008280967), (1763031, 85.80682883704662), (4056, 85.603224420455), (28739, 84.75517817051357), (1830214, 81.93547101079312), (91297, 81.89384247028347), (151950, 81.60228054779529), (4179913, 80.98594930774529), (1786012, 80.9833150436001)], 3720670: [(1761306, 97.4238612069334), (1841779, 93.03355335549988), (1829686, 91.96356802762261), (3398743, 86.63392644090237), (3884, 84.6141817213433), (1907274, 80.29056193402158), (31417, 78.83788592935157), (1763077, 78.81017457834889), (214090, 78.57045427544932), (541, 78.16947232623194)], 3721232: [(3123, 71.05943185355832), (4125304, 70.83999111811617), (2142, 70.17053865108645), (10298, 69.97512613732336), (9439, 65.07485319704894), (7069, 58.7780967685281), (1754197, 56.00015865828241), (3306538, 54.959368602472175), (20121, 54.10048652246711), (4405563, 53.31966166041254)], 3721278: [(3123, 71.57213041027319), (4185634, 69.26393007833393), (2142, 67.9555