In [33]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

In [34]:
# Load the ratings table; its 'item_id' column is inspected below and the whole
# frame is later handed to Surprise via Dataset.load_from_df.
ratings_df  = pd.read_csv('../data/ratings.csv')

In [35]:
# Count the distinct items in the ratings data; this number informs the choice of k.

ratings_df['item_id'].nunique()

10974

In [36]:
# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1,5))  # Surprise Reader describing how to interpret the ratings
# NOTE(review): load_from_df presumably expects exactly the user, item and rating
# columns, in that order -- confirm against the columns of ratings.csv.
data=Dataset.load_from_df(ratings_df,reader) # wrap the DataFrame in a Surprise Dataset

# Holdout method

In [37]:
# I tried grid search, but on a dataset this large it took far too long to run, so instead
# I tried a few different values of k and min_k by hand using the holdout method.

In [38]:
# Hold out 20% of the ratings for testing. The seed is fixed so that the split,
# and therefore every metric computed from it, is the same after a kernel restart
# (with random_state=None each run produced a different split).
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)

In [39]:
# Similarity settings shared by every KNN model built in this notebook.
sim_options = {
    'name': 'pearson', # Pearson correlation (Surprise's default measure is MSD)
    'user_based': False, # False => similarities are computed between items (item-based CF)
    'min_support':25 # minimum number of common users for an item-item similarity to be non-zero
}

In [40]:
# KNN on the hold-out split
knn = KNNBasic(sim_options=sim_options,k=10,min_k=5) # at most 10 neighbours (k), at least 5 (min_k)
knn.fit(trainingSet) # fit the model to the training set
predictions_knn = knn.test(testSet) # predict a rating for every entry of the test set

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [41]:
# RMSE of the hold-out predictions; with verbose=True the value is printed as well as returned.
accuracy.rmse(predictions_knn, verbose=True)

RMSE: 0.9416


0.9415999193823226

In [42]:
# Show only a small sample; displaying the whole list floods the notebook output.
predictions_knn[:10]

[Prediction(uid='the_abomination', iid=295110, r_ui=2.0, est=2.5134124151376347, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='wassupdardy', iid=72850, r_ui=4.0, est=2.7418674082241115, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='altas2011', iid=8850, r_ui=2.0, est=2.198298821575094, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='nolosing', iid=72850, r_ui=4.0, est=2.3304839339333876, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561198053140237', iid=202970, r_ui=1.0, est=2.647489289430651, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561198100410705', iid=258160, r_ui=1.0, est=1.4495943177159543, details={'actual_k': 9, 'was_impossible': False}),
 Prediction(uid='76561198097037228', iid=24240, r_ui=3.0, est=1.5154971642923207, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561197967584607', iid=9070, r_ui=2.0, est=1.727266465544664, details=

# Precision at k

In [43]:
from collections import defaultdict
def precision_recall_at_k(predictions, k, threshold):
    """Compute precision@k and recall@k separately for every user.

    Args:
        predictions: iterable of 5-item records unpacked as
            (uid, iid, true rating, estimated rating, details).
        k: number of top-ranked items (by estimated rating) to consider.
        threshold: a rating at or above this value counts as "relevant"
            (true rating) or "recommended" (estimated rating).

    Returns:
        A (precisions, recalls) pair of dicts keyed by uid. A metric that is
        undefined for a user (zero denominator) is reported as 0.
    """
    # Group (estimate, truth) pairs by the user they belong to.
    grouped = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        grouped[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, pairs in grouped.items():
        # Highest estimates first; ties keep their original relative order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over ALL of the user's items, not just the top k.
        relevant_total = sum((actual >= threshold) for (_, actual) in ranked)
        # Items in the top k whose estimate clears the threshold.
        recommended = sum((guess >= threshold) for (guess, _) in top_k)
        # Items in the top k that are both relevant and recommended.
        hits = sum(((actual >= threshold) and (guess >= threshold))
                   for (guess, actual) in top_k)

        # Precision@k: share of recommended items that are relevant (0 if nothing recommended).
        if recommended != 0:
            precisions[uid] = hits / recommended
        else:
            precisions[uid] = 0

        # Recall@k: share of relevant items that were recommended (0 if nothing relevant).
        if relevant_total != 0:
            recalls[uid] = hits / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls

In [None]:
from surprise.model_selection import KFold

kf = KFold(n_splits=5)
knn = KNNBasic(sim_options=sim_options, k=10, min_k=5)

# Refit the same KNN configuration on each of the 5 folds and score its held-out predictions.
for trainset, testset in kf.split(data):
    knn.fit(trainset)
    predictions = knn.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=2.5)

    # Average the per-user scores into one precision and one recall per fold,
    # printed in that order.
    mean_precision = sum(precisions.values()) / len(precisions)
    mean_recall = sum(recalls.values()) / len(recalls)
    print(mean_precision)
    print(mean_recall)

# Recommendation

In [None]:
# Load the user/items data; in the cells shown here only its user_id column is used (to build user_dict).
ui_df = pd.read_json("../data/australian_users_items_fixed.json")

In [None]:
# Preview the first rows to check the available columns.
ui_df.head()

In [None]:
# Map each user id rendered as a string to the id exactly as stored in ui_df, so a
# user given as text can be translated to the stored value.
# Iterating the column directly avoids building a row Series with .iloc twice per
# row, which is very slow on a large frame; the resulting dict is the same.
user_dict = {str(user_id): user_id for user_id in ui_df['user_id']}

In [None]:
# Load the game metadata; its id and title columns are used below to build game_dict.
game_df = pd.read_json("../data/steam_games_fixed.json")

In [None]:
# Preview the first rows to check the available columns.
game_df.head()

In [None]:
# Map each game id rendered as a string to the game's title.
# Zipping the two columns avoids building a row Series with .iloc twice per row,
# which is very slow on a large frame. If an id repeats, the last title wins,
# exactly as with the row-by-row loop.
game_dict = {
    str(game_id): title
    for game_id, title in zip(game_df['id'], game_df['title'])
}

In [None]:
# Build a trainset from the full dataset (no hold-out) for the final recommender.
trainset = data.build_full_trainset()

# Train a KNNBasic model with the same sim_options, k and min_k as in the
# hold-out and K-fold experiments above.
algo = KNNBasic(sim_options=sim_options,k=10,min_k=5)
algo.fit(trainset)

In [None]:
# Candidate (user, item) pairs to score. Per the Surprise docs, the anti-testset holds
# the pairs of known users and known items that have no rating in the trainset.
testset = trainset.build_anti_testset()

In [None]:
# Because the full anti-testset is too large, keep only its first 1,097,400 rows.
# That number comes from the 10,974 unique items in ratings.csv and my decision to
# test only about 100 users: 10974 x 100 = 1097400.
# The slice actually covers 103 users, because the number of unrated items per user
# is not exactly 10,974.
testset = testset[:1097400]

In [None]:
# Estimate a rating for every (user, item) pair kept in the truncated testset.
predictions = algo.test(testset)

In [None]:
# Show only a small sample; the full list holds about 1.1 million predictions and
# displaying all of it bloats the notebook.
predictions[:5]

In [None]:
from collections import defaultdict

def getGameRecommendations(topN=3, preds=None):
    """Return the topN highest-estimated items for every user.

    Args:
        topN: number of recommendations to keep per user.
        preds: iterable of 5-item records unpacked as
            (uid, iid, true rating, estimated rating, details). When omitted,
            the notebook-level ``predictions`` variable is used, as before.
            Passing it explicitly is safer, because ``predictions`` is also
            rebound by the K-fold evaluation cell.

    Returns:
        defaultdict(list) mapping uid -> list of (item key, estimate) tuples,
        sorted by estimate in descending order and cut to topN entries.
    """
    if preds is None:
        # Backward-compatible fallback to the notebook global.
        preds = predictions

    top_recs = defaultdict(list)
    for uid, iid, _true_r, est, _details in preds:
        # The '.0' suffix makes the key look like a stringified float id.
        # NOTE(review): presumably this matches how game_dict keys are built
        # (str() of a float id column) -- confirm against game_df.
        top_recs[uid].append((str(iid) + '.0', est))

    # Only existing keys are reassigned here, so the dict size does not change
    # while it is being iterated.
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]

    return top_recs

In [None]:
# Keep the 3 highest-estimated games for each user that has predictions.
recommendations = getGameRecommendations(3)

In [None]:
def getGameName(game_id):
    """Return the title stored in game_dict for game_id, or game_id itself if it has no entry."""
    return game_dict.get(game_id, game_id)

In [None]:
def getGameRecommendationsForUser(userId, recommendations):
    """Return the recommended games for one user as (name, estimated rating) tuples.

    Args:
        userId: user id as a string; it must be a key of user_dict.
        recommendations: mapping from the stored user id to a list of
            (game key, estimate) tuples.

    Returns:
        A list of (game name or game key, estimate) tuples, or None after
        printing a message when userId is not in user_dict.
    """
    if userId in user_dict:
        # Translate the string id to the stored id used as key in `recommendations`.
        stored_id = user_dict[userId]
        return [(getGameName(entry[0]), entry[1]) for entry in recommendations[stored_id]]

    print("User id is not present")
    return None

In [None]:
# Example: show the recommended games for one user, looked up by string id.
getGameRecommendationsForUser('js41637',recommendations)

In [None]:
# For this user the second recommended game is missing from the game dataframe,
# so getGameName falls back to showing the game id instead of a title.
getGameRecommendationsForUser('76561197970982479',recommendations)