In [1]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

In [2]:
# Load the ratings table; it is handed unchanged to Surprise's Dataset.load_from_df below.
ratings_df  = pd.read_csv('../data/ratings.csv')

In [3]:
# Count the distinct item ids in the ratings data to help decide the value of k.

len(set(ratings_df['item_id']))

10974

In [4]:
# Reader instance telling Surprise that ratings lie on a 1-5 scale.
reader = Reader(rating_scale=(1,5))
# Load the ratings DataFrame into Surprise's Dataset structure.
# NOTE: per the Surprise docs, load_from_df expects the columns in the order
# user id, item id, rating -- confirm ratings.csv has exactly that layout.
data=Dataset.load_from_df(ratings_df,reader)

# Holdout method

In [5]:
# I tried grid search, but because of the size of the dataset it took far too long to run,
# so instead I tried a few different values of k and min_k with the holdout method.

In [6]:
# Hold out 20% of the ratings for testing. A fixed random_state makes the shuffle,
# and therefore the split and every metric computed from it, reproducible across runs.
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)

In [7]:
sim_options = {
    'name': 'pearson', # similarity measure (Surprise's default is MSD)
    'user_based': False, # False -> similarities are computed between items (item-based CF)
    'min_support':25 # minimum number of common users for an item-item similarity to be non-zero
}

In [11]:
# Item-based KNN (see sim_options) evaluated with the holdout split.
knn = KNNBasic(sim_options=sim_options,k=10,min_k=5) # at most k=10 neighbours, at least min_k=5 required
knn.fit(trainingSet) # fit the model to the training set
predictions_knn = knn.test(testSet) # predict ratings for the held-out test set

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [12]:
# Root-mean-square error of the KNN predictions on the held-out test set.
accuracy.rmse(predictions_knn, verbose=True) 

RMSE: 0.9412


0.9411844264384757

In [13]:
# Preview only the first few predictions; displaying the whole list bloats the notebook.
predictions_knn[:10]

[Prediction(uid='kenny536', iid=41800, r_ui=1.0, est=1.4100691972662873, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='epiclyralphy', iid=239160, r_ui=2.0, est=2.105794391398108, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='RawrForMercy', iid=205790, r_ui=1.0, est=1.2375358018388056, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='cosimatron', iid=377160, r_ui=4.0, est=2.6868446835298183, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='munkii', iid=312610, r_ui=1.0, est=1.9804572080796794, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='Kitteh', iid=409720, r_ui=1.0, est=1.5722837854374043, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561198082776838', iid=409720, r_ui=1.0, est=2.1344392392747773, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561198009001563', iid=6000, r_ui=3.0, est=2.491928854932452, details={'actual_k': 10, 

# Precision at k

In [14]:
from collections import defaultdict
def precision_recall_at_k(predictions, k, threshold):
    """Compute precision@k and recall@k separately for every user.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_rating, estimate, details)``.
        k: how many of each user's highest-estimated items count as "recommended".
        threshold: a rating (true or estimated) at or above this value counts
            as relevant / recommended.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by uid. A metric whose
        denominator is zero (undefined) is reported as 0.
    """
    # Collect (estimate, true rating) pairs under each user id.
    ratings_by_user = defaultdict(list)
    for prediction in predictions:
        user, _item, actual, estimated, _details = prediction
        ratings_by_user[user].append((estimated, actual))

    precisions = {}
    recalls = {}
    for user, pairs in ratings_by_user.items():
        # Best estimates first; the leading k entries are the recommendations.
        pairs.sort(key=lambda pair: pair[0], reverse=True)
        top_k = pairs[:k]

        # Relevant items over the user's whole list.
        relevant_total = sum(1 for _, actual in pairs if actual >= threshold)

        # Items in the top k whose estimate clears the threshold.
        recommended = sum(1 for estimated, _ in top_k if estimated >= threshold)

        # Items in the top k that are both recommended and truly relevant.
        hits = sum(
            1
            for estimated, actual in top_k
            if actual >= threshold and estimated >= threshold
        )

        # Precision@k: share of recommended items that are relevant (0 if undefined).
        if recommended != 0:
            precisions[user] = hits / recommended
        else:
            precisions[user] = 0

        # Recall@k: share of relevant items that were recommended (0 if undefined).
        if relevant_total != 0:
            recalls[user] = hits / relevant_total
        else:
            recalls[user] = 0

    return precisions, recalls

In [34]:
from surprise.model_selection import KFold

# 5-fold cross-validation. A fixed random_state keeps the fold assignment, and
# therefore the printed precision/recall values, reproducible across runs.
kf = KFold(n_splits=5, random_state=42)
knn = KNNBasic(sim_options=sim_options,k=10,min_k=5)

for trainset, testset in kf.split(data):
    knn.fit(trainset)
    predictions = knn.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=3)

    # Average the per-user metrics for this fold:
    # the first printed line is precision@10, the second is recall@10.
    print(sum(precisions.values()) / len(precisions))
    print(sum(recalls.values()) / len(recalls))

Computing the pearson similarity matrix...
Done computing similarity matrix.
0.20288938687468355
0.10472484944654947
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.20442290976207966
0.1052983848141981
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.2022364941499845
0.10492564926521318
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.20008285097406062
0.10460332517935374
Computing the pearson similarity matrix...
Done computing similarity matrix.
0.20221893173391772
0.1055496897771085


# Recommendation

In [16]:
# Load the per-user item data (one row per user; see the preview in the next cell).
ui_df = pd.read_json("../data/australian_users_items_fixed.json")

In [17]:
ui_df.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982480,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864384,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712560,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445856,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099488,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [18]:
# Map each user id, rendered as a string, to the id value stored in the DataFrame.
# Built in one pass over the column; the previous per-row .iloc lookups created a
# full row Series twice per user, which is far slower for the same result.
user_dict = {str(user_id): user_id for user_id in ui_df['user_id']}

In [20]:
# Load the item_id -> item_name pairs used to turn recommended ids into game names.
game_df = pd.read_csv("../processed_data/all_games_id_name_pair.csv")

In [21]:
game_df.head()

Unnamed: 0,item_id,item_name
0,10.0,Counter-Strike
1,20.0,Team Fortress Classic
2,30.0,Day of Defeat
3,40.0,Deathmatch Classic
4,50.0,Half-Life: Opposing Force


In [22]:
# Map each item id, rendered as a string, to its game name.
# item_id is stored as a float, so the keys look like '10.0' (see the preview above);
# any lookup must use that same format.
# Built in one pass over the columns instead of two per-row .iloc lookups per game.
game_dict = {
    str(item_id): item_name
    for item_id, item_name in zip(game_df['item_id'], game_df['item_name'])
}

In [23]:
# Retrieve the trainset.
trainset = data.build_full_trainset()

# Build an algorithm, and train it. Follow methodology provided previously
algo = KNNBasic(sim_options=sim_options,k=10,min_k=5)
algo.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fc495c82880>

In [24]:
# Anti-testset: per the Surprise docs, every (user, item) pair that has no rating in the
# trainset, with r_ui filled in by the global mean rating. These are the candidates to score.
testset = trainset.build_anti_testset()

In [25]:
# Because the full anti-testset is too large, keep only its first 1,097,400 entries.
# 1,097,400 = 10,974 unique items in ratings.csv x 100 users, i.e. roughly 100 users' worth of pairs.
# The slice actually covers 103 users, because the number of unrated items per user
# is not exactly 10,974.
testset = testset[:1097400]

In [26]:
# Estimate a rating for every (user, item) pair in the truncated anti-testset.
predictions = algo.test(testset)

In [27]:
# Preview only the first few predictions; the full list has over a million entries.
predictions[:10]

[Prediction(uid='76561197970982479', iid=80, r_ui=1.9174700894758845, est=1.754106609549855, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=100, r_ui=1.9174700894758845, est=1.3278965181685405, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=3910, r_ui=1.9174700894758845, est=2.133207462143634, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=4000, r_ui=1.9174700894758845, est=2.168188111851549, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=6880, r_ui=1.9174700894758845, est=2.2720915738706005, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=2200, r_ui=1.9174700894758845, est=1.3392995337727562, details={'actual_k': 10, 'was_impossible': False}),
 Prediction(uid='76561197970982479', iid=2270, r_ui=1.9174700894758845, est=1.762726938349888, details={'actual_k': 10

In [28]:
from collections import defaultdict

def getGameRecommendations(topN=3, preds=None):
    """Return the topN highest-estimated items for every user.

    Args:
        topN: number of recommendations to keep per user.
        preds: iterable of 5-tuples ``(uid, iid, r_ui, est, details)``.
            When omitted, the notebook-level ``predictions`` variable is used,
            which is what this function always read before the parameter existed.

    Returns:
        A ``defaultdict(list)`` mapping uid to a list of ``(item_key, est)``
        tuples sorted by ``est`` in descending order and cut to ``topN`` entries.
        ``item_key`` is ``str(iid) + '.0'`` so that it matches the
        stringified-float keys of ``game_dict`` (e.g. ``'10.0'``).
    """
    if preds is None:
        # Backward-compatible fallback to the notebook global.
        preds = predictions

    top_recs = defaultdict(list)
    for uid, iid, _true_r, est, _details in preds:
        top_recs[uid].append((str(iid) + '.0', est))

    # Only existing keys are reassigned, so mutating while iterating is safe.
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda rec: rec[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]

    return top_recs

In [29]:
# Keep the three highest-estimated games for each user.
recommendations = getGameRecommendations(3)

In [30]:
def getGameName(game_id):
    """Return the name stored in ``game_dict`` for ``game_id``.

    If the id has no entry, the id itself is returned so callers still get
    something to display.
    """
    return game_dict.get(game_id, game_id)

In [31]:
def getGameRecommendationsForUser(userId, recommendations):
    """Return ``(game name, estimated rating)`` pairs recommended for one user.

    Args:
        userId: key to look up in ``user_dict``.
        recommendations: mapping of user id to a list of ``(item_key, est)`` entries.

    Returns:
        A list of ``(name_or_id, est)`` tuples, where the name comes from
        ``getGameName``. If ``userId`` is not in ``user_dict``, a message is
        printed and ``None`` is returned.
    """
    if userId in user_dict:
        picks = recommendations[user_dict[userId]]
        return [(getGameName(pick[0]), pick[1]) for pick in picks]

    print("User id is not present")
    return None

In [32]:
getGameRecommendationsForUser('js41637',recommendations)

[('Home Design 3D', 3.8207585428352626),
 ('Zombie Party', 3.5308332710288655),
 ('Emily Wants To Play', 3.4458787293077866)]

In [33]:
# If a recommended game is not in the game dataframe, its game id is shown instead of its name.
getGameRecommendationsForUser('76561197970982479',recommendations)

[('Wildlife Park 2 - Crazy Zoo', 3.9751955815407207),
 ('ROD: Revolt Of Defense', 3.945550037988869),
 ('Undead Shadows', 3.9038338631502634)]