In [None]:
# CURRENT --> ADD TESTs / Accuracy reports
# Remove 15 percent of each userid's rows
# train on the 85 percent
# test on the 15 percent that has NOT been used in training
# -problem is that custom functions are used instead of sklearn classifiers
# -so need to figure out the interface between the two and emulate it
# -OR put functions in custom classifier class, needs some methods defined like fit(), train(), ...

# Python User-User Collaborative Filtering Recommender System

# Datafield has 'user_ids' and their 'rating' (1-10) of various 'game_ids'
# Prediction of unrated games for each user_id is made using:
# k-nearest neighbors found using cosine-similarity
# predicted rating based on those near-neighbors' rating weighted by cosine-similarity users

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('inputs/boardgame-elite-users.csv')
rating_ptable = df.pivot_table(index='userID', columns='gameID', values='rating') 

rating_ptable = rating_ptable.fillna(0)

In [2]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(rating_ptable, test_size=.15)

In [3]:
print(train.head)
print('XXXXXXXXX')
print(test.head)

<bound method NDFrame.head of gameID  3       5       10      11      12      13      18      41      \
userID                                                                   
170718    5.00    3.00    4.00    7.00    7.00    6.00    6.00    5.00   
54797     7.00    0.00    6.00    4.00    8.00    6.00    3.00    7.00   
181472    7.00    8.00    8.00    6.00    9.00    7.00    8.00    7.00   
58582     6.00    8.00    7.50    8.00    7.50    7.00    6.00    8.00   
134145    5.00    6.00    6.00    7.00    7.00    7.00    7.00    6.00   
46928     7.00    6.50    8.00    6.00    6.00    5.00    7.00    6.50   
100055    8.00    8.00    5.00    9.00    8.00    9.00   10.00    8.00   
30507     0.00    0.00    8.00    7.50    0.00    8.00    8.00    8.00   
76252     4.00    5.00    6.00    7.00    6.00    7.00    8.50    9.00   
55101     5.00    7.75    5.00    5.00    5.00    8.50    5.00    5.00   
27853     6.50    5.50    7.00    7.50    8.50    5.50    7.00    7.00   
49788   

In [2]:
from sklearn.preprocessing import normalize

norm_train = normalize(train)

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(normalized)

In [14]:
nbor_indx = sim.argsort()[:,::-1]

# HELPER FUNCTIONS
def get_users_that_rated(gameid):
    return rating_ptable[gameid].where(rating_ptable[gameid]>0).dropna().index.tolist()
    
def get_k_nearest_neighbors(userid, gameid, k):
    assert(k>1)
    pot_nbors = get_users_that_rated(gameid)
    userid_indx = rating_ptable.index.get_loc(userid)
    
    sim_userIDs = []
    for x in nbor_indx[userid_indx]:
        if rating_ptable.index[x] in pot_nbors:
            sim_userIDs.append(rating_ptable.index[x])
        if len(sim_userIDs)>k:
            break
    return sim_userIDs[1:]

In [15]:
# Takes userID, gameID, optional k
# Returns predicted rating for gameID
def predict_rating(uid, gameid, k=10):
    sim_uids = get_k_nearest_neighbors(uid, gameid, k)
    accm_score = 0.0
    accm_weight = 0.0
    for nbor_id in sim_uids:
        sim_indx_x = rating_ptable.index.get_loc(uid)
        sim_indx_y = rating_ptable.index.get_loc(nbor_id)
        accm_score += (rating_ptable.loc[nbor_id][gameid] * sim[sim_indx_x][sim_indx_y])
        accm_weight += sim[sim_indx_x][sim_indx_y]
    score = accm_score / accm_weight
    return score

predict_rating(5480, 97903)

6.5095784352606305