In [1]:
# NOTES
# organize output to return top recommendations for each user, top N
# test accuracy of predictions on top N, not that pred numbers are ncsrly same, but top N pred are among
# highest rated

# k nearest villains - recs based on most dissimilar users' lowest rated

# possible check for outliers (dist of ratings vastly different vectors, all one value)

# Consider cross-validation for more accurate testing
# sklearn cross-validation module is the equiv of just shuffling train/test split, retraining, rerun

# Consider wrapping in custom sklearn estimator / classifier, maybe inherit from base classes

# README
# Datafield has 'user_ids' and their 'rating' (1-10) of various 'game_ids'
# Fields = userID , gameID , rating
# Prediction of unrated games for each user_id is made using:
# k-nearest neighbors found using cosine-similarity
# predicted rating based on those near-neighbors' rating weighted by cosine-similarity users

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('inputs/boardgame-elite-users.csv')

train, test = train_test_split(df, test_size=.15)

In [2]:
train_ptable = train.pivot_table(index='userID', columns='gameID', values='rating') 

test_ptable = test.pivot_table(index='userID', columns='gameID', values='rating') 


train_ptable = train_ptable.fillna(0)
test_ptable = test_ptable.fillna(0)

In [3]:
from sklearn.preprocessing import normalize

norm_train = normalize(train_ptable, copy=True)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(norm_train)

In [5]:
nbor_indx = sim.argsort()[:,::-1]

# HELPER FUNCTIONS
def get_users_that_rated(gameid):
    return train_ptable[gameid].where(train_ptable[gameid]>0).dropna().index.tolist()
    
def get_k_nearest_neighbors(userid, gameid, k):
    assert(k>1)
    pot_nbors = get_users_that_rated(gameid)
    userid_indx = train_ptable.index.get_loc(userid)
    
    sim_userIDs = []
    for x in nbor_indx[userid_indx]:
        if train_ptable.index[x] in pot_nbors:
            sim_userIDs.append(train_ptable.index[x])
        if len(sim_userIDs)>k:
            break
    return sim_userIDs[1:]

def round_to_point5(someFloat):
    score = np.around(someFloat, decimals=1)
    first_digit = score // 1
    last_digit = (score % 1) * 10
    carry = 0
    if 0 <= last_digit <= 2:
        return float(first_digit)
    elif 2 < last_digit <= 7:
        return float(first_digit + .5)
    elif last_digit > 7:
        return float(first_digit + 1)
        
    

In [6]:
# Takes userID, gameID, optional k
# Returns predicted rating for gameID
def predict_rating(uid, gameid, k=10):
    sim_uids = get_k_nearest_neighbors(uid, gameid, k)
    accm_score = 0.0
    accm_weight = 0.0
    for nbor_id in sim_uids:
        sim_indx_x = train_ptable.index.get_loc(uid)
        sim_indx_y = train_ptable.index.get_loc(nbor_id)
        accm_score += (train_ptable.loc[nbor_id][gameid] * sim[sim_indx_x][sim_indx_y])
        accm_weight += sim[sim_indx_x][sim_indx_y]
    score = accm_score / accm_weight
    score = round_to_point5(score)
    return score

predict_rating(5480, 97903)

# works with new split, now test

6.5

In [7]:
# is iteration slowing down? try apply or something

user_ids = []

for row in test_ptable.itertuples():
    user_ids.append(row[0])
    
game_ids = []
for x in test_ptable.columns:
    game_ids.append(x)

# tmpdict = {'user_ID':}    

pred_matrix = []
for user in user_ids:
    row = []
    for gid in game_ids:
        row.append(predict_rating(user,gid))
    pred_matrix.append(row)

In [32]:
# for each userID get top N from pred_matrix
# just make gid-rating pairs?

usr_game_matrix = []
for usr_row in pred_matrix:
    sorted_row_indexes = np.asarray(usr_row).argsort()[::-1]
    
    
    gids = []
    for indx in sorted_row_indexes:
        gids.append(train_ptable.columns[indx])
    usr_game_matrix.append(gids)
print(usr_game_matrix)

[[115746, 2651, 172818, 3076, 9609, 10630, 164153, 12333, 51811, 134342, 20551, 40834, 205637, 24439, 120677, 36218, 27833, 92539, 66690, 187645, 463, 102794, 126163, 555, 183394, 93, 12, 104162, 100901, 124742, 40849, 39856, 103885, 103343, 43111, 36932, 35677, 34499, 105134, 33160, 31594, 41114, 92828, 43570, 45315, 54625, 62219, 65781, 96848, 66589, 30933, 70323, 70919, 77423, 83330, 84876, 90419, 93260, 31481, 209010, 30549, 822, 8217, 7854, 6472, 5405, 4098, 2993, 2653, 2163, 760, 28720, 521, 215, 188, 150, 45, 42, 41, 5, 9209, 9216, 12942, 13004, 28143, 28023, 27708, 25613, 25554, 24480, 24181, 22545, 21790, 21241, 19237, 18602, 17133, 15364, 15062, 14105, 13291, 25417, 37111, 3, 157354, 178900, 127067, 131357, 169786, 132531, 163968, 161936, 160499, 157969, 138161, 182028, 139030, 155703, 155426, 153938, 143884, 144733, 150658, 148949, 146652, 181304, 167791, 122515, 205059, 194655, 124708, 121921, 201808, 124361, 119890, 24068, 155987, 22345, 46, 21882, 156009, 171, 21059, 2010

In [34]:
print(predict_rating(272, 115746))
print(predict_rating(272, 2651))
print(predict_rating(272, 172818))
print(predict_rating(272, 3076))
print(predict_rating(272, 9609))

8.0
8.0
8.0
8.0
8.0


In [8]:
# generate matrix of actual ratings from user_id/game_id pairs
truth_matrix = []
for user in user_ids:
    row = []
    for gid in game_ids:
        row.append(test_ptable.loc[user][gid])
    truth_matrix.append(row)

In [10]:
compare_tuples = []

for i, row in enumerate(truth_matrix):
    for j, rating in enumerate(row):
        if rating > .1:
            compare_tuples.append((truth_matrix[i][j], pred_matrix[i][j]))


print(compare_tuples)

[(4.0, 7.0), (6.0, 7.0), (1.0, 7.0), (3.0, 7.5), (1.0, 7.0), (1.0, 7.5), (1.0, 6.0), (7.0, 7.0), (1.0, 8.0), (1.0, 6.0), (1.0, 7.0), (1.0, 7.0), (1.0, 7.5), (1.0, 7.5), (1.0, 7.5), (1.0, 7.5), (1.0, 7.5), (1.0, 7.0), (1.0, 7.0), (1.0, 7.5), (1.0, 8.0), (1.0, 7.5), (1.0, 6.0), (1.0, 7.5), (1.0, 8.5), (1.0, 7.5), (1.0, 7.5), (1.0, 7.0), (2.0, 7.0), (1.0, 7.5), (1.0, 7.0), (1.0, 7.0), (1.0, 7.0), (1.0, 7.5), (1.0, 7.0), (1.0, 7.5), (1.0, 7.0), (8.0, 7.5), (10.0, 7.5), (9.0, 8.5), (7.0, 6.5), (7.0, 6.0), (9.0, 7.5), (5.0, 6.5), (6.0, 6.0), (7.0, 6.0), (6.0, 7.0), (6.0, 5.5), (8.0, 7.0), (6.5, 6.5), (8.0, 7.0), (8.0, 7.5), (5.0, 6.0), (7.0, 6.5), (8.0, 8.0), (10.0, 7.0), (7.0, 8.0), (7.0, 7.0), (10.0, 7.5), (8.0, 8.0), (9.0, 8.0), (8.0, 7.0), (8.0, 7.0), (7.0, 6.0), (8.0, 7.0), (8.0, 7.0), (7.0, 7.0), (8.0, 8.0), (8.0, 7.0), (8.0, 8.0), (8.0, 7.0), (9.0, 7.5), (7.0, 7.5), (7.0, 7.5), (7.0, 6.5), (8.0, 7.5), (7.0, 6.5), (10.0, 8.5), (9.0, 7.0), (10.0, 6.5), (8.0, 7.0), (6.0, 6.5), (8.0, 7.0)

In [11]:
# find difference in compare_tuples
from math import sqrt

square_mean_sum = 0

for pair in compare_tuples:
    square_mean_sum += ((pair[0] - pair[1])**2)

root_square_mean = sqrt(square_mean_sum)

print(root_square_mean)
# this is a measure of acc of all pred
# check this measure of acc by comparing to a matrix of random guesses, mean guess for all (7)

# need to just get top N pred for each user
# then see if those suggestions are among highest rated in test data

147.90911904442467
