In [1]:
# NOTES
# organize output to return top recommendations for each user, top N
# test accuracy of predictions on top N, not that pred numbers are ncsrly same, but top N pred are among
# highest rated

# k nearest villains - recs based on most dissimilar users' lowest rated

# possible check for outliers (dist of ratings vastly different vectors, all one value)

# Consider cross-validation for more accurate testing
# sklearn cross-validation module is the equiv of just shuffling train/test split, retraining, rerun

# Consider wrapping in custom sklearn estimator / classifier, maybe inherit from base classes

# README
# Datafield has 'user_ids' and their 'rating' (1-10) of various 'game_ids'
# Fields = userID , gameID , rating
# Prediction of unrated games for each user_id is made using:
# k-nearest neighbors found using cosine-similarity
# predicted rating based on those near-neighbors' rating weighted by cosine-similarity users

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('inputs/boardgame-elite-users.csv')

train, test = train_test_split(df, test_size=.15)

In [2]:
train_ptable = train.pivot_table(index='userID', columns='gameID', values='rating') 

test_ptable = test.pivot_table(index='userID', columns='gameID', values='rating') 


train_ptable = train_ptable.fillna(0)
test_ptable = test_ptable.fillna(0)

In [3]:
from sklearn.preprocessing import normalize

norm_train = normalize(train_ptable, copy=True)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(norm_train)

In [5]:
nbor_indx = sim.argsort()[:,::-1]

# HELPER FUNCTIONS
def get_users_that_rated(gameid):
    return train_ptable[gameid].where(train_ptable[gameid]>0).dropna().index.tolist()
    
def get_k_nearest_neighbors(userid, gameid, k):
    assert(k>1)
    pot_nbors = get_users_that_rated(gameid)
    userid_indx = train_ptable.index.get_loc(userid)
    
    sim_userIDs = []
    for x in nbor_indx[userid_indx]:
        if train_ptable.index[x] in pot_nbors:
            sim_userIDs.append(train_ptable.index[x])
        if len(sim_userIDs)>k:
            break
    return sim_userIDs[1:]

def round_to_point5(someFloat):
    score = np.around(someFloat, decimals=1)
    first_digit = score // 1
    last_digit = (score % 1) * 10
    carry = 0
    if 0 <= last_digit <= 2:
        return float(first_digit)
    elif 2 < last_digit <= 7:
        return float(first_digit + .5)
    elif last_digit > 7:
        return float(first_digit + 1)
        
    

In [6]:
# Takes userID, gameID, optional k
# Returns predicted rating for gameID
def predict_rating(uid, gameid, k=10):
    sim_uids = get_k_nearest_neighbors(uid, gameid, k)
    accm_score = 0.0
    accm_weight = 0.0
    for nbor_id in sim_uids:
        sim_indx_x = train_ptable.index.get_loc(uid)
        sim_indx_y = train_ptable.index.get_loc(nbor_id)
        accm_score += (train_ptable.loc[nbor_id][gameid] * sim[sim_indx_x][sim_indx_y])
        accm_weight += sim[sim_indx_x][sim_indx_y]
    score = accm_score / accm_weight
    score = round_to_point5(score)
    return score

predict_rating(5480, 97903)

7.0

In [7]:
# is iteration slowing down? try apply or something

user_ids = []

for row in test_ptable.itertuples():
    user_ids.append(row[0])
    
game_ids = []
for x in test_ptable.columns:
    game_ids.append(x)

# tmpdict = {'user_ID':}...

pred_matrix = []
for user in user_ids:
    row = []
    for gid in game_ids:
        row.append(predict_rating(user,gid))
    pred_matrix.append(row)

In [10]:
# create user_game_matrix, which holds list of top 10 predicted gameIDs for each userID, desc order

user_game_matrix = []
for usr_row in pred_matrix:
    sorted_row_indexes = np.asarray(usr_row).argsort()[::-1]
    
    
    gids = []
    for indx in sorted_row_indexes[:10]:
        gids.append(train_ptable.columns[indx])
    user_game_matrix.append(gids)

In [11]:
# generate matrix of actual ratings from user_id/game_id pairs
truth_matrix = []
for user in user_ids:
    row = []
    for gid in game_ids:
        row.append(test_ptable.loc[user][gid])
    truth_matrix.append(row)

In [12]:
# get top 10 gameIDs for each userID from truth_matrix

truth_game_matrix = []
for usr_row in truth_matrix:
    sorted_row_indexes = np.asarray(usr_row).argsort()[::-1]
    
    gids = []
    for indx in sorted_row_indexes[:10]:
        gids.append(test_ptable.columns[indx])
    truth_game_matrix.append(gids)

In [43]:
# test by checking that top 10 gameIDs for each userID from truth_matrix have high rating predictions

top10_tuples = []
i = 0

for game_row in truth_game_matrix:
    usr_row = []
    for gid in game_row:
        usr_row.append((predict_rating(train_ptable.index[i], gid), test_ptable.iloc[i][gid]))
    i += 1
    top10_tuples.append(usr_row)

print(top10_tuples)

[[(4.0, 8.0), (7.5, 7.5), (7.0, 5.0), (7.5, 3.0), (7.0, 3.0), (7.0, 2.0), (7.0, 1.0), (6.0, 1.0), (6.5, 1.0), (7.0, 1.0)], [(7.0, 10.0), (7.5, 10.0), (7.5, 10.0), (7.0, 9.0), (7.5, 9.0), (7.0, 9.0), (7.5, 9.0), (6.5, 9.0), (8.0, 9.0), (7.5, 9.0)], [(8.0, 10.0), (7.5, 10.0), (8.5, 10.0), (7.5, 10.0), (7.5, 9.0), (7.5, 9.0), (7.0, 8.0), (7.0, 8.0), (7.5, 8.0), (7.5, 8.0)], [(7.0, 9.0), (8.5, 8.0), (8.0, 8.0), (7.5, 8.0), (7.5, 8.0), (8.0, 8.0), (6.5, 8.0), (7.5, 8.0), (6.0, 7.5), (7.0, 7.5)], [(7.0, 10.0), (7.5, 9.0), (6.0, 9.0), (7.0, 8.0), (7.0, 8.0), (7.0, 8.0), (7.5, 8.0), (6.5, 8.0), (7.0, 8.0), (7.5, 8.0)], [(7.0, 10.0), (7.0, 10.0), (7.5, 10.0), (8.0, 10.0), (7.5, 9.6), (7.0, 9.5), (8.0, 9.5), (6.5, 9.1), (7.0, 9.1), (7.0, 9.0)], [(8.0, 10.0), (7.0, 9.0), (9.0, 9.0), (8.0, 9.0), (7.5, 9.0), (8.0, 9.0), (7.0, 8.0), (6.0, 8.0), (7.5, 8.0), (5.5, 8.0)], [(7.5, 9.0), (7.5, 9.0), (7.5, 9.0), (8.0, 9.0), (7.5, 9.0), (6.5, 8.0), (7.0, 8.0), (7.5, 8.0), (7.5, 8.0), (6.5, 8.0)], [(8.0, 10.

In [45]:
# get root mean square error of top 10 predictioned versus truth
import math

mean_square_error = 0

for row in top10_tuples:
    for pair in row:
        mean_square_error += ((pair[0] - pair[1])**2)
print(math.sqrt(mean_square_error))

66.6588786284318


In [47]:
# get root mean square error of guessing mean rating for every pair

for row in train_ptable.mean():
    print(row)

5.575025125628141
5.097085427135678
3.789949748743719
5.37427135678392
6.373869346733668
5.881105527638191
5.000301507537688
4.6229648241206025
5.859949748743717
4.819497487437186
4.949849246231156
4.2425125628140705
5.7182914572864325
4.272613065326633
5.667537688442212
3.166130653266332
3.472663316582915
4.735376884422111
6.347035175879396
2.1316582914572866
2.253618090452261
4.189949748743718
3.522412060301508
4.380804020100502
5.254120603015076
3.412663316582914
4.726783919597991
4.23391959798995
5.4806097989949745
3.440402010050251
2.896482412060302
3.9342211055276386
5.291005025125627
2.17
3.7778391959799
4.1065326633165835
3.1404020100502517
4.445477386934674
3.0537688442211053
5.144924623115578
3.9096482412060305
5.318140703517588
4.680100502512563
5.451859296482411
2.7833165829145727
5.251507537688443
5.754572864321608
5.9888944723618085
1.493467336683417
2.524723618090452
5.049899497487437
2.569748743718593
5.230954773869347
4.128643216080402
3.7050251256281412
1.632160804020

In [32]:
compare_tuples = []

for i, row in enumerate(truth_matrix):
    for j, rating in enumerate(row):
        if rating > .1:
            compare_tuples.append((truth_matrix[i][j], pred_matrix[i][j]))


print(compare_tuples)

[(1.0, 7.5), (7.5, 7.5), (1.0, 7.0), (1.0, 7.0), (8.0, 4.0), (1.0, 6.0), (1.0, 6.5), (1.0, 7.0), (1.0, 6.0), (1.0, 6.5), (1.0, 7.0), (1.0, 8.0), (1.0, 6.5), (1.0, 7.5), (1.0, 8.0), (1.0, 6.5), (1.0, 8.0), (1.0, 7.0), (1.0, 7.0), (1.0, 7.5), (1.0, 7.0), (1.0, 7.0), (1.0, 8.0), (1.0, 6.5), (1.0, 7.0), (1.0, 7.0), (3.0, 7.0), (1.0, 6.5), (2.0, 7.0), (3.0, 7.5), (1.0, 7.0), (1.0, 7.0), (1.0, 7.5), (5.0, 7.0), (1.0, 6.5), (1.0, 7.5), (7.0, 7.5), (8.0, 7.5), (9.0, 8.0), (6.0, 6.0), (9.0, 7.5), (6.0, 5.5), (6.0, 7.0), (8.0, 7.0), (6.0, 6.0), (7.0, 6.5), (9.0, 7.5), (6.0, 6.5), (7.0, 7.0), (6.0, 5.5), (6.0, 4.5), (8.0, 7.0), (10.0, 7.5), (8.0, 7.5), (9.0, 8.0), (8.0, 6.5), (10.0, 7.5), (8.0, 6.5), (8.0, 7.0), (8.0, 7.0), (7.0, 7.5), (7.0, 6.5), (8.0, 7.0), (8.0, 7.0), (8.0, 7.5), (7.0, 8.0), (7.0, 6.5), (9.0, 7.5), (9.0, 6.5), (9.0, 7.0), (8.0, 7.5), (9.0, 7.0), (9.0, 7.0), (9.0, 7.5), (9.0, 7.5), (7.0, 7.0), (6.0, 7.0), (8.0, 7.5), (10.0, 7.0), (6.0, 7.0), (8.0, 6.5), (8.0, 7.0), (7.0, 7.5), 

In [33]:
# find difference in compare_tuples
from math import sqrt

square_mean_sum = 0

for pair in compare_tuples:
    square_mean_sum += ((pair[0] - pair[1])**2)

root_square_mean = sqrt(square_mean_sum)

print(root_square_mean)
# this is a measure of acc of all pred
# check this measure of acc by comparing to a matrix of random guesses, mean guess for all (7)

# need to just get top N pred for each user
# then see if those suggestions are among highest rated in test data

147.9022673792535
