In [1]:
# NOTES

# compare the ordered gameid vecs by iterating over test vec and subtracting from a val for the 
# distance, by index, when compared to pred vec

# 'seven' (mean/mode) guess against test data is more accurate than predictions, according
# to rmse, but is useless for actual rec system (same rating for all userid-gameid pairs)...

# instead, design test for confirming recognition of highest rated pairs among test data...
# test this by ordering all userID-gameID predictions (desc) and comparing to order from test data
# how to compare similarity of lists of unique gameIDs? ...
# also the test vector of gameIDs will not have ncsrly been rated although they do have a pred rating...

# k nearest villains - recs based on most dissimilar users' lowest rated?...

# possible check for outliers (dist of ratings vastly different vectors, all one value)

# Consider cross-validation for more accurate testing
# sklearn cross-validation module is the equiv of just shuffling train/test split, retraining, rerun

# Consider wrapping in custom sklearn estimator / classifier, maybe inherit from base classes

# README
# Datafield has 'user_ids' and their 'rating' (1-10) of various 'game_ids'
# Fields = userID , gameID , rating
# Prediction of unrated games for each user_id is made using:
# k-nearest neighbors found using cosine-similarity
# predicted rating based on those near-neighbors' rating weighted by cosine-similarity users

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('inputs/boardgame-elite-users.csv')

train, test = train_test_split(df, test_size=.15)

In [3]:
train_ptable = train.pivot_table(index='userID', columns='gameID', values='rating') 

test_ptable = test.pivot_table(index='userID', columns='gameID', values='rating') 

# get mean before fillna
# this is mean of rating for gameids using train data
# test by applying to every userid-gameid pair in test corresponding to gameid
i = 0
gameID_mean_tuples = []
for row in train_ptable.mean():
    gameID_mean_tuples.append((train_ptable.columns[i], row))
    i += 1

train_ptable = train_ptable.fillna(0)
test_ptable = test_ptable.fillna(0)

In [4]:
from sklearn.preprocessing import normalize

norm_train = normalize(train_ptable, copy=True)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(norm_train)

In [6]:
nbor_indx = sim.argsort()[:,::-1]

# HELPER FUNCTIONS
def get_users_that_rated(gameid):
    return train_ptable[gameid].where(train_ptable[gameid]>0).dropna().index.tolist()
    
def get_k_nearest_neighbors(userid, gameid, k):
    assert(k>1)
    pot_nbors = get_users_that_rated(gameid)
    userid_indx = train_ptable.index.get_loc(userid)
    
    sim_userIDs = []
    for x in nbor_indx[userid_indx]:
        if train_ptable.index[x] in pot_nbors:
            sim_userIDs.append(train_ptable.index[x])
        if len(sim_userIDs)>k:
            break
    return sim_userIDs[1:]

def round_to_point5(someFloat):
    score = np.around(someFloat, decimals=1)
    first_digit = score // 1
    last_digit = (score % 1) * 10
    carry = 0
    if 0 <= last_digit <= 2:
        return float(first_digit)
    elif 2 < last_digit <= 7:
        return float(first_digit + .5)
    elif last_digit > 7:
        return float(first_digit + 1)
        
    

In [7]:
# Takes userID, gameID, optional k
# Returns predicted rating for gameID
def predict_rating(uid, gameid, k=10):
    sim_uids = get_k_nearest_neighbors(uid, gameid, k)
    accm_score = 0.0
    accm_weight = 0.0
    for nbor_id in sim_uids:
        sim_indx_x = train_ptable.index.get_loc(uid)
        sim_indx_y = train_ptable.index.get_loc(nbor_id)
        accm_score += (train_ptable.loc[nbor_id][gameid] * sim[sim_indx_x][sim_indx_y])
        accm_weight += sim[sim_indx_x][sim_indx_y]
    score = accm_score / accm_weight
    score = round_to_point5(score)
    return score

predict_rating(5480, 97903)

6.5

In [8]:
# is iteration slowing down? try apply or something

user_ids = []

for row in test_ptable.itertuples():
    user_ids.append(row[0])
    
game_ids = []
for x in test_ptable.columns:
    game_ids.append(x)

# tmpdict = {'user_ID':}...

pred_matrix = []
for user in user_ids:
    row = []
    for gid in game_ids:
        row.append(predict_rating(user,gid))
    pred_matrix.append(row)

In [24]:
# create user_game_matrix, which holds list predicted gameIDs for each userID, desc order

user_game_matrix = []
for usr_row in pred_matrix:
    sorted_row_indexes = np.asarray(usr_row).argsort()[::-1]
    
    
    gids = []
    for indx in sorted_row_indexes:
        gids.append(train_ptable.columns[indx])
    user_game_matrix.append(gids)

In [10]:
# generate matrix of actual ratings from user_id/game_id pairs
truth_matrix = []
for user in user_ids:
    row = []
    for gid in game_ids:
        row.append(test_ptable.loc[user][gid])
    truth_matrix.append(row)

In [25]:
# get gameIDs for each userID from truth_matrix

truth_game_matrix = []
for usr_row in truth_matrix:
    sorted_row_indexes = np.asarray(usr_row).argsort()[::-1]
    
    gids = []
    for indx in sorted_row_indexes:
        gids.append(test_ptable.columns[indx])
    truth_game_matrix.append(gids)

In [30]:
for gid in user_game_matrix[0]:
    
print(truth_game_matrix[0])

[182028, 66690, 36218, 822, 3076, 9209, 24439, 28720, 30549, 34119, 34499, 37111, 521, 37046, 205637, 40692, 40849, 68448, 96848, 124742, 104162, 555, 115746, 172818, 12, 178900, 188, 13, 173346, 205059, 102794, 107529, 65781, 55660, 54625, 54043, 51811, 43015, 45, 42, 40834, 50, 37904, 54, 110327, 93, 66188, 41, 102680, 69789, 70323, 36932, 77423, 92539, 92828, 93260, 105551, 97207, 97786, 97842, 98778, 100901, 102652, 72125, 35677, 118, 6249, 12333, 10630, 9609, 9216, 320, 8217, 7854, 5405, 171, 5404, 463, 2993, 2653, 2651, 2397, 2163, 14105, 15062, 17133, 19857, 31627, 31481, 31260, 30869, 172, 28143, 28023, 27833, 120677, 25554, 25292, 215, 22545, 21790, 21348, 25613, 209010, 126042, 160499, 176396, 126163, 127023, 176189, 155068, 175914, 155426, 163968, 177736, 155703, 164153, 164928, 171623, 131357, 167791, 169786, 163602, 129622, 163412, 136888, 147020, 201808, 136063, 198773, 148949, 193738, 192291, 183394, 150376, 150658, 181304, 154203, 15364, 157354, 17226, 18602, 20100, 205

In [12]:
# test by checking that top 10 gameIDs for each userID from truth_matrix have high rating predictions

top10_tuples = []
i = 0

for game_row in truth_game_matrix:
    usr_row = []
    for gid in game_row:
        usr_row.append((predict_rating(train_ptable.index[i], gid), test_ptable.iloc[i][gid]))
    i += 1
    top10_tuples.append(usr_row)

In [20]:
# get root mean square error of top 10 predictioned versus truth

import math

sum_sqr_error = 0
i = 0

for row in top10_tuples:
    for pair in row:
        sum_sqr_error += ((pair[0] - pair[1])**2)
        i += 1
print('rmse = ', math.sqrt(sum_error/i))

rmse =  1.1172206451051852


In [14]:
compare_tuples = []

for i, row in enumerate(truth_matrix):
    for j, rating in enumerate(row):
        if rating > .1:
            compare_tuples.append((truth_matrix[i][j], pred_matrix[i][j]))

In [23]:
# find rmse in all compare_tuples

mean_sqr_sum = 0
i = 0

for pair in compare_tuples:
    mean_sqr_sum += ((pair[0] - pair[1])**2)
    i += 1

print('rmse = ', math.sqrt(mean_sqr_sum/i))
# this is a measure of acc of all pred
# check this measure of acc by comparing to a matrix of random guesses, mean guess for all (7)

# need to just get top N pred for each user
# then see if those suggestions are among highest rated in test data

rmse =  1.5871907191521808


In [21]:
# rmse of all sevens
# seven is both the mode and median (i think, test/show this)

mean_sqr_sum = 0
i = 0

for row in top10_tuples:
    for pair in row:
        mean_sqr_sum += ((pair[0] - 7)**2)
        i += 1

print('rmse = ', math.sqrt(mean_sum/i))

rmse =  0.7463478919909917
