In [None]:
# NOTES
# For testing: should omit ratings for userid/gameid pairs from entire df
# do not omit entire rows
# try test/train split with df before changing to pivot_table
# ideally, if userid entries have the same freq, omit 10-15% of uniq userid entries
# does it make sense to try and keep user-similarity matrix as close as possible to training on full df?
# ...aka... does omitting certain test data drastically change user-similarity matrix?


# Consider cross-validation for more accurate testing

# Consider wrapping in custom sklearn estimator / classifier, maybe inherit from base classes

# README
# Datafield has 'user_ids' and their 'rating' (1-10) of various 'game_ids'
# Fields = userID , gameID , rating
# Prediction of unrated games for each user_id is made using:
# k-nearest neighbors found using cosine-similarity
# predicted rating based on those near-neighbors' rating weighted by cosine-similarity users

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('inputs/boardgame-elite-users.csv')

train, test = train_test_split(df, test_size=.15)

In [9]:
#print(train.userID.unique)
print(train.shape)
print(test.shape)
x = train.groupby('userID')
print(len(x.groups))
y = test.groupby('userID')
print(len(y.groups))
# this will work

(50399, 3)
(8894, 3)
199
199


In [2]:
train_ptable = train.pivot_table(index='userID', columns='gameID', values='rating') 

test_ptable = test.pivot_table(index='userID', columns='gameID', values='rating') 


train_ptable = train_ptable.fillna(0)
test_ptable = test_ptable.fillna(0)

In [4]:
from sklearn.preprocessing import normalize

norm_train = normalize(train_ptable)

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(norm_train)

In [6]:
nbor_indx = sim.argsort()[:,::-1]

# HELPER FUNCTIONS
def get_users_that_rated(gameid):
    return train_ptable[gameid].where(train_ptable[gameid]>0).dropna().index.tolist()
    
def get_k_nearest_neighbors(userid, gameid, k):
    assert(k>1)
    pot_nbors = get_users_that_rated(gameid)
    userid_indx = train_ptable.index.get_loc(userid)
    
    sim_userIDs = []
    for x in nbor_indx[userid_indx]:
        if train_ptable.index[x] in pot_nbors:
            sim_userIDs.append(train_ptable.index[x])
        if len(sim_userIDs)>k:
            break
    return sim_userIDs[1:]

def round_to_point5(someFloat):
    score = np.around(someFloat, decimals=1)
    first_digit = score // 1
    last_digit = (score % 1) * 10
    carry = 0
    if 0 <= last_digit <= 2:
        return float(first_digit)
    elif 2 < last_digit <= 7:
        return float(first_digit + .5)
    elif last_digit > 7:
        return float(first_digit + 1)
        
    

In [7]:
# Takes userID, gameID, optional k
# Returns predicted rating for gameID
def predict_rating(uid, gameid, k=10):
    sim_uids = get_k_nearest_neighbors(uid, gameid, k)
    accm_score = 0.0
    accm_weight = 0.0
    for nbor_id in sim_uids:
        sim_indx_x = train_ptable.index.get_loc(uid)
        sim_indx_y = train_ptable.index.get_loc(nbor_id)
        accm_score += (train_ptable.loc[nbor_id][gameid] * sim[sim_indx_x][sim_indx_y])
        accm_weight += sim[sim_indx_x][sim_indx_y]
    score = accm_score / accm_weight
    score = round_to_point5(score)
    return score

predict_rating(5480, 97903)

# works with new split, now test

7.0

In [None]:
# -Generate matrix of all predictions
# -ensure shape of test data is the same
# use sklearn.metrics accuracy_score as below

>>> import numpy as np
>>> from sklearn.metrics import accuracy_score
>>> y_pred = [0, 2, 1, 3]
>>> y_true = [0, 1, 2, 3]
>>> accuracy_score(y_true, y_pred)
0.5
>>> accuracy_score(y_true, y_pred, normalize=False)
2

In [9]:
# is iteration slowing down? try apply or something

user_ids = []
for row in test_ptable.itertuples():
    user_ids.append(row[0])
game_ids = []
for x in test_ptable.columns:
    game_ids.append(x)

pred_matrix = []
for user in user_ids:
    row = []
    for gid in game_ids:
        row.append(predict_rating(user,gid))
    pred_matrix.append(row)

In [11]:
print(pred_matrix)
# np.save(predict_matrix_file, pred_matrix)

[[7.5, 7.5, 7.0, 7.5, 8.0, 8.0, 6.5, 7.0, 7.5, 8.0, 7.0, 7.5, 7.0, 6.5, 7.5, 7.0, 6.5, 6.5, 8.5, 6.0, 7.0, 7.0, 6.0, 6.5, 7.5, 6.5, 7.0, 8.5, 7.0, 6.5, 5.0, 8.0, 8.0, 4.5, 6.0, 8.0, 7.0, 7.0, 6.5, 6.5, 6.5, 7.0, 7.0, 6.5, 7.5, 7.5, 8.0, 8.0, 4.5, 6.0, 7.0, 6.5, 7.5, 6.5, 7.0, 5.0, 7.0, 7.0, 5.5, 6.5, 6.5, 6.5, 6.0, 5.5, 5.5, 7.0, 5.0, 6.5, 6.0, 6.0, 5.5, 4.5, 7.0, 6.0, 5.5, 5.5, 7.5, 6.5, 4.5, 6.0, 7.0, 5.5, 7.5, 7.0, 8.5, 7.0, 7.0, 6.5, 6.5, 3.0, 6.0, 6.0, 7.5, 8.0, 7.5, 5.5, 6.5, 8.0, 6.5, 7.5, 7.5, 7.5, 7.0, 7.5, 6.5, 7.0, 7.5, 7.0, 7.0, 7.0, 7.5, 8.0, 7.5, 7.0, 6.5, 8.0, 7.0, 6.5, 8.0, 6.5, 6.5, 7.5, 6.5, 7.0, 7.0, 7.0, 6.5, 6.5, 6.0, 7.5, 7.5, 7.5, 6.0, 7.5, 7.5, 6.5, 6.5, 7.0, 6.5, 7.5, 6.5, 8.0, 7.0, 7.0, 7.5, 7.0, 6.5, 7.5, 7.5, 7.5, 6.5, 7.0, 7.5, 7.5, 7.0, 7.0, 7.5, 6.5, 7.0, 7.0, 8.0, 7.0, 7.0, 7.5, 7.5, 7.5, 8.5, 6.5, 6.5, 6.5, 7.0, 6.0, 6.5, 6.0, 7.5, 7.5, 7.5, 7.0, 8.0, 7.0, 7.0, 7.5, 8.0, 7.0, 7.0, 8.5, 6.5, 7.5, 7.5, 7.0, 7.0, 7.0, 7.0, 8.0, 7.5, 7.5, 8.0, 7.5, 6.0, 8.0

NameError: name 'predict_matrix_file' is not defined