In [None]:
# Python User-User Collaborative Filtering Recommender System
# consider not using pivot table?
# experiment with different similarity functions

'''
For each movie j that user i has not seen yet, we find the set of users U who are similar to user i and have
seen movie j.
For each similar user u, we take u's rating of movie j and multiply it by the cosine similarity
of user i and user u. We sum these weighted ratings and divide by the number of users in U to get a
weighted average rating for movie j.
'''

In [83]:
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import normalize

# Load the raw ratings and reshape them into a userID x gameID matrix
df = (
    pd.read_csv('inputs/boardgame-elite-users.csv')
    .pivot_table(index='userID', columns='gameID', values='rating')
    # (user, game) pairs with no rating come out of the pivot as NaN; store them as 0.
    # Zero entries add nothing to a row's norm, so normalizing effectively ignores them.
    .fillna(0)
)
# print(len(df.columns)) --> 402
# print(len(df.index)) --> 199
print(df.head())

gameID  3       5       10      11      12      13      18      41      \
userID                                                                   
272        1.0     1.0     1.0     0.0     1.0     7.5     1.0     4.0   
388        8.0     7.0     8.0     8.0     8.0     7.0     8.0     9.0   
430        6.0     6.0     7.0     7.0     6.0     8.0     0.0     5.0   
2044       8.0     7.0     7.0     7.0     0.0     8.0     7.0     7.0   
3080       9.0     0.0     5.0     8.0     7.0     8.0     8.0     7.0   

gameID  42      45       ...    187645  192291  193738  194655  198773  \
userID                   ...                                             
272        1.0     6.0   ...       0.0     0.0     0.0     0.0     0.0   
388        9.0     8.0   ...       0.0     0.0     9.0     8.0     0.0   
430        6.0     9.0   ...       8.0     0.0     7.5     0.0     0.0   
2044       7.0     6.0   ...       7.0     8.0     8.0     0.0     9.0   
3080       8.0     9.0   ...       9.

In [None]:
# Test / Train Split
# from sklearn.model_selection import train_test_split

# train, test = train_test_split(df)

In [2]:
# Normalize User Ratings for GameIDs
# presumably scales each user's row with sklearn's default settings -- TODO confirm
# NOTE(review): `normalized` is not referenced by any later cell visible in this notebook;
# the similarity matrix is computed from `df` directly.
normalized = normalize(df)

In [3]:
# Get User Similarity Matrix
# Also try Pearson coefficient, city-block, etc...
from sklearn.metrics.pairwise import cosine_similarity

# One row of `df` per user, so sim[a][b] is the similarity between the users at
# positions a and b of df.index. Cosine similarity does not depend on vector length,
# so the raw ratings in `df` can be used here without normalizing first.
sim = cosine_similarity(df)
# print(sim.shape) --> (199, 199)

In [68]:
# !!! I don't want 'nearest 10 neighbors'...
# I want the nearest 10 neighbors WHO HAVE rated the gameID
# Get K nearest neighbors (indices from user similarity matrix)
# argsort returns indices of the values of a sorted version of the array

# return the 10 nearest neighbors who have rated the game
# should I check to make sure game has not yet been rated?

# For every user (row of sim): positions of all users, ordered from most to least similar.
# argsort sorts ascending along each row, so the columns are reversed to get descending order.
nbor_indx = sim.argsort(axis=1)[:, ::-1]

# Helper function called by "get_ten_nearest"
# Takes a gameID and returns list
def get_users_that_rated(gid, frame=None):
    """Return the userIDs (index labels) whose rating for game `gid` is greater than 0.

    gid   -- gameID, i.e. a column label of the ratings matrix
    frame -- ratings matrix to inspect; defaults to the notebook-level `df`
    """
    ratings = (df if frame is None else frame)[gid]
    return ratings[ratings > 0].index.tolist()


# Helper function called by "predict_rating"
# Takes df, userID, and gameID; returns list of the k (default 10) most similar userIDs
# THAT HAVE rated the game (descending order of similarity), never including userID itself
def get_ten_nearest(df, uid, gameid, k=10, neighbor_order=None):
    """Return up to `k` userIDs most similar to `uid` among the users who rated `gameid`.

    df             -- ratings matrix (index: userID, columns: gameID, 0 = not rated)
    uid            -- userID to find neighbours for
    gameid         -- gameID the neighbours must have rated
    k              -- maximum number of neighbours to return (default 10)
    neighbor_order -- 2-D array; row p lists positions into df.index ordered from most
                      to least similar to the user at position p. Defaults to the
                      notebook-level `nbor_indx`.

    The result is in descending order of similarity and may hold fewer than `k` ids
    when fewer users have rated the game.

    `uid` is excluded explicitly. The previous version dropped the first match on the
    assumption that it was `uid`, but a user who has not rated the game is never among
    the raters, so the closest real neighbour was discarded instead.
    """
    if neighbor_order is None:
        neighbor_order = nbor_indx

    # Set membership keeps the scan linear; pass `df` through so the helper looks at
    # the same frame this function was given.
    raters = set(get_users_that_rated(gameid, df))
    raters.discard(uid)

    uid_indx = df.index.get_loc(uid)
    sim_userIDs = []
    for x in neighbor_order[uid_indx]:
        if len(sim_userIDs) >= k:
            break
        candidate = df.index[x]
        if candidate in raters:
            sim_userIDs.append(candidate)
    return sim_userIDs
    
print(get_ten_nearest(df,66705,13))

[187094, 128117, 55092, 13340, 19677, 37860, 12305, 96817, 119312, 27853]


In [84]:
# Takes df, userID, gameID
# Returns the predicted rating of gameID for userID
def predict_rating(df, uid, gameid):
    """Predict userID `uid`'s rating of `gameid` from the ratings of similar users.

    df     -- ratings matrix (index: userID, columns: gameID)
    uid    -- userID to predict for
    gameid -- gameID to predict

    Each neighbour returned by get_ten_nearest contributes its rating of `gameid`
    multiplied by its similarity to `uid` (read from the notebook-level `sim` matrix).
    The sum is divided by the number of neighbours actually used, so the result is not
    deflated when fewer than ten users have rated the game.
    Returns 0.0 when no neighbour has rated the game.
    """
    sim_uids = get_ten_nearest(df, uid, gameid)
    if not sim_uids:
        # Nothing to average; dividing by len(sim_uids) would raise ZeroDivisionError.
        return 0.0

    # Position of `uid` in the similarity matrix; it does not change inside the loop.
    sim_indx_x = df.index.get_loc(uid)
    accm_score = 0.0
    for nbor_id in sim_uids:
        sim_indx_y = df.index.get_loc(nbor_id)
        accm_score += df.loc[nbor_id, gameid] * sim[sim_indx_x][sim_indx_y]
    return accm_score / len(sim_uids)

predict_rating(df, 3080, 187645)

# Next step: apply predict_rating to every (user, game) entry whose rating is zero...

6.157596239942643

In [44]:
# Spot-check that positions in nbor_indx / sim line up with the labels in df.index
print(df.index[1])              # userID stored at position 1
y = nbor_indx[0, 1]             # position ranked second in row 0 of nbor_indx
print(df.index[y])              # userID at that position
print(df.index.get_loc(7100))   # position of userID 7100
print(sim[0, 10])               # similarity between positions 0 and 10
print(sim[0, nbor_indx[0, 3]])  # similarity between position 0 and its fourth-ranked position

388
7100
10
0.5953403235711451
0.5803093270234448


In [None]:
# Prediction Matrix
# For each movie that a user has not rated,
# get the 10 nearest neighbors that have rated the movie (how to handle a user with no very similar users,
# i.e. every nearest neighbor is still more dissimilar than similar?),
# add up each rating times its similarity weight and divide by 10.
# Of all the new values generated for unrated movies, return the highest N.
