In [6]:
# Python User-User Collaborative Filtering Recommender System
# consider not using pivot table?
# experiment with different similarity functions

# KNeighbors-based solution
# - For each unrated gameID that a userID has,
# - Take K-Nearest Neighbors, (Experiment with different values of K)
# - Add each Neighbor's RatingTimesSimilarityWeight, divide by K
# - Predict should return the gameIDs that have the highest scores from unrated gameIDs (top N versus some threshold)

# VERSUS BELOW (slight difference in step 3)

'''
1.) We have an n X m matrix consisting of the ratings of n users and m items. Each element of the matrix (i, j) 
represents how user i rated item j. Since we are working with movie ratings, each rating can be expected to be an 
integer from 1-5 (reflecting one-star ratings to five-star ratings) if user i has rated movie j, and 0 if the user 
has not rated that particular movie.

2.) For each user, we want to recommend a set of movies that they have not seen yet (the movie rating is 0). 
To do this, we will effectively use an approach that is similar to weighted K-Nearest Neighbors.

3.) For each movie j user i has not seen yet, we find the set of users U who are similar to user i and have seen 
movie j.
For each similar user u, we take u's rating of movie j and multiply it by the cosine similarity 
of user i and user u. Sum up these weighted ratings, divide by the number of users in U, and we get a
weighted average rating for the movie j.

4.) Finally, we sort the movies by their weighted average rankings. These average rankings serve as an estimate
for what the user will rate each movie. Movies with higher average rankings are more likely to be favored by the
user, so we will recommend the movies with the highest average rankings to the user.
'''

In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn.preprocessing import normalize

# Read CSV and change to pivot_table
df = pd.read_csv('inputs/boardgame-elite-users.csv')
df = df.pivot_table(index='userID', columns='gameID', values='rating')

# Fill nan with zero. Normalize ignores the zeroes
df = df.fillna(0)

In [2]:
# Test / Train Split
from sklearn.model_selection import train_test_split

train, test = train_test_split(df)

In [3]:
# Normalize User Ratings for GameIDs
normalized = normalize(train)

In [4]:
# Get User Similarity Matrix
# Also try Pearson coefficient, city-block, etc...
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(train)



In [28]:
# Get K nearest neighbors
# argsort returns indices of the values of a sorted version of the array
values = sim.argsort()
print(values)

# K = 10
neighbor_indices = values[:,-10:-1]
print(neighbor_indices)

for i in neighbor_indices[0]:
    print(sim[i])

[[ 15  38 133 ...  88  81   0]
 [ 15  44  87 ...   6 145   1]
 [ 15  79  62 ...   4 145   2]
 ...
 [ 15  36  62 ...  81 145 146]
 [ 15  93  44 ... 148 145 147]
 [ 15  44  93 ... 145 147 148]]
[[116 145  56 ... 113  88  81]
 [ 43   4 101 ...  28   6 145]
 [146 111   6 ... 101   4 145]
 ...
 [ 25  49  28 ...   4  81 145]
 [ 81   6   4 ...  28 148 145]
 [ 26  81  65 ... 101 145 147]]
[0.80440108 0.73326391 0.7514676  0.75083669 0.78557142 0.71503425
 0.80285102 0.73683489 0.76579813 0.68448903 0.74518113 0.7223275
 0.74671072 0.7121978  0.77402031 0.49111386 0.80308443 0.74540818
 0.78803303 0.78047858 0.69491587 0.73965451 0.69316955 0.81373856
 0.77352903 0.77487341 0.81116841 0.70801971 0.80962626 0.79627946
 0.77896857 0.74511386 0.7867252  0.73842551 0.72353026 0.74747914
 0.69329338 0.76055945 0.67109399 0.75675578 0.77151952 0.74919266
 0.71289883 0.7504243  0.72725927 0.75358458 0.78685315 0.74602113
 0.67847055 0.80265077 0.79188392 0.7511024  0.78180316 0.73034499
 0.71091363 0.

In [68]:
from sklearn.neighbors import KNeighborsClassifier
