# Implementing Recommender Systems for BoardGameGeek.com

In [3]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline

# Let displayed DataFrames show up to 1000 rows before pandas truncates them.
pd.set_option('display.max_rows', 1000)

# The first header cell of each CSV is a long descriptive string rather than a usable
# column name, so it is renamed to the key used throughout the rest of the notebook.
elite = pd.read_csv('../inputs/boardgame-elite-users.csv').rename(
    columns={'Compiled from boardgamegeek.com by Matt Borthwick': 'UserID'})
titles = pd.read_csv('../inputs/boardgame-titles.csv').rename(
    columns={'boardgamegeek.com game ID': 'gameID'})

# Alternative ratings files, loaded and renamed the same way as `elite`; left disabled.
#frequent = pd.read_csv('../inputs/boardgame-frequent-users.csv').rename(
#    columns={'Compiled from boardgamegeek.com by Matt Borthwick': 'UserID'})
# The big dataset:
#users = pd.read_csv('../inputs/boardgame-users.csv').rename(
#    columns={'Compiled from boardgamegeek.com by Matt Borthwick': 'UserID'})


## Baseline Predictor
This is the simplest predictor in the project. It also doubles as a way to normalize the ratings matrix R for more complex algorithms (like SVD) that need some way to fill in the missing ratings of the sparse matrix: subtracting the baseline prediction from each known value in R centers the ratings, so that missing values can reasonably be set to 0.

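The prediction combines the overall average rating, each user's average rating, and each item's average rating into a believable first guess: $b_{u,i} = \mu + b_u + b_i$, where $\mu$ is the overall mean rating, $b_u$ is the user's offset from it, and $b_i$ is the item's offset. Details are in section 2.1 of the paper linked in the exploratory notebook.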

In [4]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split

class Base_Predictor(BaseEstimator,RegressorMixin):
    """Baseline rating predictor: global mean + user bias + item bias.

    A rating for (user u, item i) is predicted as ``mean + bu[u] + bi[i]`` where
    ``mean`` is the average of all training ratings, ``bu`` is each user's offset
    from that mean and ``bi`` is each item's remaining offset once the biases of
    the users who rated it are accounted for.

    Parameters
    ----------
    DAMPENING_TERM : number, default 25
        The ``d`` added to the rating count when dampening is enabled; larger
        values shrink the biases of sparsely rated users/items harder towards 0.
    dampening : bool, default False
        Whether to apply the dampening factor to the user and item biases.

    Attributes set by ``fit``
    -------------------------
    _mean : global mean of the training ratings
    _R    : ratings matrix (UserID rows x gameID columns, NaN where unrated)
    _bu   : Series of user biases indexed by UserID
    _bi   : Series of item biases indexed by gameID
    """

    def __init__(self, DAMPENING_TERM=25, dampening=False):
        # scikit-learn's get_params()/clone()/repr() read every constructor argument
        # back as an attribute of the same name, so store them unmodified here.
        self.DAMPENING_TERM = DAMPENING_TERM
        self.dampening = dampening

    def fit(self, X, y):
        """Learn the global mean and the user/item biases.

        X must have 'UserID' and 'gameID' columns; y must be a Series named
        'rating' sharing X's index (the pivot below selects the ratings by that
        name). Returns self, following the scikit-learn convention.
        """
        self._mean = y.mean()
        ratings_long = pd.concat([X, y], axis=1)
        self._R = ratings_long.pivot_table(index='UserID', columns='gameID', values='rating')
        # The item biases depend on the user biases, so the order matters.
        self._bu = self._R.apply(self._user_base, axis=1)
        self._bi = self._R.apply(self._item_base)
        return self

    def _damp_factor(self, n_ratings):
        """Scalar k = M / (M + d) that turns a plain average over M ratings into a damped one."""
        return n_ratings / (n_ratings + self.DAMPENING_TERM)

    def _user_base(self, row):
        """Bias of one user: their mean rating minus the global mean.

        With dampening the bias is sum / (M + d) instead of sum / M, where M is
        the number of items the user rated and d is DAMPENING_TERM. Because
        sum / (M + d) == (M / (M + d)) * (sum / M), the undamped bias is simply
        scaled by k = M / (M + d).
        """
        bu = row.mean() - self._mean
        if self.dampening:
            bu *= self._damp_factor(row.count())
        return bu

    def _item_base(self, column):
        """Bias of one item: its mean rating, minus the mean bias of the users
        who rated it, minus the global mean (optionally dampened like the user bias)."""
        item_ratings = column[column.notnull()]
        mean_bu_of_raters = self._bu.loc[item_ratings.index].mean()
        bi = item_ratings.mean()-mean_bu_of_raters-self._mean
        if self.dampening:
            bi *= self._damp_factor(item_ratings.size)
        return bi

    def predict(self, X):
        """Return a numpy array of predicted ratings, one per row of X.

        X must have 'UserID' and 'gameID' columns. A user or game that was not
        present in the training data contributes a bias of 0, so the prediction
        falls back towards the global mean instead of failing on the lookup.
        """
        user_bias = self._bu.reindex(X['UserID']).fillna(0.0).to_numpy()
        item_bias = self._bi.reindex(X['gameID']).fillna(0.0).to_numpy()
        return self._mean + user_bias + item_bias
    

### A little Unit Testing for Sanity's sake.
I made a simple 3x3 test data set with one missing value, ran through the calculations by hand, and set up a small battery of assertions to make sure it all works. The cell displays three pieces of information so you can check them visually: first TDD_test_X, the (user, item) pair I predict; in the middle the ratings matrix itself, showing the missing value I'm predicting; and at the bottom TDD_train_X, that same matrix collapsed into the long (UserID, gameID, rating) format of the real dataset. As long as this cell runs without triggering an assertion error, things are working fine.

In [5]:
# Fixed seed: the hand-calculated expectations in the test below depend on these exact values.
np.random.seed(42)

test_ratings_matrix = pd.DataFrame(
    np.random.randint(1, 10, size=(3, 3)),
    index=[1, 2, 3],     # UserIDs
    columns=[4, 5, 6],   # gameIDs
)
# Game 6 is about to hold a missing rating, so make that column float first rather than
# relying on pandas silently upcasting an integer column when NaN is written into it.
test_ratings_matrix[6] = test_ratings_matrix[6].astype(float)
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
test_ratings_matrix.loc[3, 6] = np.nan

# Collapse the test frame down to the same format as our dataset: 3 columns
# (UserID, gameID, rating). The explicit dropna() guarantees the missing rating never
# becomes a training row, whatever stack() does with existing NaN values.
TDD_train_X = test_ratings_matrix.stack().dropna().reset_index()
TDD_train_X.columns = ['UserID','gameID','rating']
# The single (user, game) pair whose rating is missing and will be predicted.
TDD_test_X = pd.DataFrame(data={'UserID':[3],'gameID':[6]})
display(TDD_test_X)
display(test_ratings_matrix)
display(TDD_train_X)

def test_baseline_predictor():
    """Check Base_Predictor against values worked out by hand for the 3x3 fixture.

    Floats are compared with a tolerance (np.testing.assert_allclose raises
    AssertionError on mismatch) instead of exact ==, so a harmless change in
    floating-point summation order does not fail the test.
    """
    predictor = Base_Predictor()
    predictor.fit(TDD_train_X[['UserID','gameID']],TDD_train_X.rating)
    np.testing.assert_allclose(
        predictor._mean, 6.125,
        err_msg="The incorrect mean was calculated for the baseline test set")
    np.testing.assert_allclose(
        predictor._bu.tolist(), [0.20833333333333304,-1.125,1.375],
        err_msg="The wrong bu values were calculated for the baseline test")
    np.testing.assert_allclose(
        predictor._bi.tolist(), [0.05555555555555536, 0.05555555555555536, -0.16666666666666607],
        err_msg="incorrect bi was calculated for baseline test")
    np.testing.assert_allclose(
        predictor.predict(TDD_test_X).tolist(), [7.333333333333334],
        err_msg="baseline prediction for the test value was incorrect")


test_baseline_predictor()

Unnamed: 0,UserID,gameID
0,3,6


Unnamed: 0,4,5,6
1,7,4,8.0
2,5,7,3.0
3,7,8,


Unnamed: 0,UserID,gameID,rating
0,1,4,7.0
1,1,5,4.0
2,1,6,8.0
3,2,4,5.0
4,2,5,7.0
5,2,6,3.0
6,3,4,7.0
7,3,5,8.0


## User-User Collaborative Filtering

This is the system described in section 2.2 of the linked paper. The idea is that to predict the rating of user U for item I, you use the normalized average rating of the N users most similar to U who have reviewed item I. There are multiple similarity measures that can be used, along with several other hyperparameters; all of them can be passed to the class constructor for testing and comparison.

In [111]:
from sklearn.base import BaseEstimator, RegressorMixin

class U_U_predictor(BaseEstimator, RegressorMixin):
    """User-user collaborative filtering (work in progress: ``predict`` is not implemented yet).

    Parameters
    ----------
    similarity_type : str, default 'pearson'
        'cosine' selects cosine similarity (not implemented yet); any other value
        selects the Pearson correlation.
    normalize_to_z_scores : bool, default False
        Switches between equation 2.6 and 2.7 in the paper. Stored; not used yet.
    pearson_threshold : number, default 50
        The paper suggests a dampening threshold to keep users with sparse reviews
        from being rated as overly similar: a Pearson similarity is scaled by
        min(items in common / pearson_threshold, 1).
    N_similar : int, default 20
        How many nearest neighbours to look at when computing rating predictions.
        Stored; not used yet.
    max_pairs : int or None, default 1000
        How many user pairs (in row-major upper-triangle order) get a similarity
        computed during fit; the remaining pairs keep a similarity of 0.
        None computes every pair.

    Attributes set by ``fit``
    -------------------------
    _R : ratings matrix from the actual training values (UserID x gameID, NaN where unrated)
    _user_average_rating : mean rating per user, used for pre-processing
    _S : symmetric user similarity matrix (size user x user) with a zero diagonal
    """

    def __init__(self, similarity_type = 'pearson', normalize_to_z_scores=False, pearson_threshold=50,N_similar=20,
                 max_pairs=1000):
        # scikit-learn's get_params()/clone()/repr() read every constructor argument
        # back as an attribute of the same name, so store them unmodified here.
        self.similarity_type = similarity_type
        self.normalize_to_z_scores = normalize_to_z_scores
        self.pearson_threshold = pearson_threshold
        self.N_similar = N_similar
        self.max_pairs = max_pairs

    def _calculate_user_similarity(self, row):
        """Dispatch to the similarity metric selected by ``similarity_type``.

        Resolved on every call (rather than once in __init__) so that a later
        set_params(similarity_type=...) takes effect.
        """
        if self.similarity_type == 'cosine':
            return self._cosine_similarity(row)
        return self._pearson_calculate_user_similarity(row)

    def fit(self,X,y):
        """Build the ratings matrix, the per-user averages and the similarity matrix.

        X must have 'UserID' and 'gameID' columns; y must be a Series named
        'rating' sharing X's index. Returns self, following the scikit-learn
        convention.
        """
        self._R = pd.concat([X,y],axis=1).pivot_table(index='UserID',columns='gameID',values='rating')
        self._user_average_rating = self._R.mean(axis=1)

        self._calculate_user_similarity_matrix_s()
        return self

    def _calculate_user_similarity_matrix_s(self):
        """Fill ``self._S`` with the pairwise user similarities.

        All of the similarity measures implemented here are symmetric
        (s(u,u') == s(u',u)), so each pair is calculated only once: for a UxU
        matrix S that means only the upper triangle needs computing. The main
        diagonal is skipped too, since s(u,u) is of no interest, and is left
        at 0. The upper triangle is then mirrored down to the lower triangle
        to get the full matrix S.
        """
        user_ids = self._R.index
        n_users = len(user_ids)
        # Positions (i, j) with i < j: the strict upper triangle, in row-major order.
        upper_rows, upper_cols = np.triu_indices(n_users, k=1)
        user_combinations = pd.DataFrame({
            'UserID': user_ids[upper_rows],
            'User_Prime_ID': user_ids[upper_cols],
        })

        n_pairs = len(user_combinations)
        n_to_score = n_pairs if self.max_pairs is None else min(self.max_pairs, n_pairs)

        # Build the whole column up front and assign it once; writing through a
        # chained selection such as frame.iloc[:k].rating = ... may only modify a copy.
        similarities = np.zeros(n_pairs)
        similarities[:n_to_score] = [
            self._calculate_user_similarity(pair)
            for pair in user_combinations.iloc[:n_to_score].itertuples(index=False)
        ]
        user_combinations['rating'] = similarities

        # Write each value at (u, u') and at (u', u) so that S is symmetric.
        similarity_matrix = np.zeros((n_users, n_users))
        similarity_matrix[upper_rows, upper_cols] = similarities
        similarity_matrix[upper_cols, upper_rows] = similarities
        self._S = pd.DataFrame(similarity_matrix, index=user_ids,
                               columns=user_ids.rename('User_Prime_ID'))

        # Sanity-check output: show only the first few pairs rather than the whole frame.
        display(user_combinations.head(10))

    def _pearson_calculate_user_similarity(self, row):
        """Dampened Pearson-style similarity between ``row.UserID`` and ``row.User_Prime_ID``.

        Each user's ratings of the items both users rated are centred on that
        user's overall average rating; the result is the cosine of the two
        centred vectors, scaled by min(items in common / pearson_threshold, 1).
        Returns 0.0 when the similarity is undefined (no items in common, or
        one user's centred ratings are all zero) instead of dividing 0 by 0.
        """
        items_in_common = self._find_items_two_users_both_reviewed(row.UserID, row.User_Prime_ID)
        user1_average = self._user_average_rating.loc[row.UserID]
        user2_average = self._user_average_rating.loc[row.User_Prime_ID]

        user1_normalized_ratings = (self._R.loc[row.UserID, items_in_common] - user1_average).to_numpy()
        user2_normalized_ratings = (self._R.loc[row.User_Prime_ID, items_in_common] - user2_average).to_numpy()

        norm_product = np.linalg.norm(user1_normalized_ratings) * np.linalg.norm(user2_normalized_ratings)
        if norm_product == 0:
            return 0.0
        corr = np.dot(user1_normalized_ratings, user2_normalized_ratings)/norm_product

        dampening_factor = min((items_in_common.size/self.pearson_threshold), 1.0)

        return corr*dampening_factor

    def _cosine_similarity(self, row=None):
        """Cosine similarity between two users. TODO: not implemented yet.

        Takes the same ``row`` argument as the Pearson metric so that it can be
        called through ``_calculate_user_similarity``.
        """
        raise NotImplementedError("cosine similarity is not implemented yet; use similarity_type='pearson'")

    def _find_N_similar_users(self, user1,user2):
        """TODO: not implemented yet; currently returns None."""
        pass

    def _find_items_two_users_both_reviewed(self, user1,user2):
        """Return the index of gameIDs that both ``user1`` and ``user2`` have rated."""
        items_user1_reviewed = self._R.loc[user1].dropna().index
        items_user2_reviewed = self._R.loc[user2].dropna().index
        return items_user1_reviewed.intersection(items_user2_reviewed)

    def predict(self,R):
        """TODO: not implemented yet; currently returns None."""
        pass

    

# Two user IDs kept as handy names; they are not referenced by the code in this cell.
u = 272
u_prime = 388


# Hold out 30% of the elite users' ratings; the fixed random_state keeps the split reproducible.
train_X, test_x, train_y, test_y = train_test_split(
    elite[['UserID', 'gameID']],
    elite['rating'],
    test_size=0.3,
    random_state=42,
)

predictor = U_U_predictor()
predictor.fit(train_X, train_y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0,UserID,User_Prime_ID,rating
0,272,388,-0.056713
1,272,430,0.042577
2,272,2044,-0.083717
3,272,3080,0.006745
4,272,3256,-0.150557
5,272,3557,-0.013788
6,272,5038,-0.178553
7,272,5217,-0.064375
8,272,5480,-0.126510
9,272,7100,0.135336


## Model Selection and Error checking

Now that I've gotten some models built out, I can use Sklearn's framework to check out different prediction systems, compare RMSE, and see what kind of model works the best with this dataset.

In [None]:
from sklearn.metrics import mean_squared_error

# Split the elite ratings 70/30 with a fixed random_state so the split is reproducible.
train_X, test_x, train_y, test_y = train_test_split(
    elite[['UserID', 'gameID']],
    elite['rating'],
    test_size=0.3,
    random_state=42,
)

predictor = Base_Predictor()
predictor.fit(train_X, train_y)

# NOTE(review): the predictions below are made for the training rows, so the number printed
# is the training RMSE; test_x / test_y are not used in this cell.
predictions = predictor.predict(train_X)

# scikit-learn's documented argument order is (y_true, y_pred).
mse = mean_squared_error(train_y, predictions)
print(np.sqrt(mse))