# Implementing Recommender Systems for BoardGamesGeek.com

In [114]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline
#line-by-line runtime comparison for easier code optimization.
%load_ext line_profiler

pd.set_option('display.max_rows',1000)

elite = pd.read_csv('../inputs/boardgame-elite-users.csv')
elite = elite.rename(columns = {'Compiled from boardgamegeek.com by Matt Borthwick':'UserID'})
titles = pd.read_csv('../inputs/boardgame-titles.csv')
titles = titles.rename(columns={'boardgamegeek.com game ID':'gameID'})
frequent = pd.read_csv('../inputs/boardgame-frequent-users.csv')
frequent = frequent.rename(columns = {'Compiled from boardgamegeek.com by Matt Borthwick':'UserID'})
#load up the big dataset
#users = pd.read_csv('../inputs/boardgame-users.csv')
#users = users.rename(columns = {'Compiled from boardgamegeek.com by Matt Borthwick':'UserID'})


The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


## Baseline Predictor
This is the simplest predictor I'm making for the project. It doubles as a way to normalize the Ratings matrix R for more complex algorithms (like SVD) that require some kind of a way to fill missing ratings in the sparse matrix. Subtracting the baseline prediction from each value in R normalizes so that missing values can be set to 0.

All this prediction does is use the user rating averages, total average rating, and average item ratings to come up with a believable first guess. Details are in section 2.1 of the paper linked in the exploratory notebook. 

In [2]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split

class Base_Predictor(BaseEstimator,RegressorMixin):
    def __init__(self, DAMPENING_TERM=25, dampening=False):
        self._dampening_term = 25
        self._dampening = dampening
        
    def fit(self, X, y):
        self._mean = y.mean()
        self._R = pd.concat([X,y],axis=1).pivot_table(index='UserID',columns='gameID',values='rating')
        self._bu = self._R.apply(lambda row: self._user_base(row), axis=1)
        self._bi = self._R.apply(lambda column: self._item_base(column))
        
    def _user_base(self, row):
        """(1/M+d)*bu is with the dampening factor. Without it's 1/M * bu. To find the way to add the 
        dampening factor as a scalar multiplication: 
            k*1/M(bu) = 1/M+d(bu)
            k = M/M+d"""
        bu = row.mean() - self._mean
        if self._dampening:
            num_items_user_reviewed = row[row.notnull()].size
            damp_factor = num_items_user_reviewed/(num_items_user_reviewed+self._dampening_term)
            bu*=damp_factor
        return bu
    
    def _item_base(self, column):
        users_that_reviewed_this_item = column[column.notnull()]
        bu_for_users_that_reviewed_i = self._bu[users_that_reviewed_this_item.index].mean()
        bi = users_that_reviewed_this_item.mean()-bu_for_users_that_reviewed_i-self._mean
        if self._dampening:
            num_users_reviewed_item = column[column.notnull()].size
            damp_factor = num_users_reviewed_item/(num_users_reviewed_item+self._dampening_term)
            bi*=damp_factor
        return bi
        
    def predict(self, X):
        return self._mean + self._bu[X.UserID].values + self._bi[X.gameID].values
    

### A little Unit Testing for Sanity's sake.
I just made a simple 3x3 test data set with one missing value. I ran through the calculations by hand, and set up a little battery of tests to make sure it all works. I print out the 3 pieces of info so you can see visually. There's TDD_test_X, the user, item pair I predict. TDD_train_X is the list of values at the bottom, the middle matrix is TDD_test_X blown up into the ratings matrix (with the missing value I'm testing for showing). As long as this cell compiles without triggering an assertion error, things are working fine.

In [3]:
np.random.seed(42)

test_ratings_matrix = pd.DataFrame(np.random.randint(1,10,size=(3,3)),columns=map(int,list('456')),index=map(int,list('123')))
test_ratings_matrix.loc[3,6] = np.NaN
#collapse test frame down the same format as our dataset. 3 columns, user
TDD_train_X = test_ratings_matrix.stack().reset_index()
TDD_train_X.columns = ['UserID','gameID','rating']
TDD_test_X = pd.DataFrame(data={'UserID':[3],'gameID':[6]})
display(TDD_test_X)
display(test_ratings_matrix)
display(TDD_train_X)

def test_baseline_predictor():
    predictor = Base_Predictor()
    predictor.fit(TDD_train_X[['UserID','gameID']],TDD_train_X.rating)
    assert predictor._mean == 6.125, "The incorrect mean was calculated for the baseline test set"
    assert predictor._bu.tolist() == [0.20833333333333304,-1.125,1.375], "The wrong bu values were calculated for the baseline test"
    assert predictor._bi.tolist() == [0.05555555555555536, 0.05555555555555536, -0.16666666666666607], "incorrect bi was calculated for baseline test"
    assert predictor.predict(TDD_test_X).tolist() == [7.333333333333334], "baseline prediction for the test value was incorrect"
        

test_baseline_predictor()

Unnamed: 0,UserID,gameID
0,3,6


Unnamed: 0,4,5,6
1,7,4,8.0
2,5,7,3.0
3,7,8,


Unnamed: 0,UserID,gameID,rating
0,1,4,7.0
1,1,5,4.0
2,1,6,8.0
3,2,4,5.0
4,2,5,7.0
5,2,6,3.0
6,3,4,7.0
7,3,5,8.0


## User-User Collaborative Filtering

This is the system described in section 2.2 of the linked paper. The idea is that to predict the rating of user U and item I, you use the normalized average rating of the N most similar users to U who have reviewed item I. There are multiple similarity measures that can be used, and several other hyper paramters that can be tweaked that can be fed into the class constructor for testing and comparison. 

In [108]:
train_X, test_x, train_y,test_y = train_test_split(elite[['UserID','gameID']],elite.rating,test_size=.3,random_state=42)
predictor = U_U_predictor()
predictor.fit(train_X,train_y)


3.2 s ± 167 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [110]:
from sklearn.base import BaseEstimator, RegressorMixin

class U_U_predictor(BaseEstimator, RegressorMixin):
    
    #ratings matrix from the actual training values
    #_R 
    
    #precomputing user's average ratings and std to save time later.
    #_user_average_rating
    #_user_standard_deviation
    
    #user similarity matrix (size user x user)
    #_S
    
    #function that changes depending on selected similarity metric (cosine, pearson, spearman, etc.)
    #_calculate_user_similarity
    
    #switches between equation 2.6 and 2.7 in the paper
    #_normalize_to_z_scores
    
    #the paper suggests a dampening threshhold to keep users from sparse reviews getting rated as overly similar
    #_pearson_threshold
    
    #how many nearest neighbors to look at when computing rating predictions
    #_N_similar
        
    def __init__(self, similarity_type = 'pearson', normalize_to_z_scores=False, pearson_threshold=50,N_similar=20):
        self._normalize_to_z_scores = normalize_to_z_scores
        self._pearson_threshold = pearson_threshold
        self._N_similar = N_similar
        if similarity_type=='cosine':
            self._calculate_user_similarity = self._cosine_similarity
        else: self._calculate_user_similarity = self._pearson_calculate_user_similarity
            
    def fit(self,X,y):
        self._calculate_ratings_matrix(X,y)
        self._calculate_user_similarity_matrix_s_vector()
    
    def _calculate_ratings_matrix(self, X,y):
        self._R = pd.concat([X,y],axis=1).pivot_table(index='UserID',columns='gameID',values='rating')
        
        #preprocessing to make user similarities easier to calculate
        self._user_average_rating = self._R.mean(axis=1)
        self._user_standard_deviation = self._R.std(axis=1)
        
    def _calculate_user_similarity_matrix_s_vector(self):
        #initialize our similarity matrix _S, and our temp numpy matrix we'll be using while calculating.
        self._S = pd.DataFrame(index=self._R.index, columns = self._R.index.rename('User_Prime_ID'), data=0.0)
        temp = np.full(self._S.shape,np.nan)
        
        #We have a user x user matrix of similarity values, but we don't need to do the main diagonal (user1 x user1
        #will always have 1.0) and since the top and bottom diagonals are identical (since user1xuser2 = user2xuser1)
        #we only bother calculating along the upper triangle. We go row by row, the row sizes decrease as we go.
        for index,user1 in enumerate(self._R.index[:-1]):
            user2s = self._R.index[index+1:]
            temp[index,index+1:] = self._pearson_r_vector(user1,index,user2s)
            
        #now that we have the upper triangle values, all we have to do is mirror it to the bottom and we're done.
        i_lower = np.tril_indices(temp.shape[0], -1)
        temp[i_lower] = temp.T[i_lower] 
        #turn our numpy temp matrix back into a dataframe.
        self._S = pd.DataFrame(temp, columns=self._S.columns, index=self._S.index)
            
    def _pearson_r_vector(self,user1,user1_index,user2s):
        """Vectorized implementation of the user x user algorithm. User1 is a single value, index is where user1 appears
        in the list, and user2s is a vector of all the users that follow User1."""
        #get the average for user 1, and a vector of averages for all the other users.
        user1_average = self._user_average_rating[user1]
        user2_averages = self._user_average_rating[user1_index+1:]
        
        #find where user1 and user2s have reviewed items. (left half is a bool vector, right is a bool matrix).
        #End matrix has each row as a boolean vector showing which items both user1 and the user2 for that row reviewed  
        #We're reversing since we want False where both users reviewed the same item, and True elsewhere.
        mask = ~(self._R.iloc[user1_index].notnull().as_matrix() & self._R.iloc[user1_index+1:].notnull().as_matrix())

        #Turns out working directly with numpy matrixes is faster, so that's what I do from here on out.
        #First, null out any item reviews for user2s where user1 didn't have a review.
        user2_ratings = self._R.iloc[user1_index+1:].as_matrix()
        user2_ratings[mask] = np.NaN
        
        #now we get a matrix of user1 reviews. Each row corresponds to one of the users in user2s, with all reviews
        #nulled except for items both user1 and the user in that row of user2s reviewed.
        user1_ratings = np.full(user2_ratings.shape,np.NaN)
        user1_base_ratings = self._R.iloc[user1_index].as_matrix()
        for i in range(user1_ratings.shape[0]):
            user1_ratings[i] = user1_base_ratings
            user1_ratings[i, mask[i]] = np.NaN
            
        #normalize by mean. Turns out subtracting a column vector from a matrix is fussy in numpy.
        user1_ratings -= user1_average
        user2_ratings = np.subtract(user2_ratings, user2_averages.values.reshape(-1,1))
        
        #now that we have our normalized reviews, for computational convenience I'm turning NaNs to 0s.
        nullmask = np.isnan(user1_ratings)
        user1_ratings[nullmask] = 0.0
        user2_ratings[nullmask] = 0.0
        
        #pearsonr is cov(1,2)/sqrt(Var(1)*Var(2)). This is just a vectorized implementation, doing row-wise dot 
        #products between two vectors.
        variance1 = (user1_ratings * user1_ratings).sum(axis=1)
        variance2 = (user2_ratings * user2_ratings).sum(axis=1)
        denom = np.sqrt(variance1*variance2)
        covariance = (user1_ratings * user2_ratings).sum(axis=1)

        #catch any divide by 0 errors. Any user2s with 0 variance will produce a NaN.
        with np.errstate(divide='ignore',invalid='ignore'):
            ret = covariance/denom
            ret[np.isnan(ret)] = 0.0
            
        return ret
        
    def _cosine_similarity(self):
        pass
    
    def _find_N_similar_users(self, user1,user2):
        pass
    
    def _find_items_two_users_both_reviewed(self, user1,user2):
        return self._items_reviewed[user1].intersection(self._items_reviewed[user2])
    
    def predict(self,X):
        return X.apply(lambda row: self._single_prediction(row), axis=1).values
        
    def _single_prediction(self, row):
        user_mean = self._user_average_rating[row.UserID]
        N = self._N_similar
    
        #nearest users IDs are the index of this series, s(u,u') is the values. It's filtered so only users
        #who have also rated the game in question are considered. 
        N_nearest_similar = self._S[row.UserID][self._R[row.gameID].notnull()].nlargest(N)
    
        #vectors for most similar user information
        similar_users_means = self._user_average_rating[N_nearest_similar.index].values
        similar_users_item_ratings = self._R.loc[N_nearest_similar.index, row.gameID].values
    
        user_std = 1.0
        similar_users_stds = np.full(N,1.0)
        if(self._normalize_to_z_scores):
            user_std = self._user_standard_deviation[row.UserID]
            similar_users_stds = self._user_standard_deviation[N_nearest_similar.index].values
    
        similar_users_z_values = (similar_users_item_ratings-similar_users_means)/similar_users_stds
        prediction = user_mean + ((user_std/N) * np.dot(N_nearest_similar.values,similar_users_z_values))
    
        return prediction



In [164]:
train_X, test_x, train_y,test_y = train_test_split(elite[['UserID','gameID']],elite.rating,test_size=.3,random_state=42)
def test_user_user_similarity_matrix():
    predictor = U_U_predictor()
    predictor.fit(train_X,train_y)
    
    S = pd.read_pickle('user_S')
    #display(S.equals(predictor._S))
    display(S.head())
    
def profile_pearson_r():
    test_row = pd.Series(data={'UserID':272.0,'User_Prime_ID':388.0,'rating':0})
    predictor = U_U_predictor()
    predictor._calculate_ratings_matrix(train_X,train_y)
    predictor._pearson_calculate_user_similarity(test_row)


test_pearson_r()
#%lprun -f U_U_predictor._pearson_calculate_user_similarity profile_pearson_r()

True

In [165]:
    S = pd.read_pickle('user_S')
    display(S.head())

User_Prime_ID,272,388,430,2044,3080,3256,3557,5038,5217,5480,...,180167,180775,181123,181339,181472,187094,189973,191116,192057,193339
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
272,,-0.056713,0.042577,-0.083717,0.006745,-0.150557,-0.013788,-0.178553,-0.064375,-0.12651,...,-0.179775,-0.081585,-0.050838,-0.101766,-0.118772,-0.077584,0.082922,0.011729,-0.095787,-0.074771
388,-0.056713,,0.046515,0.153163,0.097794,0.197996,0.285074,0.177844,0.293407,0.350157,...,0.036938,0.208991,0.37145,0.189091,0.320189,0.261483,-0.05857,0.348025,0.142535,0.224021
430,0.042577,0.046515,,0.234026,0.097142,0.140773,0.321387,0.087036,0.087268,0.201068,...,0.107156,0.285411,0.265436,0.290484,0.087081,0.095054,0.263194,0.042129,0.173206,0.265883
2044,-0.083717,0.153163,0.234026,,0.434913,0.340015,0.420451,0.290227,0.03639,0.310784,...,0.344774,0.35466,0.358454,0.300479,0.251006,0.209808,0.319423,0.041889,0.065291,0.186324
3080,0.006745,0.097794,0.097142,0.434913,,0.24052,0.436059,0.127337,-0.014994,0.300714,...,0.000822,0.1708,0.327374,0.256311,0.029283,0.218664,0.167901,-0.068388,0.179503,0.360398


In [115]:
#was 189 seconds
#was 97 seconds. 
#was 49 seconds
#calling it good at 3.1 seconds

train_X, test_x, train_y,test_y = train_test_split(frequent[['UserID','gameID']],frequent.rating,test_size=.3,random_state=42)


predictor = U_U_predictor()
#%lprun -f predictor.fit predictor.fit(train_X,train_y)
%time predictor.fit(train_X,train_y)

CPU times: user 2min 57s, sys: 496 ms, total: 2min 58s
Wall time: 5min 7s


## Model Selection and Error checking

Now that I've gotten some models built out, I can use Sklearn's framework to check out different prediction systems, compare RMSE, and see what kind of model works the best with this dataset.

In [178]:
from sklearn.metrics import mean_squared_error

#predictor = Base_Predictor()
#train_X, test_x, train_y,test_y = train_test_split(elite[['UserID','gameID']],elite.rating,test_size=.3,random_state=42)
#predictor.fit(train_X,train_y)

predictions = predictor.predict(test_x)

mse = mean_squared_error(predictions,test_y)
print(np.sqrt(mse))

1.32964534833


In [186]:
predictor = U_U_predictor(normalize_to_z_scores=True, N_similar=10)
predictor.fit(train_X,train_y)

predict = predictor.predict(test_X)

mse = mean_squared_error(predict,test_y)
print(np.sqrt(mse))

1.31049914233


In [184]:
predictor = U_U_predictor(normalize_to_z_scores=True, N_similar=5)
predictor.fit(train_X,train_y)

predict = predictor.predict(test_X)

mse = mean_squared_error(predict,test_y)
print(np.sqrt(mse))

1.30633111229


In [185]:
predictor = U_U_predictor(normalize_to_z_scores=True, N_similar=3)
predictor.fit(train_X,train_y)

predict = predictor.predict(test_X)

mse = mean_squared_error(predict,test_y)
print(np.sqrt(mse))

1.31054160406


In [187]:
predictor = Base_Predictor()
predictor.fit(train_X,train_y)

predictions = predictor.predict(test_x)

mse = mean_squared_error(predictions,test_y)
print(np.sqrt(mse))

1.30322536673
