## Board game recommendation engine
### example of KFolds cross-validation

#### John Burt


#### Purpose of this notebook:

This is a simplified example of how to implement K-fold cross-validation of a user rating predictor, using the BoardGameGeek dataset.

In this case, the rating predictor is very simple: for each rating in the test set, it returns the mean rating that game received in the training set, regardless of user.  

#### WARNING: If you use the all-users data set, some users will have only one rating. When KFold randomly splits the rating data into training and test sets, it will likely put some of those users' only rating into the test set, so your rating estimation method must be able to generate ratings for users with no ratings in the training set! The same applies to any game whose ratings all land in the test set.

In [65]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')
# ---

%matplotlib inline
import pandas as pd
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np

from datetime import datetime

pd.options.display.max_rows = 100

# --- ratings data ---
# Separate test-ratings file, read as-is.
testdata = pd.read_csv('boardgame-users-test.csv')

# User ratings. The CSV labels the user column with an attribution string,
# so relabel that column 'userID' right after reading.
# Alternative ratings files that can be swapped in for the path below:
#   'boardgame-elite-users.csv'
#   'boardgame-frequent-users.csv'
userdata = pd.read_csv('boardgame-users.csv').rename(
    columns={"Compiled from boardgamegeek.com by Matt Borthwick": 'userID'}
)

# --- title data ---
# Read the game titles, relabel the game-ID column 'gameID', and use it as
# the index of the frame.
titledata = (
    pd.read_csv('boardgame-titles.csv')
    .rename(columns={"boardgamegeek.com game ID": 'gameID'})
    .set_index("gameID")
)

In [66]:
# Using KFolds provided row indices train_index and test_index, extract subsets from userdata. 
#
# Pivot the training data to create rows of userID, with columns of gameID. 
# If a user rated a game, it will be at user x game and if not, then the cell will be NAN
def get_kf_pivot_data(userdata, train_index, test_index):
    """Split ``userdata`` by positional row indices into a pivoted training
    table and a long-format test table.

    Parameters
    ----------
    userdata : pandas.DataFrame
        Ratings with columns ``userID``, ``gameID`` and ``rating``.
    train_index, test_index : array-like of int
        Positional row indices (as used by ``.iloc``) for each subset.

    Returns
    -------
    tuple
        ``(train_p, testdata)``: ``train_p`` has one row per userID and one
        column per gameID, holding the rating or NaN where that user/game
        pair is absent from the training rows; ``testdata`` is the selected
        test rows of ``userdata``, not pivoted.
    """
    train_rows = userdata.iloc[train_index]
    user_by_game = train_rows.pivot(index="userID", columns="gameID", values="rating")
    held_out_rows = userdata.iloc[test_index]
    return user_by_game, held_out_rows

In [67]:
# a simple rating predictor: 
#  for given test user + game, return mean rating for that game
#
# This func assumes:
#  train_p is a pandas df w/ rows=userID, cols=gameID
#  testdata is a pandas df w/ cols: userID, gameID
def predict_ratings(train_p, testdata):
    """Predict each test rating as the training-set mean rating of its game.

    Parameters
    ----------
    train_p : pandas.DataFrame
        Training ratings with rows = userID and columns = gameID; NaN marks
        a game the user has not rated.
    testdata : pandas.DataFrame
        Rows to predict. Only the ``gameID`` column is read, so the user is
        ignored.

    Returns
    -------
    list of float
        One prediction per row of ``testdata``, in row order. A game that
        has no column in ``train_p`` (all of its ratings fell outside the
        training data) gets the mean of all training ratings instead of
        raising ``KeyError``. If ``train_p`` holds no ratings at all, the
        predictions are NaN.
    """
    # Per-game mean down each column; NaN (unrated) cells are skipped.
    game_mean = train_p.mean()

    # Fallback for games unseen in training: mean over every rating present.
    # Built from sum/count so the dense table is never copied or reshaped.
    n_ratings = train_p.count().sum()
    overall_mean = train_p.sum().sum() / n_ratings if n_ratings else float("nan")

    # Vectorized lookup: map() yields NaN for gameIDs missing from game_mean,
    # which fillna() then replaces with the fallback.
    return testdata["gameID"].map(game_mean).fillna(overall_mean).tolist()
    

In [69]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

# Number of folds and the shuffle seed. Fixing the seed makes the fold
# assignment, and therefore the RMSE values printed below, repeatable
# across runs (Restart Kernel -> Run All gives the same numbers).
N_SPLITS = 10
KFOLD_SEED = 42

# set up kfold to generate N_SPLITS test sets, with shuffled indices for selecting from userdata
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=KFOLD_SEED)

# iterate through test sets (folds), test prediction model and print RMSE result
rmse = []  # one RMSE per fold, in fold order
for i, (train_index, test_index) in enumerate(kf.split(userdata), start=1):
    print("k-fold train/test set #%d: "%(i), end="")
    # pivot the training rows; keep the test rows in long format
    train_p, kf_testdata = get_kf_pivot_data(userdata, train_index, test_index)
    ratings_pred = predict_ratings(train_p, kf_testdata)
    # RMSE between the held-out ratings and the predictions for this fold
    rmse.append(sqrt(mean_squared_error(kf_testdata.rating, ratings_pred)))
    print("RMSE = %2.3f"%(rmse[-1]))
    # break # uncomment this to only run one fold

print("\nmean RMSE = %2.3f"%(np.mean(rmse)))

k-fold train/test set #1: RMSE = 1.385
k-fold train/test set #2: RMSE = 1.383
k-fold train/test set #3: RMSE = 1.379
k-fold train/test set #4: RMSE = 1.383
k-fold train/test set #5: RMSE = 1.381
k-fold train/test set #6: RMSE = 1.384
k-fold train/test set #7: RMSE = 1.382
k-fold train/test set #8: RMSE = 1.382
k-fold train/test set #9: RMSE = 1.381
k-fold train/test set #10: RMSE = 1.385

mean RMSE = 1.383
