# Board Games
__________________________

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import sqlite3
import os
import pprint
import tempfile

from typing import Dict, Text

# import math
# import random
# import sklearn
# import scipy
import cv2

# Recommender
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.layers import Embedding, Dense, Layer
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds


### Import data  

Import the cleaned dataframe, reference dictionaries, and user ratings.

In [2]:
# Open dataframe
infile = open('../datasets/boardgames/clean_bgg_GameItem.pkl', 'rb')
df = pickle.load(infile)
infile.close()

In [3]:
df.shape

(7929, 20)

In [4]:
# Open dictionaries
infile = open('../datasets/boardgames/ref_dictionaries.pkl', 'rb')
ref_dicts = pickle.load(infile)
infile.close()

In [5]:
# Extract ratings from sqlite database
conn = sqlite3.connect("../datasets/boardgames/bgg_5yrs_RatingItem.db")
cur = conn.cursor()

user_df = pd.read_sql_query("""
SELECT *,
    COUNT(bgg_user_name) OVER
         (PARTITION BY bgg_user_name) AS user_count
FROM bgg_ratings

""", conn)

user_df.head()

Unnamed: 0,bgg_user_name,bgg_id,bgg_user_rating,year,month,user_count
0,fu_koios,223033,9.0,2017,10,1
1,-=yod@=-,7,7.5,2015,3,173
2,-=yod@=-,42,6.5,2016,10,173
3,-=yod@=-,217,6.75,2016,10,173
4,-=yod@=-,432,7.5,2017,5,173


In [6]:
cur.close()
conn.close()

In [7]:
user_df.shape

(12278237, 6)

## Preprocessing

A common problem in recommender systems is known as ***user cold-start***, where it is difficult to recommend items for users with very few number of consumed items (in this case rated board games), due to lack of information to model their preferences. As such, we choose to only keep the users with at least 30 rated board games.

In [8]:
# Filtering dataframe to contain users with at least 30 rates
user_df = user_df[user_df['user_count']>=30]
user_df.shape

(10667845, 6)

We also want to extract the user ratings for the board games that we are left with after extensive EDA and cleaning.

In [9]:
# Filtering dataframe to user ratings of the board games we are concerned with
user_df = user_df[user_df['bgg_id'].isin(df['bgg_id'])]
user_df.shape

(9182849, 6)

In [10]:
user_df['user_count'].describe()

count    9.182849e+06
mean     2.552930e+02
std      3.462861e+02
min      3.000000e+01
25%      7.900000e+01
50%      1.560000e+02
75%      3.160000e+02
max      6.717000e+03
Name: user_count, dtype: float64

In [11]:
# Save df as .pkl
outfile = open('../datasets/boardgames/bgg_users_2015.pkl', 'wb')
pickle.dump(user_df, outfile)
outfile.close()

#### Board Game Mapper

We require a mapper for board game id to the board game name since our predictions would be done on the board game ids. This mapper will be user at the end after an actual prediction has been made.

In [12]:
# Mapper (bgg_id -> name)
bg_mapper = {}
for i, name in zip(df['bgg_id'], df['name']):
    bg_mapper[str(i)] = name

#### Unique id  

We require to map the board game ids to embedding vectors in the models later. Hence, we need lists of the unique board game ids and unique user ids.

In [13]:
# Extract unique users and unique board game ids
# Need to keep it as numpy.ndarray
unique_user = user_df['bgg_user_name'].unique()
unique_bgg_id = df['bgg_id'].unique().astype(str)

In [14]:
unique_bgg_id[:10]

array(['3', '9', '10', '11', '12', '13', '14', '16', '17', '25'],
      dtype='<U21')

## Retrieval Model

This is a two-tower retrieval model, we will build each tower separately and then combine them in the final model.

#### Split into train and test sets

We want to split the user dataframe into train and test sets, by time. The data up to time $T$ would be used to predict user rating after $T$.

In [15]:
# Sort user dataframe by date
user_df = user_df.sort_values(by=['year', 'month']).reset_index(drop=True)

In [16]:
# Convert train and test into Tensor Datasets
user_dict = {'bgg_id': user_df['bgg_id'].astype(str),
            'bgg_user_name': user_df['bgg_user_name']}
user_data = tf.data.Dataset.from_tensor_slices(user_dict)

In [17]:
# Train/test split
num_entries = tf.get_static_value(user_data.__len__())
train_split = int(np.ceil(0.8*num_entries))
test_split = int(np.floor(0.2*num_entries))

user_train = user_data.take(train_split)
user_test = user_data.skip(train_split).take(test_split)

In [298]:
## THIS WAS LAST SUCCESSFUL
# Split dataset with shuffle False
# We only need the user name and bgg_id
# user_train, user_test = train_test_split(user_df[['bgg_id', 'bgg_user_name']], shuffle=False, test_size=0.2)

In [None]:
# Convert train and test into Tensor Datasets
# user_train = user_train.to_dict('records')
# user_train = tf.data.Dataset.from_tensor_slices(user_train)

In [None]:
# user_test = user_test.to_dict('records')
# user_test = tf.data.Dataset.from_tensor_slices(user_test)

In [73]:
## THIS WAS LAST SUCCESSFUL
# Convert train and test into Tensor Datasets
# user_train['bgg_id'] = user_train['bgg_id'].astype(str)
# user_test['bgg_id'] = user_test['bgg_id'].astype(str)
# user_train = tf.data.Dataset.from_tensor_slices(user_train)
# user_test = tf.data.Dataset.from_tensor_slices(user_test)

#### Query tower

In [18]:
# Dimensionality of the query
embedding_dimension = 32

# Define the model
user_model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user, mask_token=None),
    # Additional embedding to account for unknwon tokens
    tf.keras.layers.Embedding(len(unique_user) + 1, embedding_dimension)
])

#### Candidate tower

In [None]:
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Embedding(1000, 64))
# # The model will take as input an integer matrix of size (batch,
# # input_length), and the largest integer (i.e. word index) in the input
# # should be no larger than 999 (vocabulary size).
# # Now model.output_shape is (None, 10, 64), where `None` is the batch
# # dimension.
# input_array = np.random.randint(1000, size=(32, 10))
# model.compile('rmsprop', 'mse')
# output_array = model.predict(input_array)
# print(output_array.shape)


In [19]:
# Define the model for board game names
bg_model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_bgg_id, mask_token=None),
    tf.keras.layers.Embedding(len(unique_bgg_id) + 1, embedding_dimension)
])

#### Metrics

In the training data, there are positive (bgg_id, bgg_user_name) pairs. To gauge on how good the model is, we need to compare the affinity score that the model calculates for a particular pair to the scores of all the other possible candidates. In other words, the higher the score for the positive pair as compared to other candidates, the more accurate the model is.

We use `FactorizedTopK` metric which requires the dataset of candidates that are used as implicit negatives for evaluation. We are implicitly assuming that if a user did not rate a board game, he/she do not like that board game as much.

In [20]:
type(df['bgg_id'].values.astype(str))

numpy.ndarray

In [21]:
type(tf.data.Dataset.from_tensor_slices(df['bgg_id'].values.astype(str)))

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [31]:
# Convert to Tensor Dataset object
bgg_ids = tf.data.Dataset.from_tensor_slices(df['bgg_id'].values.astype(str))

# The metrics
metrics = tfrs.metrics.FactorizedTopK(
    candidates=bgg_ids.batch(128).map(bg_model)
)

#### Loss

We use the `Retrieval` task object to bundle together the loss function and metric computation. This becomes a Keras layer that takes the embeddings from the two towers as arguments, and returning the computed loss.

In [32]:
# Setting up the task
task = tfrs.tasks.Retrieval(
    metrics=metrics
)

#### Full model

We want to combine all of the above together into a model. We use `tfrs.Model` as the base model which take care of creating the appropriate training loop to fit the model.

In [356]:
user_test.__len__()

<tf.Tensor: shape=(), dtype=int64, numpy=1836569>

In [24]:
# Check the format of train and test datasets
count = 0

for element in user_test.batch(4096).as_numpy_iterator():
    if count <=3:
        print(element)
        count += 1
    else:
        break

{'bgg_id': array([b'146021', b'214880', b'39856', ..., b'221965', b'244654',
       b'246855'], dtype=object), 'bgg_user_name': array([b'nickster1970', b'nicktaruffi', b'nickwatt', ..., b'olli_gold',
       b'olli_gold', b'olli_gold'], dtype=object)}
{'bgg_id': array([b'247694', b'266444', b'760', ..., b'98778', b'100901', b'102652'],
      dtype=object), 'bgg_user_name': array([b'olli_gold', b'olli_gold', b'olliesons', ..., b'pchomp',
       b'pchomp', b'pchomp'], dtype=object)}
{'bgg_id': array([b'119890', b'129622', b'140620', ..., b'113294', b'182874',
       b'201808'], dtype=object), 'bgg_user_name': array([b'pchomp', b'pchomp', b'pchomp', ..., b'pouringraine',
       b'pouringraine', b'pouringraine'], dtype=object)}
{'bgg_id': array([b'269725', b'224517', b'264220', ..., b'284294', b'184267',
       b'50381'], dtype=object), 'bgg_user_name': array([b'pouringraine', b'pouvla', b'pouvla', ..., b'raquelnilla',
       b'rarekarrde', b'raremind'], dtype=object)}


In [33]:
# Full model
class BGRetrievalModel(tfrs.Model):
    
    def __init__(self, user_model, bg_model):
        super().__init__()
        self.bg_model: tf.keras.Model = bg_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task
    
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # Picking out the user features and passing them into the user model
        # Format of each entry is ['bgg_id', 'bgg_user_name']
        user_embeddings = self.user_model(features['bgg_user_name'])
        
        # Picking out the board games features, passing into bg model
        positive_bg_embeddings = self.bg_model(features['bgg_id'])
        
        # Task computes the loss and the metrics
        return self.task(user_embeddings, positive_bg_embeddings)

### Fitting and Evaluation  

In [39]:
# Instantiate and compile the model
retrieval_model = BGRetrievalModel(user_model, bg_model)
retrieval_model.compile(optimizer=Adagrad(learning_rate=0.1))

In [36]:
# Batch and cache the datasets, did not shuffle to keep time order
cached_user_train = user_train.batch(8192).cache()
cached_user_test = user_test.batch(4096).cache()

In [37]:
cached_user_test

<CacheDataset shapes: {bgg_id: (None,), bgg_user_name: (None,)}, types: {bgg_id: tf.string, bgg_user_name: tf.string}>

In [40]:
# Training the model
retrieval_model.fit(cached_user_train, epochs=3)

Epoch 1/3
 13/897 [..............................] - ETA: 10:00 - factorized_top_k/top_1_categorical_accuracy: 0.0115 - factorized_top_k/top_5_categorical_accuracy: 0.0433 - factorized_top_k/top_10_categorical_accuracy: 0.0701 - factorized_top_k/top_50_categorical_accuracy: 0.1840 - factorized_top_k/top_100_categorical_accuracy: 0.2594 - loss: 71218.0691 - regularization_loss: 0.0000e+00 - total_loss: 71218.0691

KeyboardInterrupt: 

In [86]:
# Evaluate the model
retrieval_model.evaluate(cached_user_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.23830735683441162,
 'factorized_top_k/top_5_categorical_accuracy': 0.24498885869979858,
 'factorized_top_k/top_10_categorical_accuracy': 0.25167039036750793,
 'factorized_top_k/top_50_categorical_accuracy': 0.27394208312034607,
 'factorized_top_k/top_100_categorical_accuracy': 0.288418710231781,
 'loss': 0.8462238907814026,
 'regularization_loss': 0,
 'total_loss': 0.8462238907814026}

These values tell us whether the true positive is in the top-k retrieved items from the entire candidate set. For example, a top-50 categorical accuracy metric of 0.3 means that 30% of the top 50 retrieved items are true positives.

Comparing the metrics, there is a considerable difference between the train and test accuracies, suggesting that the model has been overfitted. It is common since the model has many parameters. A low top-k accuracy would also suggest that the model is recommending board games to users who already rated those board games.

(Maybe can try regularization to generalize better to unseen data.)

### Predictions

In [87]:
# 
df.head(2)

Unnamed: 0,bgg_id,name,year,game_type,designer,artist,publisher,min_players,max_players,min_age,min_time,max_time,category,mechanic,rank,num_votes,avg_rating,stddev_rating,bayes_rating,complexity
0,3,Samurai,1998,5497,2,11883,"17,133,267,29,7340,7335,41,2973,4617,1391,8291...",2,4,10.0,30.0,60.0,10091035,208020402026284620042002,207.0,14648.0,7.45046,1.18569,7.24774,2.4885
1,9,El Caballero,1998,5497,78,74,2671333,2,4,13.0,90.0,90.0,1020,20802002,2679.0,1374.0,6.46354,1.43462,5.94897,3.1824


In [88]:
# Board game dataset needs to be Tensorflow object
bgg_ids

<TensorSliceDataset shapes: (), types: tf.string>

In [89]:
# Create a model that takes in raw query features
index = tfrs.layers.factorized_top_k.BruteForce(retrieval_model.user_model)

# Recommends a board game out of the entire boardgame dataset
index.index(bgg_ids.batch(128).map(retrieval_model.bg_model), bgg_ids)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x20492623c10>

In [118]:
# Get recommendation
_, board_games = index(tf.constant(['-johnny-']))
print(f'Recommendations for -johnny-: {board_games[0, :3]}')

Recommendations for -johnny-: [b'224031' b'28185' b'85036']


We are able to successfully recommend top 3 games (number of games arbituarily decided) to a user with the username '-johnny-' based on the trained embeddings for both the query tower and candidate tower. However, we are recommending the board game id right now, and we want to map that to the board game name for it to be more meaningful. 

In [119]:
# Map the predicted bgg_id to the board game name
named_games = []
for bgg_id in board_games[0, :3]:
    named_games.append(bg_mapper[bgg_id.numpy().decode("utf-8")])

print(f'Recommendations for -johnny-: {named_games}')

Recommendations for -johnny-: ['Cartagena', "The Kaiser's Pirates", '20th Century']


In [115]:
# Check if our model is re-recommending the user a game he or she has already played
user_df[user_df['bgg_user_name']=='-johnny-']

Unnamed: 0,bgg_user_name,bgg_id,bgg_user_rating,year,month,user_count
1,-johnny-,59946,6.0,2015,1,45
2,-johnny-,166384,7.0,2015,1,45
73899,-johnny-,150376,6.0,2015,2,45
132254,-johnny-,150658,6.0,2015,3,45
191533,-johnny-,478,6.0,2015,4,45
191534,-johnny-,103885,5.0,2015,4,45
354534,-johnny-,18833,9.0,2015,7,45
354535,-johnny-,54307,8.0,2015,7,45
1056490,-johnny-,94,8.0,2016,4,45
1056491,-johnny-,26566,4.0,2016,4,45


Observing the board games which the user '-johnny-' has rated, we see that the top 3 recommended games are not within them. This is still a good sign, but it may be just so happened that these 45 entries have no false positives. 

The retrieval model is useful for getting quick recommendations, but it is just based on the board game ids and user ids. This model is usually built to be more computationally efficient to filter out all candidates that the user is not interested in.

## Ranking Model

The ranking model is built to be used in tandem with the retrieval model, taking the outputs from the retrieval model and finetuning them to select the best possible recommendations.

#### Split into train and test sets

The train and test data will now include the user ratings to give a sense of ranking to the recommended board games.

In [142]:
rating_train, rating_test = train_test_split(user_df[['bgg_id', 'bgg_user_name', 'bgg_user_rating']], shuffle=False, test_size=0.2)

In [141]:
tf.convert_to_tensor(rating_test[['bgg_id', 'bgg_user_name']])

<tf.Tensor: shape=(1836570, 2), dtype=string, numpy=
array([[b'113294', b'nickster1970'],
       [b'146021', b'nickster1970'],
       [b'214880', b'nicktaruffi'],
       ...,
       [b'233867', b'zzzzzane'],
       [b'242302', b'zzzzzane'],
       [b'269210', b'zzzzzane']], dtype=object)>

In [143]:
tf.convert_to_tensor(rating_test['bgg_user_rating'])

<tf.Tensor: shape=(1836570,), dtype=float64, numpy=array([7. , 8. , 8. , ..., 8. , 7.5, 8. ])>

In [145]:
# Convert train and test into Tensor Datasets
tensor_user_train = tf.data.Dataset.from_tensor_slices(rating_train[['bgg_id', 'bgg_user_name']].astype(str))
tensor_user_test = tf.data.Dataset.from_tensor_slices(rating_test[['bgg_id', 'bgg_user_name']].astype(str))
tensor_rating_train = tf.data.Dataset.from_tensor_slices(rating_train['bgg_user_rating'].astype('float32'))
tensor_rating_test = tf.data.Dataset.from_tensor_slices(rating_test['bgg_user_rating'].astype('float32'))

In [146]:
# Combine
rating_train = tf.data.Dataset.zip((tensor_user_train, tensor_rating_train))
rating_test = tf.data.Dataset.zip((tensor_user_test, tensor_rating_test))

In [147]:
rating_train

<ZipDataset shapes: ((2,), ()), types: (tf.string, tf.float32)>

In [149]:
# Convert train and test into Tensor Datasets
# rating_train['bgg_id'] = rating_train['bgg_id'].astype(str)
# rating_test['bgg_id'] = rating_test['bgg_id'].astype(str)
# # rating_train['bgg_user_rating'] = rating_train['bgg_user_rating'].astype('float32')
# # rating_test['bgg_user_rating'] = rating_test['bgg_user_rating'].astype('float32')


# rating_train = tf.data.Dataset.from_tensor_slices(rating_train[['bgg_id', 'bgg_user_name']], rating_train['bgg_user_rating'])
# rating_test = tf.data.Dataset.from_tensor_slices(rating_test[['bgg_id', 'bgg_user_name']], rating_test['bgg_user_rating'])

#### Ranking layers

The ranking model is composed of multiple layers for ranking tasks.

In [262]:
# Ranking tasks
class RankingModel(tf.keras.Model):

    def __init__(self):
        super().__init__()
        embedding_dimension = 32

        # User embeddings
        self.user_embeddings = Sequential([
            StringLookup(vocabulary=unique_user, mask_token=None),
            Embedding(len(unique_user) + 1, embedding_dimension)
        ])

        # Board game embeddings
        self.bg_embeddings = Sequential([
            StringLookup(vocabulary=unique_bgg_id, mask_token=None),
            Embedding(len(unique_bgg_id) + 1, embedding_dimension)
        ])

        # Predictions
        self.ratings = Sequential([
            Dense(256, activation="relu"),
            Dense(64, activation="relu"),
          # Rating predictions in the final layer.
            Dense(1)
        ])
    
    def call(self, inputs):
        bgg_user_name, bgg_id = inputs
        user_embedding = self.user_embeddings(bgg_user_name)
        bg_embedding = self.bg_embeddings(bgg_id)
        return self.ratings(tf.concat([user_embedding, bg_embedding], axis=1))

In [277]:
# This model takes user names and bgg ids, and outputs a predicted rating
RankingModel()((['-johnny-'],['20545']))

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.02250871]], dtype=float32)>

In [269]:
# This model takes user names and bgg ids, and outputs a predicted rating
# RankingModel()(tf.convert_to_tensor((['-johnny-'],['20545'])))

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.01110805]], dtype=float32)>

#### Loss and metrics

This time, we use the `Ranking` task object to put together the loss function and metric computation. The metrics used is `RootMeanSquaredError`.

In [161]:
# The loss + metrics task
task = tfrs.tasks.Ranking(
    loss = MeanSquaredError(),
    metrics = [RootMeanSquaredError()]
)

#### Full model

We want to combine all of the above together into a model. We use `tfrs.Model` as the base model which take care of creating the appropriate training loop to fit the model.

In [179]:
rating_test.batch(10)

<BatchDataset shapes: ((None, 2), (None,)), types: (tf.string, tf.float32)>

In [258]:
# Check the format of train and test datasets
count = 0
for element in rating_test.as_numpy_iterator():
    if count <=10:
        print(element)
        count += 1
    else:
        break

(array([b'113294', b'nickster1970'], dtype=object), 7.0)
(array([b'146021', b'nickster1970'], dtype=object), 8.0)
(array([b'214880', b'nicktaruffi'], dtype=object), 8.0)
(array([b'39856', b'nickwatt'], dtype=object), 7.0)
(array([b'92828', b'nickwatt'], dtype=object), 7.0)
(array([b'218603', b'nickwatt'], dtype=object), 8.0)
(array([b'246784', b'nickwatt'], dtype=object), 8.5)
(array([b'247160', b'nickwatt'], dtype=object), 7.0)
(array([b'24181', b'nicnied'], dtype=object), 9.5)
(array([b'35497', b'nicnied'], dtype=object), 4.0)
(array([b'132372', b'nicnied'], dtype=object), 7.0)


In [247]:
# Full model
class BGRankingModel(tfrs.Model):
    
    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        # The loss + metrics task
        self.task: Layer = tfrs.tasks.Ranking(
            loss = MeanSquaredError(),
            metrics = [RootMeanSquaredError()]
        )
    
    def compute_loss(self, features, training=False) -> tf.Tensor:
        rating_predictions = self.ranking_model(
            ([features[0][1]], [features[0][0]]))
        
        # Task computes the loss and the metrics
        return self.task(labels=features[1], predictions=rating_predictions)

### Fitting and Evaluation

In [248]:
# Instantiate and compile the model
ranking_model = BGRankingModel()
ranking_model.compile(optimizer=Adagrad(learning_rate=0.1))

In [249]:
# Batch and cache the datasets, did not shuffle to keep time order
cached_rating_train = rating_train.batch(8192).cache()
cached_rating_test = rating_test.batch(4096).cache()

In [250]:
cached_rating_train.batch(8192)

<BatchDataset shapes: ((None, None, 2), (None, None)), types: (tf.string, tf.float32)>

In [251]:
# Check the format of train and test datasets
count = 0
for element in rating_test.as_numpy_iterator():
    if count <=2:
        print(element)
        count += 1
    else:
        break

(array([b'113294', b'nickster1970'], dtype=object), 7.0)
(array([b'146021', b'nickster1970'], dtype=object), 8.0)
(array([b'214880', b'nicktaruffi'], dtype=object), 8.0)


In [252]:
# Check the format of train and test datasets
count = 0
for element in rating_test.batch(8192).as_numpy_iterator():
    if count <=2:
        print(element[0][0][1])
        count += 1
    else:
        break

b'nickster1970'
b'pchomp'
b'raremind'


In [253]:
# Training the model
ranking_model.fit(cached_rating_train, epochs=3)

Epoch 1/3
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x2048b63f700>

In [278]:
# Evaluate the model
ranking_model.evaluate(cached_rating_test, return_dict=True)

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.


{'root_mean_squared_error': 1.4409151077270508,
 'loss': 1.7327253818511963,
 'regularization_loss': 0,
 'total_loss': 1.7327253818511963}

We observe that the rmse and loss are both lower than the train data, there may be some underfitting.

### Predictions

In [120]:
user_df.head()

Unnamed: 0,bgg_user_name,bgg_id,bgg_user_rating,year,month,user_count
0,-=yod@=-,160495,7.5,2015,1,173
1,-johnny-,59946,6.0,2015,1,45
2,-johnny-,166384,7.0,2015,1,45
3,-mide-,20545,6.0,2015,1,130
4,-mide-,145639,7.0,2015,1,130


In [288]:
def test(features, msg1, msg2):
    print(features)
    pass

In [295]:
dictdict = {'col1': [1,2,3,4],
           'col2': ['4','3','2','1']}

In [297]:
for x in tf.data.Dataset.from_tensor_slices(dictdict):
    print(x)

{'col1': <tf.Tensor: shape=(), dtype=int32, numpy=1>, 'col2': <tf.Tensor: shape=(), dtype=string, numpy=b'4'>}
{'col1': <tf.Tensor: shape=(), dtype=int32, numpy=2>, 'col2': <tf.Tensor: shape=(), dtype=string, numpy=b'3'>}
{'col1': <tf.Tensor: shape=(), dtype=int32, numpy=3>, 'col2': <tf.Tensor: shape=(), dtype=string, numpy=b'2'>}
{'col1': <tf.Tensor: shape=(), dtype=int32, numpy=4>, 'col2': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}


We want to explore more models which are able to utilize the rich features which our datasets possess, also to give better recommendations.

## Content-based Recommender

In content-based filtering, the features of the dataframe are broken down into "feature baskets". These are the characteristics that represent a board game. The main idea is that if the user likes certain categories, mechanics, or types of a certain board game, then it is likely the user likes another board game that has similar characteristics. 

In [None]:
df.shape

In [None]:
df.head()

In [None]:
test = tuple(df['bgg_id'])[:3]