In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import chain, combinations
import numpy as np

In [2]:
# load the datasets
game_data = pd.read_csv('data/game_info.csv', index_col=0)
user_data_train = pd.read_csv('data/user_data_train_no_comments.csv', index_col=0)
user_data_validation = pd.read_csv('data/user_data_validation_no_comments.csv', index_col=0)
user_data_test = pd.read_csv('data/user_data_test_no_comments.csv', index_col=0)

# compute values we may use later
mean_userscore_user_data_train = user_data_train['Userscore'].mean()
mean_userscore_user_data_train_validation = pd.concat([user_data_train, user_data_validation])['Userscore'].mean()
del user_data_train # we don't use the training data for anything other than computing mean userscores

In [3]:
# powerset function from https://stackoverflow.com/a/1482316
def powerset(s):
    '''
    Returns the powerset of iterable s
    '''
    return chain.from_iterable(combinations(s, r) for r in range(len(s) + 1))

In [4]:
# categories of game data to consider in content-based filtering
content_categories = ['Title', 'Publisher', 'Genre', 'Platform', 'No_Players']

def get_bag(category):
    '''
    Return a bag of strings/terms for tf-idf vectorization given the category name
    '''
    processing_func = None
    if category == 'Title':
        processing_func = lambda s: s.split() # split title on whitespace
    elif category == 'Genre':
        processing_func = lambda s: s.split(';') # split genre on semicolons
    else:
        processing_func = lambda s: [str(s)] # other categories may be a mix of single numbers/strings. i.e. No_Players and Platform

    return game_data[category].apply(processing_func)

def get_similarity_matrix(categories):
    '''
    Return the cosine similarity matrix using the given categories
    '''
    assert categories # assert it is not an empty list

    # retrieve a bag of strings/text per game to vectorize
    bag_of_texts = get_bag(categories[0])
    for category in categories[1:]:
        bag_of_texts += get_bag(category)

    # create a tf-idf vectorizor that filters out non-alphanumeric characters and ignores case
    tf = TfidfVectorizer(analyzer=lambda i: map(lambda s: ''.join(filter(str.isalnum, s.lower())), i))

    # compute the tf-idf matrix
    tfidf_matrix = tf.fit_transform(bag_of_texts)

    # compute and return the cosine similarity matrix
    return cosine_similarity(tfidf_matrix)

In [5]:
def make_game_avg_userscores(dataset, fallback_score):
    '''
    For each row in the dataset, give the game's average userscore if available.
    Otherwise use the fallback_score.
    '''
    avg_userscores = dataset.apply(lambda row: game_data.loc[row['Game_ID']]['Avg_Userscore'], axis=1)
    avg_userscores = pd.to_numeric(avg_userscores, errors='coerce')
    avg_userscores = avg_userscores.replace(np.nan, fallback_score)
    return avg_userscores

In [6]:
def make_predictions(dataset, similarity_matrix, use_game_avg_userscore, fallback_score):
    '''
    Make predictions for every rating from a given dataset using a given similarity matrix.
    fallback_score is used if a prediction can not be made and the game has no Avg_Userscore available.
    If use_game_avg_userscore is True, will attempt to report the game's Avg_Userscore instead of the fallback_score if available
    '''

    def make_prediction(rating):
        '''
        Make a prediction about the userscore of a particular rating from the dataset based on the user's rating of other games
        '''
        # get all ratings from the user
        user_ratings = dataset[dataset['Username'] == rating.Username]

        # make a prediction
        prediction = 0 # numerator of the weighted average
        normalization_factor = 0 # denominator of the weighted average
        for index, user_rating in user_ratings.iterrows():

            # ignore the rating we are trying to predict
            if index == rating.name:
                continue

            # a prediction is a similarity-weighted average of the user's ratings of other games
            similarity = similarity_matrix[user_rating.Game_ID, rating.Game_ID]
            prediction += user_rating.Userscore * similarity 
            normalization_factor += similarity

        if normalization_factor == 0:
            # if the user has not reviewed any similar game, then predict the average userscore of the game
            game_avg_userscore = pd.to_numeric(game_data.loc[rating.Game_ID].Avg_Userscore, errors='coerce')
            if game_avg_userscore != np.nan and use_game_avg_userscore:
                prediction = game_avg_userscore
            else:
                # if the game's average userscore is unavailable, then use the fallback score
                prediction = fallback_score
        else:
            # normalize the prediction
            prediction = prediction / normalization_factor

        return prediction
    
    return dataset.apply(lambda row: make_prediction(row), axis=1)

In [7]:
def compute_rmse(prediction, dataset):
    '''
    Computers the root-mean squared error with respect to userscore given prediction(s) and the dataset
    '''
    return ((prediction - dataset['Userscore'])**2).mean()**0.5

In [8]:
# validation step: test combinations of hyperparameters to find one that performs best on validation data
category_powerset = [categories for categories in powerset(content_categories)]
rmses = [float('inf')]
for categories in category_powerset:
    if not categories:
        continue

    similarity_matrix = get_similarity_matrix(categories)
    print('Making predictions using content categories:', ' '.join(categories))
    predictions = make_predictions(user_data_validation, similarity_matrix, False, 0)
    rmse = compute_rmse(predictions, user_data_validation)
    print('Finished! RMSE:', rmse)
    rmses.append(rmse)

Making predictions using content categories: Title
Finished! RMSE: 6.128860392913564
Making predictions using content categories: Publisher
Finished! RMSE: 7.067758570996894
Making predictions using content categories: Genre
Finished! RMSE: 3.790114323246962
Making predictions using content categories: Platform
Finished! RMSE: 4.151787427494914
Making predictions using content categories: No_Players
Finished! RMSE: 5.503915827235503
Making predictions using content categories: Title Publisher
Finished! RMSE: 5.93322010425121
Making predictions using content categories: Title Genre
Finished! RMSE: 3.5680433884674696
Making predictions using content categories: Title Platform
Finished! RMSE: 3.5732346540803834
Making predictions using content categories: Title No_Players
Finished! RMSE: 4.641266035693517
Making predictions using content categories: Publisher Genre
Finished! RMSE: 3.7410201954256417
Making predictions using content categories: Publisher Platform
Finished! RMSE: 3.84951887

In [9]:
best_rmse = min(rmses)
best_rmse_index = rmses.index(best_rmse)
best_categories = category_powerset[best_rmse_index] # = content_categories # The best combination was using all of them!
print('The best set of categories is', ' '.join(best_categories), 'with an RMSE of', best_rmse)

The best set of categories is Title Publisher Genre Platform No_Players with an RMSE of 2.840064275384498


In [10]:
# save results of validation testing
cat_rmse_results = pd.Series(rmses, index=category_powerset, name='RMSEs')
cat_rmse_results.to_csv('data/content-based-validation.csv')

In [11]:
# testing phase
test_predictions = make_predictions(user_data_test, get_similarity_matrix(best_categories), False, 0)
test_predictions_with_fallback = make_predictions(user_data_test, get_similarity_matrix(best_categories), True, mean_userscore_user_data_train_validation)

test_predictions_rmse = compute_rmse(test_predictions, user_data_test)
test_predictions_with_fallback_rmse = compute_rmse(test_predictions_with_fallback, user_data_test)
baseline_rmse = compute_rmse(mean_userscore_user_data_train_validation, user_data_test)
avg_userscore_rmse = compute_rmse(make_game_avg_userscores(user_data_test, mean_userscore_user_data_train_validation), user_data_test)

print('RMSE of Predictions:', test_predictions_rmse)
print('RMSE of Predictions w/ Fallback:', test_predictions_with_fallback_rmse)

print('RMSE of Baseline (Training Mean):', baseline_rmse)
print('RMSE of Game Avg Userscores:', avg_userscore_rmse)

RMSE of Predictions: 2.878735460472741
RMSE of Predictions w/ Fallback: 2.6723915986954228
RMSE of Baseline (Training Mean): 2.746245378070763
RMSE of Game Avg Userscores: 2.510962601606566


Cite our sources: https://towardsdatascience.com/content-based-recommender-systems-28a1dbd858f5
Write up results in paper

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=eb4e4242-5846-4617-af0a-480720b8ff22' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>