In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from itertools import combinations
from math import sqrt
import numpy as np

In [2]:
game_data = pd.read_csv('data/game_info.csv', index_col=0)
user_data_train = pd.read_csv('data/user_data_train_no_comments.csv', index_col=0)
user_data_test = pd.read_csv('data/user_data_test_no_comments.csv', index_col=0)
user_data_validation = pd.read_csv('data/user_data_validation_no_comments.csv', index_col=0)
user_data_train_mean_userscore = user_data_train['Userscore'].mean()

In [3]:
game_data

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player
3,SoulCalibur,1999,Namco,Action;Fighting;3D,Dreamcast,98,8.6,1-2
4,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,Xbox360,98,7.9,1 Player
...,...,...,...,...,...,...,...,...
4995,Donut County,2018,BenEsposito,Action Adventure;General,PC,77,8.1,No Online Multiplayer
4996,MotorStorm: Apocalypse,2011,EvolutionStudios,Driving;Racing;Simulation;Rally / Offroad;Rall...,PlayStation3,77,7.7,4 Online
4997,The Last Guy,2008,SCEJapanStudio,Action Adventure;Sci-Fi;Sci-Fi;General,PlayStation3,77,6.8,1 Player
4998,Valiant Hearts: The Great War,2014,UbisoftMontpellier,Platformer;2D;Action;Platformer;2D,PlayStation4,77,8.4,not specified


In [4]:
user_data_validation

Unnamed: 0_level_0,Userscore,Username,Game_ID
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100920,3,AlanG.,474
106599,10,AlanG.,532
106906,9,AlanG.,532
197380,9,AlanG.,1525
10713,10,TonyM.,20
...,...,...,...
242135,10,AlexN.,2375
258319,9,AlexN.,2834
17628,9,zenmechanic,33
32734,8,zenmechanic,70


In [5]:
# information consists of game title, genres, platform, and number of players (no_players)
# game_info0 = game_data['Genre'].apply(lambda s: s.split(';'))
# game_info1 = game_data['Genre'].apply(lambda s: s.split(';')) + game_data['No_Players'].apply(lambda s: [str(s)])
# game_info2 = game_data['Publisher'].apply(lambda s: [s]) + game_data['Genre'].apply(lambda s: s.split(';'))
# game_info2 = game_data['Genre'].apply(lambda s: s.split(';')) + game_data['Platform'].apply(lambda s: [s])
# game_info3 = game_data['Title'].apply(lambda s: s.split()) + game_data['Publisher'].apply(lambda s: [s]) + game_data['Genre'].apply(lambda s: s.split(';'))
# game_info4 = game_data['Title'].apply(lambda s: s.split()) + game_data['Publisher'].apply(lambda s: [s]) + game_data['Genre'].apply(lambda s: s.split(';')) + game_data['Platform'].apply(lambda s: [s])
# game_info5 = game_data['Title'].apply(lambda s: s.split()) + game_data['Publisher'].apply(lambda s: [s]) + game_data['Genre'].apply(lambda s: s.split(';')) + game_data['Platform'].apply(lambda s: [s]) + game_data['No_Players'].apply(lambda s: [str(s)])
game_info = game_data['Title'].apply(lambda s: s.split()) + game_data['Publisher'].apply(lambda s: [s]) + game_data['Genre'].apply(lambda s: s.split(';')) + game_data['Platform'].apply(lambda s: [s]) + game_data['No_Players'].apply(lambda s: [str(s)])
# game_info = game_data['Title'].apply(lambda s: s.split()) + game_data['Publisher'].apply(lambda s: [s]) + game_data['Genre'].apply(lambda s: s.split(';')) + game_data['Platform'].apply(lambda s: [s]) + game_data['No_Players'].apply(lambda s: [str(s)])


# Analyzer will strip non-alphanumeric characters and ignore case
tf = TfidfVectorizer(analyzer=lambda i: map(lambda s: ''.join(filter(str.isalnum, s.lower())), i))
tfidf_matrix = tf.fit_transform(game_info)

# pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=game_data['Title'])

In [6]:
cosine_sim = cosine_similarity(tfidf_matrix)
# cosine_sim

In [7]:
def predict_rating(rating, dataset):

    # print("Queried game")
    # print(game_data.iloc[rating.Game_ID])

    # Get all ratings from the user
    user_ratings = dataset[dataset['Username'] == rating.Username]

    # Make a prediction. Prediction is a weighted average of the user's ratings of other games, weighted by similarity
    prediction = 0
    normalization_factor = 0
    for index, user_rating in user_ratings.iterrows():

        # Ignore the rating we are trying to predict
        if index == rating.name:
            continue

        similarity = cosine_sim[user_rating.Game_ID, rating.Game_ID]
        
        # print("\nSimilar Game Rated by the Same User")
        # print(game_data.iloc[user_rating.Game_ID])

        # print("\nuser's score:", user_rating.Userscore)
        # print("similarity:", similarity)

        prediction += user_rating.Userscore * similarity
        normalization_factor += similarity

    # If the user has not reviewed any similar game, then predict the average userscore of the queried game
    if not normalization_factor:
        game_avg_userscore = pd.to_numeric(game_data.loc[rating.Game_ID].Avg_Userscore, errors='coerce')
        if game_avg_userscore == np.nan:
            prediction = user_data_train_mean_userscore
        else:
            prediction = game_avg_userscore
    else:
        # normalize the predicted rating
        prediction = prediction / normalization_factor

    # Predicted rating after normalization minus actual score
    # print('\nResults')
    # print('prediction:', prediction)
    # print('actual:', rating.Userscore)
    return prediction

In [8]:
dataset = user_data_test
predictions = dataset.apply(lambda r: predict_rating(r, dataset), axis=1)

In [9]:
avg_userscores = dataset.apply(lambda r: game_data.loc[r['Game_ID']]['Avg_Userscore'], axis=1)
avg_userscores = pd.to_numeric(avg_userscores, errors='coerce')
avg_userscores = avg_userscores.replace(np.nan, user_data_train_mean_userscore)

In [10]:
predictions_rmse = ((predictions - dataset['Userscore'])**2).mean()**0.5
baseline_rmse = ((user_data_train_mean_userscore - dataset['Userscore'])**2).mean()**0.5
avg_userscore_rmse = ((avg_userscores - dataset['Userscore'])**2).mean()**0.5
print('RMSE of Predictions:', predictions_rmse)
print('RMSE of Training Mean:', baseline_rmse)
print('RMSE of Game Avg Userscores:', avg_userscore_rmse)

RMSE of Predictions: 2.6723915986954228
RMSE of Training Mean: 2.7464267971419734
RMSE of Game Avg Userscores: 2.5109615650812427


In [35]:
user_name_list = dataset['Username'].drop_duplicates().reset_index(drop=True)

threshold = 5 # threshold userscore for the game to be considered a 'positive' recommendation
positive_predictions = dataset[predictions > threshold]
positive_ratings = dataset[dataset['Userscore'] > threshold]

ratio = 0
for user in user_name_list:
    # get list of ratings from the user whose score was positive
    user_positive_ratings = positive_ratings[positive_ratings['Username'] == user]

    # of the user's positive ratings, which of our predictions for the same games were also positive?
    positive_coverage = positive_predictions.index.intersection(user_positive_ratings.index)

    if len(user_positive_ratings) != 0:
        ratio += len(positive_coverage) / len(user_positive_ratings)
ratio /= len(user_name_list)
print('Average over all users the ratio of games recommended to them among the games the user rated positively versus the number of games the user rated positively:', ratio)
# This is much higher than that of the results of https://audreygermain.github.io/Game-Recommendation-System/#content-based
# however this is because we are able to give predictions to users whom we have no data of.

Average over all users the ratio of games recommended to them among the games the user rated positively versus the number of games the user rated positively: 0.8405922951940865


Cite our sources: https://towardsdatascience.com/content-based-recommender-systems-28a1dbd858f5
Write up results in paper

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=eb4e4242-5846-4617-af0a-480720b8ff22' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>