In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from itertools import combinations
from math import sqrt
import numpy as np

In [2]:
game_data = pd.read_csv('data/game_info.csv', index_col=0)
user_data_train = pd.read_csv('data/user_data_train_no_comments.csv', index_col=0)
user_data_test = pd.read_csv('data/user_data_test_no_comments.csv', index_col=0)
user_data_train_mean_userscore = user_data_train['Userscore'].mean()

In [3]:
game_data

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player
3,SoulCalibur,1999,Namco,Action;Fighting;3D,Dreamcast,98,8.6,1-2
4,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,Xbox360,98,7.9,1 Player
...,...,...,...,...,...,...,...,...
4995,Donut County,2018,BenEsposito,Action Adventure;General,PC,77,8.1,No Online Multiplayer
4996,MotorStorm: Apocalypse,2011,EvolutionStudios,Driving;Racing;Simulation;Rally / Offroad;Rall...,PlayStation3,77,7.7,4 Online
4997,The Last Guy,2008,SCEJapanStudio,Action Adventure;Sci-Fi;Sci-Fi;General,PlayStation3,77,6.8,1 Player
4998,Valiant Hearts: The Great War,2014,UbisoftMontpellier,Platformer;2D;Action;Platformer;2D,PlayStation4,77,8.4,not specified


In [4]:
user_data_test

Unnamed: 0_level_0,Userscore,Username,Game_ID
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122104,0,Clyton,672
274573,0,Clyton,3154
101981,9,AP,476
169777,9,AP,1122
219369,9,AP,1914
...,...,...,...
277709,0,triggareloaded,3260
117037,1,Makarash,618
187399,1,Makarash,1376
28463,10,quasar44,57


In [5]:
# information consists of game title, genres, platform, and number of players (no_players)
game_info = game_data['Title'].apply(lambda s: s.split()) + game_data['Genre'].apply(lambda s: s.split(';')) + game_data['Platform'].apply(lambda s: [s]) + game_data['No_Players'].apply(lambda s: [str(s)])
# Analyzer will strip non-alphanumeric characters and ignore case
tf = TfidfVectorizer(analyzer=lambda i: map(lambda s: ''.join(filter(str.isalnum, s.lower())), i))
tfidf_matrix = tf.fit_transform(game_info)
tfidf_matrix

<5000x3411 sparse matrix of type '<class 'numpy.float64'>'
	with 45344 stored elements in Compressed Sparse Row format>

In [6]:
pd.DataFrame(tfidf_matrix.todense(), columns=tf.get_feature_names(), index=game_data['Title'])

Unnamed: 0_level_0,Unnamed: 1_level_0,0,007,012,04,06,07,08,09,1,...,zin,zodiac,zodiarcs,zombie,zombies,zone,zumas,zur,zwei,zx
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Legend of Zelda: Ocarina of Time,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tony Hawk's Pro Skater 2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grand Theft Auto IV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SoulCalibur,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grand Theft Auto IV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Donut County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MotorStorm: Apocalypse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Last Guy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Valiant Hearts: The Great War,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.07073758, ..., 0.1093758 , 0.0274612 ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07073758, 0.        , 1.        , ..., 0.13030605, 0.        ,
        0.        ],
       ...,
       [0.1093758 , 0.        , 0.13030605, ..., 1.        , 0.03134997,
        0.03712057],
       [0.0274612 , 0.        , 0.        , ..., 0.03134997, 1.        ,
        0.02176724],
       [0.        , 0.        , 0.        , ..., 0.03712057, 0.02176724,
        1.        ]])

In [8]:
def predict_rating(query_index):
    # Get the query rating
    query = user_data_test.loc[query_index]
    query_game_id = query['Game_ID']

    # print("Queried game")
    # print(game_data.iloc[query_game_id])

    # Get all ratings from the user
    user_ratings = user_data_test[user_data_test['Username'] == query['Username']]

    # Make a prediction. Prediction is a weighted average of the user's ratings of other games, weighted by similarity
    prediction = 0
    normalization_factor = 0
    for index, rating in user_ratings.iterrows():
        # Do not factor in the queried rating into our prediction (that's cheating!)
        if index == query_index:
            continue

        rating_game_id = rating['Game_ID']
        cos_sim = cosine_sim[rating_game_id, query_game_id]
        
        # print("\nSimilar Game Rated by the Same User")
        # print(game_data.iloc[rating_game_id])

        # print("\nrating['Userscore']:",rating['Userscore'])
        # print("cos_sim", cos_sim)

        prediction += rating['Userscore'] * cos_sim
        normalization_factor += cos_sim

    # If the user has not reviewed any similar game, then predict the average userscore of the queried game
    if not normalization_factor:
        game_avg_userscore = game_data.iloc[query_game_id]['Avg_Userscore']
        if game_avg_userscore.isnumeric():
            prediction = float(game_avg_userscore)
        else: # avg_userscore of the game is not available, predict the mean userscore of the entire training set
            prediction = user_data_train_mean_userscore
    else:
        # normalize the predicted rating
        prediction = prediction / normalization_factor

    # Predicted rating after normalization minus actual score
    # print('\nResults')
    # print('prediction:', prediction)
    # print('actual:', query['Userscore'])
    return prediction

In [9]:
predictions = user_data_test.index.to_series().apply(lambda i: predict_rating(i))

In [10]:
predictions_rmse = ((predictions - user_data_test['Userscore'])**2).mean()**0.5
baseline_rmse = ((user_data_train_mean_userscore - user_data_test['Userscore'])**2).mean()**0.5
print('RMSE of Predictions:', predictions_rmse)
print('RMSE of baseline (mean of all userscores in training data):', baseline_rmse)

RMSE of Predictions: 2.938824629471649
RMSE of baseline (mean of all userscores in training data): 2.8048801329706254


Cite our sources: https://towardsdatascience.com/content-based-recommender-systems-28a1dbd858f5
Write up results in paper

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=eb4e4242-5846-4617-af0a-480720b8ff22' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>