In [10]:
import cornac
import numpy as np
import pandas as pd
import papermill as pm
import scrapbook as sb
import warnings
warnings.filterwarnings('ignore')

from recommenders.utils.timer import Timer
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

games = pd.read_csv('Game Recommendation/games.csv')
recommendations = pd.read_csv('Game Recommendation/recommendations.csv')
users = pd.read_csv('Game Recommendation/users.csv')

FM model is only supported on Linux.
Windows executable can be found at http://www.libfm.org.


In [11]:
print(games.shape)
games.head()

(48756, 13)


Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
0,10090,Call of Duty: World at War,2008-11-18,True,False,False,Very Positive,92,37039,19.99,19.99,0.0,True
1,13500,Prince of Persia: Warrior Within™,2008-11-21,True,False,False,Very Positive,84,2199,9.99,9.99,0.0,True
2,22364,BRINK: Agents of Change,2011-08-03,True,False,False,Positive,85,21,2.99,2.99,0.0,True
3,113020,Monaco: What's Yours Is Mine,2013-04-24,True,True,True,Very Positive,92,3722,14.99,14.99,0.0,True
4,226560,Escape Dead Island,2014-11-18,True,False,False,Mixed,61,873,14.99,14.99,0.0,True


As there are 48 thousand unique games, I will use only 10 thousand games because of the memory limit of my computer.

In [12]:
games = games.sample(10000, random_state=7)

In [13]:
print(recommendations.shape)
recommendations.head()

(13406320, 8)


Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,23391,0
1,304390,4,0,2017-02-17,False,11.5,1290,1
2,1085660,2,0,2019-11-17,True,336.5,106678,2
3,703080,0,0,2022-09-23,True,27.4,108848,3
4,526870,0,0,2021-01-10,True,7.9,10695,4


There are 13 million recommendations. I will use only 100 thousand recommendations because of the memory limit of my computer.

In [14]:
recommendations = pd.merge(recommendations, games[['app_id', 'title']], on='app_id', how='left')

# Remove rows where title is null
recommendations = recommendations[~recommendations['title'].isnull()]

recommendations = recommendations.sample(100000, random_state=7)

In [15]:
recommendations.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id,title
5427932,311210,2,0,2016-05-13,True,301.2,2318707,5427932,Call of Duty®: Black Ops III
11829796,1517290,0,0,2022-03-06,False,2.9,1208459,11829796,Battlefield™ 2042
5512603,238960,0,0,2020-01-06,True,172.1,5583881,5512603,Path of Exile
9324410,457140,0,0,2021-06-01,True,24.6,2090971,9324410,Oxygen Not Included
8146213,948740,0,0,2022-06-30,True,4.8,6225495,8146213,AI: The Somnium Files


In [16]:
print(users.shape)
users.head()

(6844127, 3)


Unnamed: 0,user_id,products,reviews
0,6706132,156,1
1,4222883,329,4
2,2274077,176,2
3,2451933,98,2
4,2717184,144,3


In [23]:
X = pd.merge(recommendations.drop(columns=['review_id']), users, on='user_id', how='left')
print(X.shape)
X.head()

(100000, 10)


Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,title,products,reviews
0,311210,2,0,2016-05-13,True,301.2,2318707,Call of Duty®: Black Ops III,142,1
1,1517290,0,0,2022-03-06,False,2.9,1208459,Battlefield™ 2042,4,1
2,238960,0,0,2020-01-06,True,172.1,5583881,Path of Exile,54,3
3,457140,0,0,2021-06-01,True,24.6,2090971,Oxygen Not Included,66,23
4,948740,0,0,2022-06-30,True,4.8,6225495,AI: The Somnium Files,261,11


In [24]:
X.describe()

Unnamed: 0,app_id,helpful,funny,hours,user_id,products,reviews
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,607086.3,3.47778,0.94002,137.673982,3532139.0,205.50213,5.56305
std,434052.8,72.698237,20.95967,205.814187,1948852.0,393.475116,9.357192
min,70.0,0.0,0.0,0.0,18.0,0.0,1.0
25%,304390.0,0.0,0.0,16.0,1945179.0,41.0,1.0
50%,444090.0,0.0,0.0,47.4,3585636.0,101.0,3.0
75%,975370.0,0.0,0.0,155.0,5223206.0,230.0,6.0
max,2167580.0,15815.0,2429.0,999.9,6844101.0,19461.0,322.0


In [25]:
X['rating'] = X['is_recommended'].apply(lambda x: 1 if x == True else -1)

In [20]:
helpful_scaler = MinMaxScaler((0.1, 1))
X['helpful'] = helpful_scaler.fit_transform(X[['helpful']])

X['rating'] = X['is_recommended'] * X['helpful']

In [27]:
X = X[['user_id', 'app_id', 'rating', 'date']]

X.head()

Unnamed: 0,user_id,app_id,rating,date
0,2318707,311210,1,2016-05-13
1,1208459,1517290,-1,2022-03-06
2,5583881,238960,1,2020-01-06
3,2090971,457140,1,2021-06-01
4,6225495,948740,1,2022-06-30


In [28]:
# Train-test split
train, test = python_random_split(X[['user_id', 'app_id', 'rating']], 0.3)
print('Number of users: ', train['user_id'].nunique())
print('Number of items: ', train['app_id'].nunique())

Number of users:  29837
Number of items:  386


In [29]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=7)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 29837
Number of items: 386


# Bayesian Personalized Ranking (BPR)

The implementation of the model is from [Cornac](https://github.com/PreferredAI/cornac), which is a framework for recommender systems with a focus on models leveraging auxiliary data (e.g., item descriptive text and image, social network, etc).

The BPR has a few important parameters that we need to consider:

- `k`: controls the dimension of the latent space (i.e. the size of the vectors  $w_u$  and  $h_i$ ).
- `max_iter`: defines the number of iterations of the SGD procedure.
- `learning_rate`: controls the step size $\alpha$ in the gradient update rules.
- `lambda_reg`: controls the L2-Regularization $\lambda$ in the objective function.

Different values of `k` and `max_iter` will affect the training time.

We will here set `k` to 200, `max_iter` to 100, `learning_rate` to 0.01, and `lambda_reg` to 0.001. To train the model, we simply need to call the `fit()` method.

In [53]:
bpr = cornac.models.BPR(
    k=500,
    max_iter=1000,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=7
)

with Timer() as t:
    bpr.fit(train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/1000 [00:00<?, ?it/s]

Optimization finished!
Took 42.1236 seconds for training.


In [54]:
train = train.iloc[:100, :]
train.dtypes

user_id    int64
app_id     int64
rating     int64
dtype: object

In [55]:
with Timer() as t:
    all_predictions = predict_ranking(bpr, train, usercol='user_id', itemcol='app_id', remove_seen=True)
print("Took {} seconds for prediction.".format(t))

Took 8.7905 seconds for prediction.


In [59]:
all_predictions['user_id'].unique()[-10:]

array([5786562, 3274022,  500738, 5215351, 5013911,  938148, 2730931,
       1258759, 3536758,   10584], dtype=int64)

In [None]:
remove_titles = ['Wallpaper Engine', 
                 'Red Dead Redemption 2', 
                 'Path of Exile', 
                 'Dying Light 2 Stay Human', 
                 'STAR WARS Jedi: Fallen Order Deluxe Edition',
                 "Tom Clancy's Rainbow Six® Siege"
                 ]
all_predictions = pd.merge(all_predictions, games[['app_id', 'title']], on='app_id', how='left')
all_predictions = all_predictions[~all_predictions['title'].isin(remove_titles)]

In [None]:
# Remove alphabets from title
alphabets = ["™", "®", ":", "-"]
for alphabet in alphabets:
    all_predictions['title'] = all_predictions['title'].str.replace(alphabet, "")

In [84]:
# lowercase all titles
all_predictions['title'] = all_predictions['title'].str.lower()

In [72]:
# Recommend to user_id
user_id = 1258759
top_k = 3

# Get the predictions for user_id
user_predictions = all_predictions[all_predictions['user_id'] == user_id].sort_values(by='prediction', ascending=False).iloc[:50, :]
user_predictions.head(20)

Unnamed: 0,user_id,app_id,prediction,title
11515919,1258759,349040,4.847771,NARUTO SHIPPUDEN Ultimate Ninja STORM 4
11515845,1258759,218620,1.885389,PAYDAY 2
11515859,1258759,304390,1.840105,FOR HONOR
11515849,1258759,945360,1.772399,Among Us
11515842,1258759,306130,1.737273,The Elder Scrolls Online
11515881,1258759,444090,1.667641,Paladins
11515864,1258759,570,1.617063,Dota 2
11515871,1258759,1151340,1.393246,Fallout 76
11515856,1258759,1222670,1.339484,The Sims 4
11515838,1258759,1517290,1.263788,Battlefield 2042


In [82]:
# Recommend to user_id
user_id = 3536758
top_k = 3

# Get the predictions for user_id
user_predictions = all_predictions[all_predictions['user_id'] == user_id].sort_values(by='prediction', ascending=False)['title'].values[0]
user_predictions

'HalfLife Alyx'

In [74]:
predictions = all_predictions.copy()
predictions.dtypes
predictions.to_csv('predictions.csv', index=False)

In [83]:
import pandas as pd
predictions = pd.read_csv('predictions.csv')
predictions

Unnamed: 0,user_id,app_id,prediction,title
0,4725289,552990,0.103098,World of Warships
1,4725289,457140,1.009218,Oxygen Not Included
2,4725289,638230,-0.106927,Journey
3,4725289,613100,0.828413,House Flipper
4,4725289,1943950,-0.844804,Escape the Backrooms
...,...,...,...,...
11337966,10584,1406780,-1.389574,Viscerafest
11337967,10584,1023550,-1.437565,Rogue Star Rescue
11337968,10584,1246950,-1.481816,Where in the World is Carmen Sandiego?
11337969,10584,1148730,-1.343003,Errant Kingdom Prologue Part Five


In [4]:
predictions.head(20)

Unnamed: 0.1,Unnamed: 0,user_id,app_id,prediction,title
0,0,4725289,552990,2.214125,World of Warships
1,1,4725289,457140,2.655721,Oxygen Not Included
2,2,4725289,638230,1.443401,Journey
3,3,4725289,613100,2.77703,House Flipper
4,4,4725289,1943950,0.322311,Escape the Backrooms
5,5,4725289,582660,2.727118,Black Desert
6,6,4725289,4700,1.538473,Total War: MEDIEVAL II – Definitive Edition
7,7,4725289,460930,2.505392,Tom Clancy's Ghost Recon® Wildlands
8,8,4725289,466560,2.143895,Northgard
9,9,4725289,794260,1.837508,Outward Definitive Edition


In [5]:
import difflib

# Find most similar title in predictions
def return_similar_title(title):
    try:
        matches = difflib.get_close_matches(title, predictions['title'], n=1, cutoff=0.4)
        print(matches)
        return matches[0]
    except:
        print('No matches found.')
        return None

return_similar_title('Just Cause 3')

['Just Cause™ 3']


'Just Cause™ 3'

In [7]:
# Top user_id for Just Cause 3
predictions.loc[predictions['title'] == "Just Cause™ 3"].sort_values(by='prediction', ascending=False)['user_id'][:1].values[0]

4721485

In [9]:
predictions.loc[predictions['user_id'] == 1107036].sort_values(by='prediction', ascending=False)['title'][:3].values

array(['Wallpaper Engine', 'Red Dead Redemption 2', 'Path of Exile'],
      dtype=object)

In [85]:
predictions.to_csv('predictions.csv', index=False)

In [50]:
entity = pd.DataFrame(recommendations['app_id'].unique(), columns=['app_id'])
entity = pd.merge(entity, games[['app_id', 'title']], on='app_id', how='left')
entity

Unnamed: 0,app_id,title
0,311210,Call of Duty®: Black Ops III
1,1517290,Battlefield™ 2042
2,238960,Path of Exile
3,457140,Oxygen Not Included
4,948740,AI: The Somnium Files
...,...,...
409,1536420,Clip maker
410,1552220,LakeSide
411,550400,Delicious - Emily's Hopes and Fears
412,2088630,Airport Ground Handling Simulator VR


In [61]:
entity.head(20)

Unnamed: 0,app_id,title
0,311210,Call of Duty®: Black Ops III
1,1517290,Battlefield™ 2042
2,238960,Path of Exile
3,457140,Oxygen Not Included
4,948740,AI: The Somnium Files
5,1174180,Red Dead Redemption 2
6,304390,FOR HONOR™
7,349040,NARUTO SHIPPUDEN: Ultimate Ninja STORM 4
8,1057090,Ori and the Will of the Wisps
9,546560,Half-Life: Alyx


In [60]:
sublists = []
for row in entity.itertuples(index=False):
    listKey = str(row[0])
    value = row[1]
    dict2 = {"language": "en-us", "values": []}
    dict2["values"].append(listKey)
    dict2["values"].append(value)
    dict1 = {"listKey": listKey, "synonyms": [dict2]}
    sublists.append(dict1)

import json
json_object = json.dumps(sublists, indent = 4)
json_file = open("entity.json", "w")
json_file.write(json_object)
json_file.close()

In [21]:
games.dtypes

app_id              int64
title              object
date_release       object
win                  bool
mac                  bool
linux                bool
rating             object
positive_ratio      int64
user_reviews        int64
price_final       float64
price_original    float64
discount          float64
steam_deck           bool
dtype: object