# Game RecSys Model Building/Evaluation
Notebook for building and evaluating the ML models behind the game recommendation system

## Load and format data

In [31]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

### Load user-game ratings data
#### Training data

In [2]:
# Read the feature and label halves of the training split; in both files the
# "Unnamed: 0" column is promoted to the row index.
X_train = pd.read_csv("train_test_split/X_train_3k.csv").set_index("Unnamed: 0")
y_train = pd.read_csv("train_test_split/y_train_3k.csv").set_index("Unnamed: 0")

# Pair features with labels on that shared index (checked to be one-to-one),
# then keep only the three modelling columns under a fresh 0..n-1 index.
train_df = (
    X_train
    .merge(y_train, left_index=True, right_index=True, validate="1:1")
    .reset_index(drop=True)
    .loc[:, ["user_id", "item_id", "recommend"]]
    .copy()
)

In [3]:
# Preview the first rows of the merged training ratings (user_id, item_id, recommend).
train_df.head()

Unnamed: 0,user_id,item_id,recommend
0,Drewmatic,8930,1
1,76561198080148447,377160,1
2,AleksoSmeksoHere,342380,1
3,gaboqse,108800,0
4,piedude,215470,1


#### Test data

In [4]:
# Read the feature and label halves of the test split; in both files the
# "Unnamed: 0" column is promoted to the row index.
X_test = pd.read_csv("train_test_split/X_test_3k.csv").set_index("Unnamed: 0")
y_test = pd.read_csv("train_test_split/y_test_3k.csv").set_index("Unnamed: 0")

# Pair features with labels on that shared index (checked to be one-to-one),
# then keep only the three modelling columns under a fresh 0..n-1 index.
test_df = (
    X_test
    .merge(y_test, left_index=True, right_index=True, validate="1:1")
    .reset_index(drop=True)
    .loc[:, ["user_id", "item_id", "recommend"]]
    .copy()
)

In [5]:
# Preview the first rows of the merged test ratings (user_id, item_id, recommend).
test_df.head()

Unnamed: 0,user_id,item_id,recommend
0,sickbubblez,386360,1
1,GetALifeStopLookingAtMyUrl,4000,1
2,kineticvine,1250,1
3,LeoNoHomo,200210,1
4,itsdandytime,4000,1


### Load game metadata

In [6]:
# Load the processed game metadata and index it by game id: the id is stored
# in the "Unnamed: 0" column, which becomes the index and is renamed "item_id".
game_meta = (
    pd.read_csv("train_test_split/processed_metadata.csv")
    .set_index("Unnamed: 0")
    .rename_axis("item_id")
)
game_meta.head()

Unnamed: 0_level_0,early_access,metascore,price,sentiment,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,...,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Turn Notifications,Steam Workshop,SteamVR Collectibles,Tracked Motion Controllers,Valve Anti-Cheat enabled
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0,88.0,9.99,3,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,0,,4.99,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30,0,79.0,4.99,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50,0,,4.99,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60,0,,4.99,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Content-based filtering (CBF)

### Compute game-game similarity matrix

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [8]:
# Impute each missing value with the mean of its own column.
game_meta_cbf = game_meta.fillna(game_meta.mean())

# Min-max scale metascore, price and sentiment so they span 0 - 1.
cbf_scaled_columns = ["metascore", "price", "sentiment"]
scaler = MinMaxScaler().fit(game_meta_cbf[cbf_scaled_columns])
scaled_cbf = scaler.transform(game_meta_cbf[cbf_scaled_columns])
game_meta_cbf[cbf_scaled_columns] = scaled_cbf
game_meta_cbf.head()

Unnamed: 0_level_0,early_access,metascore,price,sentiment,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,...,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Turn Notifications,Steam Workshop,SteamVR Collectibles,Tracked Motion Controllers,Valve Anti-Cheat enabled
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0,0.888889,0.012318,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,0,0.735917,0.005835,0.833333,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30,0,0.763889,0.005835,0.833333,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50,0,0.735917,0.005835,0.833333,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60,0,0.735917,0.005835,0.666667,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# create the similarity matrix using the Pearson correlation coefficient
# `DataFrame.corr` correlates columns pairwise, so the frame (one row per
# item_id) is transposed first to make every game a column; the result is a
# square item_id x item_id frame.
game_similarity_matrix = game_meta_cbf.T.corr(method="pearson")
game_similarity_matrix.head()

item_id,10,20,30,50,60,70,80,130,220,240,...,461560,462930,464780,466910,480631,485380,485890,495890,498240,512540
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,1.0,0.995876,0.996523,0.886343,0.98787,0.903771,0.874333,0.595771,0.345255,0.666608,...,0.168448,0.057541,0.271566,0.398367,0.084417,0.271758,0.16915,0.221772,0.200073,0.403472
20,0.995876,1.0,0.999913,0.890223,0.996834,0.897767,0.882236,0.567854,0.320196,0.664101,...,0.136324,0.035855,0.230239,0.376656,0.06228,0.230463,0.138871,0.184451,0.166047,0.409524
30,0.996523,0.999913,1.0,0.890122,0.996925,0.898915,0.881176,0.570038,0.322738,0.664492,...,0.139112,0.03851,0.233858,0.378615,0.065606,0.234082,0.142606,0.187232,0.169551,0.409397
50,0.886343,0.890223,0.890122,1.0,0.887474,0.992939,0.99726,0.768895,0.423638,0.57695,...,0.28429,0.167338,0.424495,0.321012,0.224906,0.424705,0.327764,0.337321,0.348091,0.466629
60,0.98787,0.996834,0.996925,0.887474,1.0,0.891217,0.878529,0.545535,0.304708,0.658711,...,0.115835,0.026715,0.20376,0.361192,0.056281,0.203988,0.125749,0.157876,0.147367,0.412284


### CBF algorithm
To calculate the rating score for a particular user-game pair, this algorithm does the following:
1. obtain the list of other game ids this user has rated
2. obtain the similarity scores of those games to the game of interest
3. calculate the rating for the game of interest using the following equation: $$r_{ik}=\frac{\sum_{j\neq k}{r_{ij}s_{jk}}}{\sum_{j\neq k}{s_{jk}}}$$ where $r_{ik}$, $r_{ij}$, and $s_{jk}$ are the desired rating of user **i** for game **k**, rating of game **j** by user **i**, and the similarity of game **j** with game **k** respectively.
4. If no similarities are available for calculating $r_{ik}$, the algorithm just returns the average rating of game **k** by all users

In [38]:
# CBF algorithm
def cbf_rate(user, item, data, similarity):
    """Predict the rating of one game by one user with content-based filtering.

    The prediction is the similarity-weighted mean of the user's ratings for
    all *other* games found in ``data``:

        r_ik = sum_{j != k}(r_ij * s_jk) / sum_{j != k}(s_jk)

    Parameters
    ----------
    user : value looked up in ``data["user_id"]``
    item : value looked up in ``data["item_id"]`` and in the columns of
        ``similarity``
    data : DataFrame of known ratings with the columns ``user_id``,
        ``item_id`` and ``recommend`` (the training set)
    similarity : DataFrame of game-game similarities whose index and columns
        are item ids

    Returns
    -------
    float
        The weighted prediction, or the fallback rating when no prediction
        can be computed (``item`` missing from ``similarity``, no other
        ratings by ``user``, no usable similarities, or similarities that sum
        to zero). The fallback is the mean rating of ``item`` in ``data``,
        the mean of all ratings in ``data`` if ``item`` was never rated, and
        NaN if ``data`` is empty.
    """
    # Fallback rating: per-item mean, degrading to the global mean so that an
    # item without any rating in `data` does not produce NaN.
    item_ratings = data.loc[data["item_id"] == item, "recommend"].values
    if len(item_ratings) > 0:
        r_avg = np.average(item_ratings)
    elif len(data) > 0:
        r_avg = np.average(data["recommend"].values)
    else:
        r_avg = np.nan

    # Similarities of every game to the game of interest.
    try:
        game_s = similarity[item].rename("similarity").to_frame()
    except KeyError:
        # no similarity data available for this game
        return r_avg

    # Ratings by this user for all items except the item of interest.
    rated_elsewhere = (data["user_id"] == user) & (data["item_id"] != item)
    game_r = data.loc[rated_elsewhere, ["item_id", "recommend"]].set_index("item_id")
    if len(game_r) == 0:
        return r_avg

    # Keep only the rated games that also have a similarity entry.
    game_r_s = game_r.merge(game_s, left_index=True, right_index=True)
    ratings = game_r_s["recommend"].values.astype(float)
    sims = game_r_s["similarity"].values.astype(float)

    # Undefined (NaN/inf) similarities carry no information; drop them
    # instead of letting them poison the weighted mean.
    usable = np.isfinite(sims)
    ratings, sims = ratings[usable], sims[usable]

    # Pearson similarities can be negative, so the weights may cancel out;
    # dividing by (near) zero would return inf/NaN.
    weight_sum = np.sum(sims)
    if len(sims) == 0 or np.isclose(weight_sum, 0.0):
        return r_avg
    return np.dot(ratings, sims) / weight_sum
    
def cbf_predict(data, training_data, similarity):
    """Predict ratings for every user-item pair in ``data``.

    Parameters
    ----------
    data : DataFrame with ``user_id`` and ``item_id`` columns; one prediction
        is made per row
    training_data : DataFrame of known ratings handed to ``cbf_rate``
    similarity : game similarity matrix handed to ``cbf_rate``

    Returns
    -------
    DataFrame
        A copy of the ``user_id`` and ``item_id`` columns of ``data`` (same
        index) with the predictions in an added ``r_pred`` column.
    """
    # Work on an explicit copy: assigning into a column slice of the caller's
    # frame triggers pandas' SettingWithCopyWarning.
    user_item = data[["user_id", "item_id"]].copy()
    pairs = zip(user_item["user_id"], user_item["item_id"])
    # Collect the predictions and attach them in one step; this is positional,
    # so it also stays correct when the index holds duplicate labels.
    predictions = [
        cbf_rate(user, item, training_data, similarity)
        for user, item in tqdm(pairs, total=len(user_item))
    ]
    user_item["r_pred"] = pd.Series(predictions, index=user_item.index, dtype=float)
    return user_item
        

### CBF evaluation
Evaluate the CBF algorithm on both the training and test sets and compare with the baseline models

In [19]:
from sklearn.metrics import mean_squared_error

In [39]:
# Score the CBF predictions on the training pairs themselves; the known
# ratings also come from train_df.
train_cbf_pred = cbf_predict(
    data=train_df,
    training_data=train_df,
    similarity=game_similarity_matrix,
)
train_cbf_mse = mean_squared_error(
    y_true=train_df["recommend"],
    y_pred=train_cbf_pred["r_pred"],
)
print("MSE: %.4f" % train_cbf_mse)

HBox(children=(IntProgress(value=0, max=24969), HTML(value='')))


MSE: 0.1284


In [40]:
# Score the CBF predictions on the held-out test pairs; the known ratings
# still come from train_df only.
test_cbf_pred = cbf_predict(
    data=test_df,
    training_data=train_df,
    similarity=game_similarity_matrix,
)
test_cbf_mse = mean_squared_error(
    y_true=test_df["recommend"],
    y_pred=test_cbf_pred["r_pred"],
)
print("MSE: %.4f" % test_cbf_mse)

HBox(children=(IntProgress(value=0, max=8323), HTML(value='')))


MSE: 0.1181


Not better than the baseline model!

## Collaborative filtering (CF) using deep learning 

### Assemble the full game metadata set
Also include the game ids that have no metadata available. These models cannot handle missing values, so the gaps are filled in below.

In [41]:
# One row per distinct game in the training ratings, in order of first
# appearance, as a column-less frame indexed by item_id.
game_ids = train_df[["item_id"]].drop_duplicates().set_index("item_id")

# Left join keeps every training game; games missing from game_meta get NaN
# in all metadata columns.
full_meta = game_ids.merge(game_meta, how="left", left_index=True, right_index=True)
full_meta.shape

(1346, 62)

#### Fill NaN:s and normalize values in `metascore`, `price`, and `sentiment` columns

In [42]:
# Impute each missing value with the mean of its own column.
full_meta = full_meta.fillna(full_meta.mean())

# Min-max scale metascore, price and sentiment (three columns) to 0 - 1.
scaler = MinMaxScaler()
scaler.fit(full_meta[["metascore", "price", "sentiment"]])
scaled = scaler.transform(full_meta[["metascore", "price", "sentiment"]])
full_meta[["metascore", "price", "sentiment"]] = scaled

full_meta.head()

Unnamed: 0_level_0,early_access,metascore,price,sentiment,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,...,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Turn Notifications,Steam Workshop,SteamVR Collectibles,Tracked Motion Controllers,Valve Anti-Cheat enabled
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8930,0.0,0.916667,0.038251,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
377160,0.0,0.833333,0.038251,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342380,0.050633,0.735917,0.021977,0.764838,0.594991,0.303972,0.004318,0.000864,0.124352,0.001727,...,0.119898,0.637755,0.417517,0.213435,0.52551,0.002551,0.142857,0.00085,0.005952,0.064626
108800,0.0,0.735917,0.038251,0.833333,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215470,0.0,0.597222,0.009725,0.666667,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Recreate user-item indexes (needed for embedding layers)

In [64]:
# Give every distinct training user a consecutive integer id (user_idx),
# numbered in order of first appearance.
unique_users = (
    train_df["user_id"]
    .drop_duplicates()
    .rename("user_name")
    .reset_index(drop=True)
    .rename_axis("user_idx")
    .reset_index()
)

# Same for every distinct training game (item_idx); the game's own id is
# kept in the original_id column.
unique_items = (
    train_df["item_id"]
    .drop_duplicates()
    .rename("original_id")
    .reset_index(drop=True)
    .rename_axis("item_idx")
    .reset_index()
)


In [66]:
# join the user/item indexes and the game metadata back on a ratings set
def assemble_reindexed_dataset(df, users=None, items=None, meta=None):
    """Replace raw user/item ids with embedding indexes and add game metadata.

    Parameters
    ----------
    df : DataFrame with ``user_id`` and ``item_id`` columns plus any further
        columns (e.g. ``recommend``), which are kept
    users : DataFrame with ``user_idx`` and ``user_name`` columns; defaults to
        the notebook-level ``unique_users``
    items : DataFrame with ``item_idx`` and ``original_id`` columns; defaults
        to the notebook-level ``unique_items``
    meta : DataFrame of game metadata keyed by ``item_id``; defaults to the
        notebook-level ``full_meta``

    Returns
    -------
    DataFrame
        ``df`` without ``user_id`` / ``item_id``, followed by ``user_idx``,
        ``item_idx`` and the metadata columns. All joins are left joins, so a
        user or item missing from the lookup tables yields NaN in the
        corresponding columns instead of dropping the row.
    """
    users = unique_users if users is None else users
    items = unique_items if items is None else items
    meta = full_meta if meta is None else meta

    # attach the integer user index
    df_user_idx = pd.merge(df, users, how="left",
                           left_on="user_id",
                           right_on="user_name").drop(columns="user_name")
    # attach the integer item index
    df_idx = pd.merge(df_user_idx, items, how="left",
                      left_on="item_id",
                      right_on="original_id").drop(columns="original_id")
    # join with game metadata
    df_full_idx = pd.merge(df_idx, meta, how="left",
                           left_on="item_id",
                           right_on="item_id")
    # Drop the raw ids by name rather than by position, so the result does
    # not depend on the column order of `df`.
    return df_full_idx.drop(columns=["user_id", "item_id"])


In [67]:
# Build the model-ready frames for both splits using the same lookup tables,
# which were derived from train_df only.
# NOTE(review): the lookups are left-joined, so a test row whose user or item
# never occurs in train_df gets NaN for user_idx / item_idx — confirm this
# cannot happen before feeding these indexes to an embedding layer.
train_full_idx = assemble_reindexed_dataset(train_df)
test_full_idx = assemble_reindexed_dataset(test_df)

In [70]:
# Preview the re-indexed training frame: recommend, user_idx, item_idx, then the metadata columns.
train_full_idx.head()

Unnamed: 0,recommend,user_idx,item_idx,early_access,metascore,price,sentiment,Action,Adventure,Animation &amp; Modeling,...,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Turn Notifications,Steam Workshop,SteamVR Collectibles,Tracked Motion Controllers,Valve Anti-Cheat enabled
0,1,0,0,0.0,0.916667,0.038251,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,1,1,0.0,0.833333,0.038251,0.5,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,2,2,0.050633,0.735917,0.021977,0.764838,0.594991,0.303972,0.004318,...,0.119898,0.637755,0.417517,0.213435,0.52551,0.002551,0.142857,0.00085,0.005952,0.064626
3,0,3,3,0.0,0.735917,0.038251,0.833333,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,4,4,0.0,0.597222,0.009725,0.666667,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Basic user-item embedding model

In [74]:
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Flatten, concatenate, dot
import keras.backend as K

In [98]:
# define model variables
# One embedding row per distinct training user / game; both embeddings have
# 16 dimensions so that their dot product is defined.
n_users = len(unique_users)
n_items = len(unique_items)
embedding_size_users = 16
embedding_size_items = 16

# create user path
# integer user index -> 16-d embedding, flattened from (1, 16) to (16,)
user_input = Input(shape=(1,), name="user")
user_embeddings = Embedding(n_users, embedding_size_users, name="user_embeddings")(user_input)
user_flat = Flatten(name="user_flattened")(user_embeddings)

# create item path
# integer item index -> 16-d embedding, flattened from (1, 16) to (16,)
item_input = Input(shape=(1,), name="item")
item_embeddings = Embedding(n_items, embedding_size_items, name="item_embeddings")(item_input)
item_flat = Flatten(name="item_flattened")(item_embeddings)

# combine the two paths
# the predicted rating is the dot product of the user and item vectors
pred = dot([user_flat, item_flat], 1, name="predicted_ratings")

# compile the model
# NOTE(review): despite the `cbf_` prefix this model sits in the
# collaborative-filtering section and only sees user/item indexes; the name
# is kept because the following cells reference it.
cbf_model = Model(inputs=[user_input, item_input], outputs=pred)
cbf_model.compile("adam", loss="mse", metrics=["mse"])


In [99]:
# Pull the model inputs (user_idx, item_idx) and the target (recommend) out
# of the re-indexed frames as NumPy arrays.
# Note: these names overwrite the X_/y_ DataFrames loaded at the top of the
# notebook, and y_test is 2-D (n, 1) while y_train is 1-D (n,).
X_train = train_full_idx[["user_idx", "item_idx"]].values
y_train = train_full_idx["recommend"].values
X_test = test_full_idx[["user_idx", "item_idx"]].values
y_test = test_full_idx[["recommend"]].values

In [101]:
# Print the layer-by-layer architecture and parameter counts of the embedding model.
cbf_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
user_embeddings (Embedding)     (None, 1, 16)        110496      user[0][0]                       
__________________________________________________________________________________________________
item_embeddings (Embedding)     (None, 1, 16)        21536       item[0][0]                       
__________________________________________________________________________________________________
user_flatt

In [102]:
# Train for a few epochs. Column 0 of X_train holds the user index and
# column 1 the item index; each is passed as an (n, 1) array to the input
# layer of the same name. 20% of the rows are held out for validation.
train_inputs = {
    "user": X_train[:, 0:1],
    "item": X_train[:, 1:2],
}
train_targets = y_train.reshape(-1, 1)
cbf_model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=16,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Train on 19975 samples, validate on 4994 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


Epoch 5/10
Epoch 6/10
Epoch 7/10


Epoch 8/10
Epoch 9/10


Epoch 10/10


<keras.callbacks.History at 0x22506a64be0>

## Hybrid methods