# Game RecSys Model Building
In this notebook, I build several models of user-game ratings and test their performance against the baseline models developed using the `surprise` package   

## Load and format data

In [1]:
import numpy as np
import pandas as pd
import gzip
from functools import reduce
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

In [2]:
GAME_METADATA_PATH = "data/steam_games.json.gz"

### Load user-game ratings data
#### Training data

In [3]:
X_train = pd.read_csv("train_test_split/X_train_3k.csv")
X_train = X_train.set_index("Unnamed: 0")
y_train = pd.read_csv("train_test_split/y_train_3k.csv")
y_train = y_train.set_index("Unnamed: 0")
# join and reset index
train_df = pd.merge(X_train, y_train, left_index=True, right_index=True, validate="1:1")
train_df = train_df.reset_index(drop=True)[["user_id", "item_id", "recommend"]].copy()

In [4]:
train_df.head()

Unnamed: 0,user_id,item_id,recommend
0,Drewmatic,8930,1
1,76561198080148447,377160,1
2,AleksoSmeksoHere,342380,1
3,gaboqse,108800,0
4,piedude,215470,1


#### Test data

In [5]:
X_test = pd.read_csv("train_test_split/X_test_3k.csv")
X_test = X_test.set_index("Unnamed: 0")
y_test = pd.read_csv("train_test_split/y_test_3k.csv")
y_test = y_test.set_index("Unnamed: 0")
# join and reset index
test_df = pd.merge(X_test, y_test, left_index=True, right_index=True, validate="1:1")
test_df = test_df.reset_index(drop=True)[["user_id", "item_id", "recommend"]].copy()

In [6]:
test_df.head()

Unnamed: 0,user_id,item_id,recommend
0,sickbubblez,386360,1
1,GetALifeStopLookingAtMyUrl,4000,1
2,kineticvine,1250,1
3,LeoNoHomo,200210,1
4,itsdandytime,4000,1


### Load and format game metadata

In [7]:
def parse_gzipped_all(path):
    """reads the gzipped input data file in path
    returns - a list of dictionaries"""
    with gzip.open(path, "r") as g:
        data = [eval(l) for l in g.readlines()]
    return data

def parse_data_all(path):
    """reads the input data file in path
    returns - a list of dictionaries"""
    with open(path, "r", errors="ignore") as f:
        data = [eval(l) for l in f.readlines()]
    return data


#### Load *Steam* game metadata

In [13]:
game_meta = parse_gzipped_all(GAME_METADATA_PATH)
game_meta_df = pd.DataFrame.from_records(game_meta).dropna(subset=["id"])
game_meta_df["id"] = game_meta_df["id"].astype(int)
game_meta_df = game_meta_df.set_index("id")
game_meta_df.head()

Unnamed: 0_level_0,app_name,developer,discount_price,early_access,genres,metascore,price,publisher,release_date,reviews_url,sentiment,specs,tags,title,url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
761140,Lost Summoner Kitty,Kotoshiro,4.49,False,"[Action, Casual, Indie, Simulation, Strategy]",,4.99,Kotoshiro,2018-01-04,http://steamcommunity.com/app/761140/reviews/?...,,[Single-player],"[Strategy, Action, Indie, Casual, Simulation]",Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...
643980,Ironbound,Secret Level SRL,,False,"[Free to Play, Indie, RPG, Strategy]",,Free To Play,"Making Fun, Inc.",2018-01-04,http://steamcommunity.com/app/643980/reviews/?...,Mostly Positive,"[Single-player, Multi-player, Online Multi-Pla...","[Free to Play, Strategy, Indie, RPG, Card Game...",Ironbound,http://store.steampowered.com/app/643980/Ironb...
670290,Real Pool 3D - Poolians,Poolians.com,,False,"[Casual, Free to Play, Indie, Simulation, Sports]",,Free to Play,Poolians.com,2017-07-24,http://steamcommunity.com/app/670290/reviews/?...,Mostly Positive,"[Single-player, Multi-player, Online Multi-Pla...","[Free to Play, Simulation, Sports, Casual, Ind...",Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...
767400,弹炸人2222,彼岸领域,0.83,False,"[Action, Adventure, Casual]",,0.99,彼岸领域,2017-12-07,http://steamcommunity.com/app/767400/reviews/?...,,[Single-player],"[Action, Adventure, Casual]",弹炸人2222,http://store.steampowered.com/app/767400/2222/
773570,Log Challenge,,1.79,False,,,2.99,,,http://steamcommunity.com/app/773570/reviews/?...,,"[Single-player, Full controller support, HTC V...","[Action, Indie, Casual, Sports]",,http://store.steampowered.com/app/773570/Log_C...


#### Keep only game ID:s that are in the training set

In [14]:
game_ids = pd.Series(train_df["item_id"].unique(), name="item_id").to_frame().set_index("item_id")
game_meta_train = game_ids.merge(game_meta_df, how="inner", left_index=True, right_index=True)
game_meta_train.head()

Unnamed: 0,app_name,developer,discount_price,early_access,genres,metascore,price,publisher,release_date,reviews_url,sentiment,specs,tags,title,url
10,Counter-Strike,Valve,,False,[Action],88.0,9.99,Valve,2000-11-01,http://steamcommunity.com/app/10/reviews/?brow...,Overwhelmingly Positive,"[Multi-player, Valve Anti-Cheat enabled]","[Action, FPS, Multiplayer, Shooter, Classic, T...",Counter-Strike,http://store.steampowered.com/app/10/CounterSt...
20,Team Fortress Classic,Valve,,False,[Action],,4.99,Valve,1999-04-01,http://steamcommunity.com/app/20/reviews/?brow...,Very Positive,"[Multi-player, Valve Anti-Cheat enabled]","[Action, FPS, Multiplayer, Classic, Shooter, C...",Team Fortress Classic,http://store.steampowered.com/app/20/Team_Fort...
30,Day of Defeat,Valve,,False,[Action],79.0,4.99,Valve,2003-05-01,http://steamcommunity.com/app/30/reviews/?brow...,Very Positive,"[Multi-player, Valve Anti-Cheat enabled]","[FPS, World War II, Multiplayer, Action, Shoot...",Day of Defeat,http://store.steampowered.com/app/30/Day_of_De...
50,Half-Life: Opposing Force,Gearbox Software,,False,[Action],,4.99,Valve,1999-11-01,http://steamcommunity.com/app/50/reviews/?brow...,Very Positive,"[Single-player, Multi-player, Valve Anti-Cheat...","[FPS, Action, Sci-fi, Singleplayer, Classic, S...",Half-Life: Opposing Force,http://store.steampowered.com/app/50/HalfLife_...
60,Ricochet,Valve,,False,[Action],,4.99,Valve,2000-11-01,http://steamcommunity.com/app/60/reviews/?brow...,Mostly Positive,"[Multi-player, Valve Anti-Cheat enabled]","[Action, FPS, Multiplayer, First-Person, Cyber...",Ricochet,http://store.steampowered.com/app/60/Ricochet/


#### Clean and re-format the dataset

In [17]:
# drop columns we will not use
game_meta_train.drop(columns=["title", "url", "discount_price", "release_date", "reviews_url"], inplace=True)
game_meta_train.head()

In [None]:
# convert data types
num_cols = ["metascore", "price"]
game_meta_train[num_cols] = game_meta_train[num_cols].apply(pd.to_numeric, errors='coerce')
game_meta_train["early_access"] = game_meta_train["early_access"].astype(int)

In [61]:
# map sentiment to ordinal values
map_sentiment = {
    "Overwhelmingly Negative":-3,
    "Very Negative":-2,
    "Mostly Negative":-1,
    "Mixed": 0,
    "Mostly Positive": 1,
    "Very Positive": 2,
    "Overwhelmingly Positive":3
}
game_meta_train["sentiment"].replace(map_sentiment, inplace=True)

In [87]:
# drop columns that result in a too large feature set
game_meta_base = game_meta_train.drop(columns=["app_name", "developer", "publisher", "tags"])
game_meta_base.head()

Unnamed: 0,early_access,genres,metascore,price,sentiment,specs
10,0,[Action],88.0,9.99,3,"[Multi-player, Valve Anti-Cheat enabled]"
20,0,[Action],,4.99,2,"[Multi-player, Valve Anti-Cheat enabled]"
30,0,[Action],79.0,4.99,2,"[Multi-player, Valve Anti-Cheat enabled]"
50,0,[Action],,4.99,2,"[Single-player, Multi-player, Valve Anti-Cheat..."
60,0,[Action],,4.99,1,"[Multi-player, Valve Anti-Cheat enabled]"


#### Encode multi-label categorical features (Multi-label binarization) 

In [67]:
from sklearn.preprocessing import MultiLabelBinarizer

In [82]:
def encode_multi(df, col):
    """encodes a single column"""
    mlb = MultiLabelBinarizer()
    # first drop nans
    df = df[col].dropna()
    # encode feature list
    encoded = mlb.fit_transform(df)
    encoded_df = pd.DataFrame(data=encoded, columns=mlb.classes_, index=df.index)
    return encoded_df

def encode_all(df, cols):
    """encodes all columns and combines
    the results into a single df"""
    df = df.copy(deep=True)
    for c in cols:
        encoded_col = encode_multi(df, c)
        df = pd.merge(df, encoded_col, how="left", left_index=True, right_index=True).drop(columns=c)
    return df


In [88]:
game_meta_encoded = encode_all(game_meta_base, ["genres", "specs"])

In [104]:
game_meta_encoded.head()

Unnamed: 0,early_access,metascore,price,sentiment,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,...,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Turn Notifications,Steam Workshop,SteamVR Collectibles,Tracked Motion Controllers,Valve Anti-Cheat enabled
10,0,88.0,9.99,3,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20,0,,4.99,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30,0,79.0,4.99,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50,0,,4.99,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
60,0,,4.99,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Assemble full metadata set

In [98]:
full_meta = pd.merge(game_ids, game_meta_encoded, how="left", left_index=True, right_index=True)
full_meta.shape

(1346, 62)

#### Fill NaN:s and normalize values in `metascore`, `price`, and `sentiment` columns

In [97]:
from sklearn.preprocessing import MinMaxScaler

In [99]:
# fill NaN:s with column mean values
full_meta.fillna(full_meta.mean(), inplace=True)

# normalize values in the 2 columns
scaler = MinMaxScaler()
scaled = scaler.fit_transform(full_meta[["metascore", "price", "sentiment"]])
full_meta[["metascore", "price", "sentiment"]] = scaled

In [105]:
full_meta.head()

Unnamed: 0_level_0,early_access,metascore,price,sentiment,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,...,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Turn Notifications,Steam Workshop,SteamVR Collectibles,Tracked Motion Controllers,Valve Anti-Cheat enabled
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8930,0.0,0.916667,0.038251,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
377160,0.0,0.833333,0.038251,0.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342380,0.050633,0.735917,0.021977,0.764838,0.594991,0.303972,0.004318,0.000864,0.124352,0.001727,...,0.119898,0.637755,0.417517,0.213435,0.52551,0.002551,0.142857,0.00085,0.005952,0.064626
108800,0.0,0.735917,0.038251,0.833333,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215470,0.0,0.597222,0.009725,0.666667,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [106]:
# persist processed game data
full_meta.to_csv("train_test_split/processed_metadata.csv")

## Content-based filtering (CBF)

### Compute game-game similarity matrix

### CBF algorithm

### CBF evaluation

## Collaborative filtering (CF) using deep learning 

## Hybrid methods