# Recommendation System

## Pre-requisites

In [1]:
!apt-get -y install openjdk-8-jdk-headless
!wget http://apache.osuosl.org/spark/spark-2.4.2/spark-2.4.2-bin-hadoop2.7.tgz
!tar xf spark-2.4.2-bin-hadoop2.7.tgz
!pip install findspark kaggle

Reading package lists... Done
Building dependency tree       
Reading state information... Done
openjdk-8-jdk-headless is already the newest version (8u191-b12-2ubuntu0.18.04.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.
--2019-05-01 13:52:07--  http://apache.osuosl.org/spark/spark-2.4.2/spark-2.4.2-bin-hadoop2.7.tgz
Resolving apache.osuosl.org (apache.osuosl.org)... 64.50.236.52, 140.211.166.134, 64.50.233.100, ...
Connecting to apache.osuosl.org (apache.osuosl.org)|64.50.236.52|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 208534388 (199M) [application/x-gzip]
Saving to: ‘spark-2.4.2-bin-hadoop2.7.tgz’


2019-05-01 13:52:10 (78.8 MB/s) - ‘spark-2.4.2-bin-hadoop2.7.tgz’ saved [208534388/208534388]

Collecting findspark
  Downloading https://files.pythonhosted.org/packages/b1/c8/e6e1f6a303ae5122dc28d131b5a

In [2]:
import os

# setup environment variables
os.environ["JAVA_HOME"] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ["SPARK_HOME"] = '/content/spark-2.4.2-bin-hadoop2.7'
os.environ['KAGGLE_USERNAME'] = 'ronaldsumbayak'
os.environ['KAGGLE_KEY'] = '771009dffffa0cd0883d5fb98594d756'

# verify
print(os.environ['JAVA_HOME'])
print(os.environ['SPARK_HOME'])
print(os.environ['KAGGLE_USERNAME'])
print(os.environ['KAGGLE_KEY'])

/usr/lib/jvm/java-8-openjdk-amd64
/content/spark-2.4.2-bin-hadoop2.7
ronaldsumbayak
771009dffffa0cd0883d5fb98594d756


## Spark Initialization

In [0]:
import findspark
findspark.init()

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').getOrCreate()

## Dataset Preparation

### Download

In [5]:
!kaggle datasets download -d tamber/steam-video-games

Downloading steam-video-games.zip to /content
  0% 0.00/1.56M [00:00<?, ?B/s]
100% 1.56M/1.56M [00:00<00:00, 51.8MB/s]


### Extract

In [6]:
!unzip steam-video-games.zip -d data

Archive:  steam-video-games.zip
  inflating: data/steam-200k.csv     


### Load

In [0]:
steam = spark.read.csv('data/steam-200k.csv', inferSchema=True)

### Cleansing

In [0]:
steam = steam.drop('_c4')

In [0]:
steam = steam.toDF('user_id', 'game_name', 'behavior', 'hours_played')

### Data Preview

In [10]:
steam.count()

200000

In [11]:
steam.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- game_name: string (nullable = true)
 |-- behavior: string (nullable = true)
 |-- hours_played: double (nullable = true)



In [12]:
steam.show(10, truncate=False)

+---------+--------------------------+--------+------------+
|user_id  |game_name                 |behavior|hours_played|
+---------+--------------------------+--------+------------+
|151603712|The Elder Scrolls V Skyrim|purchase|1.0         |
|151603712|The Elder Scrolls V Skyrim|play    |273.0       |
|151603712|Fallout 4                 |purchase|1.0         |
|151603712|Fallout 4                 |play    |87.0        |
|151603712|Spore                     |purchase|1.0         |
|151603712|Spore                     |play    |14.9        |
|151603712|Fallout New Vegas         |purchase|1.0         |
|151603712|Fallout New Vegas         |play    |12.1        |
|151603712|Left 4 Dead 2             |purchase|1.0         |
|151603712|Left 4 Dead 2             |play    |8.9         |
+---------+--------------------------+--------+------------+
only showing top 10 rows



## Preprocessing

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

### Normalize user_id

In [0]:
users = steam.select('user_id').distinct()
users = users.withColumn('idx', F.monotonically_increasing_id())
w = W.orderBy("idx")
users = users.withColumn('user', F.row_number().over(w))
users = users.drop("idx")

In [15]:
users.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- user: integer (nullable = true)



In [16]:
users.show(10, truncate=False)

+---------+----+
|user_id  |user|
+---------+----+
|16167221 |1   |
|166705920|2   |
|244878837|3   |
|99992274 |4   |
|174415183|5   |
|156156544|6   |
|152861732|7   |
|171911285|8   |
|128412180|9   |
|74557142 |10  |
+---------+----+
only showing top 10 rows



### Generate ID column for games entity

In [0]:
games = steam.select('game_name').distinct()
games = games.withColumn('idx', F.monotonically_increasing_id())
w = W.orderBy("idx")
games = games.withColumn('item', F.row_number().over(w))
games = games.drop("idx")

In [18]:
games.printSchema()

root
 |-- game_name: string (nullable = true)
 |-- item: integer (nullable = true)



In [19]:
games.show(10, truncate=False)

+-----------------------------------+----+
|game_name                          |item|
+-----------------------------------+----+
|Dota 2                             |1   |
|METAL GEAR SOLID V THE PHANTOM PAIN|2   |
|LEGO Batman The Videogame          |3   |
|RIFT                               |4   |
|Anodyne                            |5   |
|Legend of Grimrock                 |6   |
|Divinity Original Sin              |7   |
|Meltdown                           |8   |
|SanctuaryRPG Black Edition         |9   |
|Snuggle Truck                      |10  |
+-----------------------------------+----+
only showing top 10 rows



### Merge back to main DataFrame

In [0]:
steam = steam.join(users, ['user_id'])
steam = steam.join(games, ['game_name'])

In [21]:
steam.show(10, truncate=False)

+--------------------------+---------+--------+------------+----+----+
|game_name                 |user_id  |behavior|hours_played|user|item|
+--------------------------+---------+--------+------------+----+----+
|The Elder Scrolls V Skyrim|151603712|purchase|1.0         |3407|3037|
|The Elder Scrolls V Skyrim|151603712|play    |273.0       |3407|3037|
|Fallout 4                 |151603712|purchase|1.0         |3407|479 |
|Fallout 4                 |151603712|play    |87.0        |3407|479 |
|Spore                     |151603712|purchase|1.0         |3407|4525|
|Spore                     |151603712|play    |14.9        |3407|4525|
|Fallout New Vegas         |151603712|purchase|1.0         |3407|4471|
|Fallout New Vegas         |151603712|play    |12.1        |3407|4471|
|Left 4 Dead 2             |151603712|purchase|1.0         |3407|87  |
|Left 4 Dead 2             |151603712|play    |8.9         |3407|87  |
+--------------------------+---------+--------+------------+----+----+
only s

### Standardize Column Names

In [0]:
steam_standardized = steam.toDF('game_name', 'user_id', 'behavior', 'rating', 'user', 'item')

In [23]:
steam_standardized.show(10, truncate=False)

+--------------------------+---------+--------+------+----+----+
|game_name                 |user_id  |behavior|rating|user|item|
+--------------------------+---------+--------+------+----+----+
|The Elder Scrolls V Skyrim|151603712|purchase|1.0   |3407|3037|
|The Elder Scrolls V Skyrim|151603712|play    |273.0 |3407|3037|
|Fallout 4                 |151603712|purchase|1.0   |3407|479 |
|Fallout 4                 |151603712|play    |87.0  |3407|479 |
|Spore                     |151603712|purchase|1.0   |3407|4525|
|Spore                     |151603712|play    |14.9  |3407|4525|
|Fallout New Vegas         |151603712|purchase|1.0   |3407|4471|
|Fallout New Vegas         |151603712|play    |12.1  |3407|4471|
|Left 4 Dead 2             |151603712|purchase|1.0   |3407|87  |
|Left 4 Dead 2             |151603712|play    |8.9   |3407|87  |
+--------------------------+---------+--------+------+----+----+
only showing top 10 rows



### Normalize Rating Values

In [0]:
max_rating = steam_standardized.agg(F.max('rating')).collect()[0][0]

In [25]:
max_rating

11754.0

In [0]:
steam_normalized = steam_standardized.withColumn('rating', F.col('rating') / max_rating * 10)

In [27]:
steam_normalized.show(10, truncate=False)

+--------------------------+---------+--------+--------------------+----+----+
|game_name                 |user_id  |behavior|rating              |user|item|
+--------------------------+---------+--------+--------------------+----+----+
|The Elder Scrolls V Skyrim|151603712|purchase|8.507742045261188E-4|3407|3037|
|The Elder Scrolls V Skyrim|151603712|play    |0.23226135783563043 |3407|3037|
|Fallout 4                 |151603712|purchase|8.507742045261188E-4|3407|479 |
|Fallout 4                 |151603712|play    |0.07401735579377233 |3407|479 |
|Spore                     |151603712|purchase|8.507742045261188E-4|3407|4525|
|Spore                     |151603712|play    |0.012676535647439171|3407|4525|
|Fallout New Vegas         |151603712|purchase|8.507742045261188E-4|3407|4471|
|Fallout New Vegas         |151603712|play    |0.010294367874766038|3407|4471|
|Left 4 Dead 2             |151603712|purchase|8.507742045261188E-4|3407|87  |
|Left 4 Dead 2             |151603712|play    |0.007

### Filter play behavior only

In [0]:
steam_filtered = steam_normalized.filter(steam.behavior == 'play')

In [29]:
steam_filtered.count()

70489

In [30]:
steam_filtered.show(10, truncate=False)

+--------------------------+---------+--------+--------------------+----+----+
|game_name                 |user_id  |behavior|rating              |user|item|
+--------------------------+---------+--------+--------------------+----+----+
|The Elder Scrolls V Skyrim|151603712|play    |0.23226135783563043 |3407|3037|
|Fallout 4                 |151603712|play    |0.07401735579377233 |3407|479 |
|Spore                     |151603712|play    |0.012676535647439171|3407|4525|
|Fallout New Vegas         |151603712|play    |0.010294367874766038|3407|4471|
|Left 4 Dead 2             |151603712|play    |0.007571890420282457|3407|87  |
|HuniePop                  |151603712|play    |0.007231580738472009|3407|3894|
|Path of Exile             |151603712|play    |0.006891271056661562|3407|1807|
|Poly Bridge               |151603712|play    |0.00638080653394589 |3407|4814|
|Left 4 Dead               |151603712|play    |0.002807554874936192|3407|2318|
|Team Fortress 2           |151603712|play    |0.002

In [0]:
data = steam_filtered

## Training

In [0]:
from pyspark.ml.recommendation import ALS

### Train-Validation Split

In [0]:
train_data, valid_data = data.randomSplit([0.66, 0.33])

### Model Training

In [0]:
als = ALS(regParam=0.01, coldStartStrategy='drop')

In [0]:
model = als.fit(train_data)

## Evaluation

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
evaluator = RegressionEvaluator(labelCol='rating')

In [0]:
predictions = model.transform(valid_data)

In [39]:
predictions.show(10, truncate=False)

+--------------+---------+--------+---------------------+-----+----+------------+
|game_name     |user_id  |behavior|rating               |user |item|prediction  |
+--------------+---------+--------+---------------------+-----+----+------------+
|Sacred Citadel|140911953|play    |0.004424025863535818 |10786|148 |6.553171E-5 |
|Sacred Citadel|8585433  |play    |0.013867619533775737 |926  |148 |0.017694548 |
|Sacred Citadel|36502549 |play    |0.009358516249787306 |7338 |148 |0.010094965 |
|Sacred Citadel|73835640 |play    |0.00544495490896716  |8733 |148 |0.013697032 |
|Mass Effect 2 |34177747 |play    |0.08933129147524248  |1019 |1088|0.051740825 |
|Mass Effect 2 |59790700 |play    |0.0012761613067891783|5003 |1088|0.01793686  |
|Mass Effect 2 |17530772 |play    |0.020418580908626853 |7216 |1088|0.0076707522|
|Mass Effect 2 |7907686  |play    |7.656967840735068E-4 |12167|1088|0.016108094 |
|Mass Effect 2 |22371742 |play    |0.02552322613578356  |10485|1088|6.641926E-4 |
|Mass Effect 2 |

In [0]:
rmse = evaluator.evaluate(predictions)

In [41]:
rmse

0.16794349747653262

## Testing

### Recommendations for all Users

In [0]:
recommendations = model.recommendForAllUsers(5)

In [43]:
recommendations.show(10, truncate=False)

+-----+-------------------------------------------------------------------------------------------------------------+
|user |recommendations                                                                                              |
+-----+-------------------------------------------------------------------------------------------------------------+
|1580 |[[2959, 0.0631839], [1110, 0.04496198], [1830, 0.037446685], [3208, 0.037225492], [1, 0.03257108]]           |
|4900 |[[4324, 0.7504163], [4554, 0.50795627], [3799, 0.35190612], [1771, 0.31096798], [3585, 0.2974562]]           |
|5300 |[[1830, 0.03650146], [267, 0.033193476], [3110, 0.029301476], [3273, 0.026726682], [1611, 0.025783136]]      |
|7240 |[[1, 0.0014302569], [3585, 8.106845E-4], [3228, 7.28968E-4], [3302, 5.4942403E-4], [4962, 5.454117E-4]]      |
|7340 |[[1, 1.6826551E-4], [3585, 9.537465E-5], [3228, 8.576095E-5], [3302, 6.463812E-5], [4962, 6.416608E-5]]      |
|471  |[[1830, 3.9120223E-5], [267, 3.3943666E-5], [3273

### Recommendation for all Items (Games)

In [0]:
recommendations = model.recommendForAllItems(5)

In [45]:
recommendations.show(10, truncate=False)

+----+------------------------------------------------------------------------------------------------------------+
|item|recommendations                                                                                             |
+----+------------------------------------------------------------------------------------------------------------+
|1580|[[6329, 0.022355], [7460, 0.0108789], [10372, 0.009527797], [5601, 0.009400668], [11797, 0.0061045857]]     |
|4900|[[6329, 0.2373486], [9896, 0.18274197], [8132, 0.13951838], [6715, 0.11936343], [7460, 0.115504]]           |
|1591|[[6329, 0.57487416], [7460, 0.27975842], [10372, 0.24844491], [5601, 0.24065857], [11797, 0.13133398]]      |
|2122|[[2303, 0.18508837], [5219, 0.17504455], [5518, 0.17257887], [972, 0.1702103], [3845, 0.16773912]]          |
|2142|[[6329, 0.018054327], [7460, 0.00878601], [10372, 0.007849644], [5601, 0.007529477], [5178, 0.0027049235]]  |
|833 |[[9896, 0.03635395], [8132, 0.027621696], [6715, 0.023737509], [10

### Recommendation for a subset of Users

In [0]:
users_test = users.limit(10)

In [0]:
recommendations = model.recommendForUserSubset(users_test, 5)

In [48]:
recommendations.show(10, truncate=False)

+----+------------------------------------------------------------------------------------------------------------+
|user|recommendations                                                                                             |
+----+------------------------------------------------------------------------------------------------------------+
|1   |[[4324, 0.05422043], [4822, 0.03923504], [2569, 0.03547461], [219, 0.03436207], [4085, 0.032793526]]        |
|6   |[[1, 0.0044590365], [3585, 0.002527428], [3228, 0.0022726653], [3302, 0.0017129102], [4962, 0.0017004013]]  |
|3   |[[1, 0.0033653104], [3585, 0.0019074929], [3228, 0.0017152191], [3302, 0.0012927626], [4962, 0.0012833218]] |
|5   |[[4554, 0.011953308], [1771, 0.010481266], [1746, 0.00969277], [3799, 0.008817164], [2205, 0.00830663]]     |
|9   |[[1797, 0.0027961282], [2569, 0.0026869443], [4822, 0.0024209744], [1874, 0.002317584], [274, 0.0023028024]]|
|7   |[[1, 0.3827954], [3585, 0.23519896], [3273, 0.23053911], [1830, 0.

### Recommendation for a subset of Items (Games)

In [0]:
games_test = games.limit(10)

In [0]:
recommendations = model.recommendForItemSubset(games_test, 5)

In [51]:
recommendations.show(10, truncate=False)

+----+-----------------------------------------------------------------------------------------------------------+
|item|recommendations                                                                                            |
+----+-----------------------------------------------------------------------------------------------------------+
|1   |[[9896, 8.498187], [8132, 6.459309], [6715, 5.5467253], [10613, 4.935228], [9577, 4.857704]]               |
|6   |[[6329, 0.2439413], [7460, 0.11871229], [10372, 0.105027765], [5601, 0.10232887], [1449, 0.07350372]]      |
|3   |[[6329, 0.35485035], [9896, 0.20426798], [7460, 0.1726854], [8132, 0.1591491], [10372, 0.15123305]]        |
|5   |[[1449, 0.049378157], [282, 0.043468732], [5803, 0.042421453], [2705, 0.032898705], [6329, 0.032034222]]   |
|9   |[[6329, 0.019690938], [9896, 0.016382797], [8132, 0.012433029], [6715, 0.011085818], [10613, 0.009896639]] |
|4   |[[6329, 1.5061194], [11797, 1.0318946], [7460, 0.73294234], [2469, 0.68473

## Save Model

In [0]:
model.save('als_recommendation')