In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import split
from pyspark.sql.functions import explode
from pyspark.sql.functions import trim
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import array

from pyspark.sql.functions import collect_list

In [2]:
# Start (or reuse) the Spark session that every later cell reads through.
spark = (
    SparkSession.builder
    .appName("MyApp")
    # NOTE(review): looks like a placeholder setting - confirm it is needed.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
# Load the semicolon-delimited games catalogue and keep only the id and genre.
# Spaces are stripped from the genre string, and rows whose genre contains
# "https" are dropped.
games_data = (
    spark.read
    .option("delimiter", ";")
    .csv("data/steam/steam_games.csv", header=True)
    .select("App ID", regexp_replace(col("genre"), " ", "").alias("genre"))
    .filter(~col("genre").like("%https%"))
)
games_data.show()

+-------+--------------------+
| App ID|               genre|
+-------+--------------------+
|     10|              Action|
|1000000|Action,Adventure,...|
|1000010|Adventure,Indie,R...|
|1000030|Action,Indie,Simu...|
|1000040|Action,Casual,Ind...|
|1000080|Action,Adventure,...|
|1000100|Adventure,Indie,R...|
|1000110|Action,Adventure,...|
|1000130|        Casual,Indie|
|1000280|           Indie,RPG|
|1000310| Action,RPG,Strategy|
|1000360|  Action,EarlyAccess|
|1000370|               Indie|
|1000380|Action,Adventure,...|
|1000410|  Action,EarlyAccess|
|1000470| Adventure,Indie,RPG|
|1000480|Action,Indie,RPG,...|
|1000500|Action,Casual,Ind...|
|1000510|           Education|
|1000540|Casual,FreetoPlay...|
+-------+--------------------+
only showing top 20 rows



In [4]:
# Load the per-player playtime sample and drop rows without a recorded playtime.
playtime_data = spark.read.csv("data/steam/games_1_sample.csv", header=True)
playtime_data = playtime_data.filter(col("playtime_forever").isNotNull())

# Keep only the games that are present in games_data. No column of games_data
# is used afterwards, so this is an existence filter: a left-semi join keeps
# each playtime row at most once, whereas an inner join would emit one copy per
# matching games_data row if an App ID is ever repeated there.
playtime_data = playtime_data.join(
    games_data,
    on=playtime_data["appid"] == games_data["App ID"],
    how="left_semi",
).select("steamid", "appid", "playtime_forever")
playtime_data.show()

+-----------------+------+----------------+
|          steamid| appid|playtime_forever|
+-----------------+------+----------------+
|76561198015117260| 32430|             715|
|76561198015116850|   320|               7|
|76561198015116840|  1500|               6|
|76561198015116840| 10150|             479|
|76561198015116840|  8190|              77|
|76561198015116600|   240|            3004|
|76561198015115520|  8190|            5266|
|76561198015115080| 12520|              33|
|76561198015115080|  7600|             958|
|76561198015115030| 26900|               9|
|76561198015115030| 22000|             197|
|76561198015114670|    10|          125969|
|76561198015114350|    10|            9549|
|76561198015113950|   320|               7|
|76561198015113950|202970|             344|
|76561198015113900| 50130|            1025|
|76561198015113900|   620|             957|
|76561198015113900| 72850|            7415|
|76561198015113900| 46570|            1227|
|76561198015113900|233470|      

In [5]:
# Break the comma-separated genre string into an array column.
games_data = games_data.withColumn(
    "genre_array",
    split(col("genre"), ","),
)
games_data.show()

+-------+--------------------+--------------------+
| App ID|               genre|         genre_array|
+-------+--------------------+--------------------+
|     10|              Action|            [Action]|
|1000000|Action,Adventure,...|[Action, Adventur...|
|1000010|Adventure,Indie,R...|[Adventure, Indie...|
|1000030|Action,Indie,Simu...|[Action, Indie, S...|
|1000040|Action,Casual,Ind...|[Action, Casual, ...|
|1000080|Action,Adventure,...|[Action, Adventur...|
|1000100|Adventure,Indie,R...|[Adventure, Indie...|
|1000110|Action,Adventure,...|[Action, Adventur...|
|1000130|        Casual,Indie|     [Casual, Indie]|
|1000280|           Indie,RPG|        [Indie, RPG]|
|1000310| Action,RPG,Strategy|[Action, RPG, Str...|
|1000360|  Action,EarlyAccess|[Action, EarlyAcc...|
|1000370|               Indie|             [Indie]|
|1000380|Action,Adventure,...|[Action, Adventur...|
|1000410|  Action,EarlyAccess|[Action, EarlyAcc...|
|1000470| Adventure,Indie,RPG|[Adventure, Indie...|
|1000480|Act

In [6]:
# One output row per (App ID, genre) pair: explode the genre_array column.
games_data = games_data.select(
    col("App ID"),
    explode(col("genre_array")).alias("genre"),
)

# Spot-check the exploded rows for a single game.
games_data.filter(col("App ID") == 1599250).show()

+-------+----------+
| App ID|     genre|
+-------+----------+
|1599250|    Action|
|1599250| Adventure|
|1599250|    Casual|
|1599250|     Indie|
|1599250|       RPG|
|1599250|Simulation|
+-------+----------+



In [7]:
# Contingency table of App ID against genre: one row per App ID and one count
# column per distinct genre. The leading id column is named "App ID_genre".
games_data = games_data.stat.crosstab("App ID", "genre")
games_data.show()

+------------+----------+------+---------+------------------+---------------+------+-------------------+-----------+---------+----------+---------------+----+-----+--------------------+-----+------+------------+---+------+-------------+----------+----------------+------+--------+---------+---------------+-------+-------------+
|App ID_genre|Accounting|Action|Adventure|Animation&Modeling|AudioProduction|Casual|Design&Illustration|EarlyAccess|Education|FreetoPlay|GameDevelopment|Gore|Indie|MassivelyMultiplayer|Movie|Nudity|PhotoEditing|RPG|Racing|SexualContent|Simulation|SoftwareTraining|Sports|Strategy|Utilities|VideoProduction|Violent|WebPublishing|
+------------+----------+------+---------+------------------+---------------+------+-------------------+-----------+---------+----------+---------------+----+-----+--------------------+-----+------+------------+---+------+-------------+----------+----------------+------+--------+---------+---------------+-------+-------------+
|     1599250

In [8]:
# Every column after the leading id column is a per-genre count; pack them,
# in column order, into a single array column.
columns = [col(name) for name in games_data.columns[1:]]

games_data = (
    games_data
    .withColumn("one_hot_vector", array(*columns))
    .select(col("App ID_genre").alias("appid"), "one_hot_vector")
)
games_data.show()

+-------+--------------------+
|  appid|      one_hot_vector|
+-------+--------------------+
|1599250|[0, 1, 1, 0, 0, 1...|
|1029150|[0, 0, 1, 0, 0, 0...|
| 667880|[0, 0, 0, 0, 0, 1...|
|1381630|[0, 1, 0, 0, 0, 1...|
| 887510|[0, 1, 0, 0, 0, 0...|
|1036260|[0, 0, 1, 0, 0, 1...|
|1091380|[0, 0, 0, 0, 0, 0...|
|1104670|[0, 0, 0, 0, 0, 1...|
|1592230|[0, 1, 1, 0, 0, 0...|
|1340300|[0, 1, 0, 0, 0, 0...|
| 914050|[0, 1, 1, 0, 0, 1...|
| 670490|[0, 0, 0, 0, 0, 0...|
|1711160|[0, 0, 0, 0, 0, 1...|
|1530040|[0, 0, 1, 0, 0, 0...|
| 449160|[0, 1, 1, 0, 0, 1...|
|2085920|[0, 1, 0, 0, 0, 1...|
| 953250|[0, 1, 1, 0, 0, 1...|
| 596370|[0, 0, 0, 0, 0, 1...|
|1636700|[0, 1, 1, 0, 0, 0...|
|1531720|[0, 0, 1, 0, 0, 1...|
+-------+--------------------+
only showing top 20 rows



In [9]:
# Spot-check the genre vector of a single game.
games_data.filter(col("appid") == 202970).show()

+------+--------------------+
| appid|      one_hot_vector|
+------+--------------------+
|202970|[0, 1, 0, 0, 0, 0...|
+------+--------------------+



In [10]:
# Attach each game's genre vector to its playtime rows (equi-join on appid),
# keeping only the player id, the vector and the playtime.
playtime_data = (
    playtime_data
    .join(games_data, on="appid", how="inner")
    .select("steamid", "one_hot_vector", "playtime_forever")
)
playtime_data.show()

+-----------------+--------------------+----------------+
|          steamid|      one_hot_vector|playtime_forever|
+-----------------+--------------------+----------------+
|76561198015117260|[0, 1, 0, 0, 0, 0...|             715|
|76561198015116850|[0, 1, 0, 0, 0, 0...|               7|
|76561198015116840|[0, 0, 0, 0, 0, 0...|               6|
|76561198015116840|[0, 1, 1, 0, 0, 0...|             479|
|76561198015116840|[0, 1, 1, 0, 0, 0...|              77|
|76561198015116600|[0, 1, 0, 0, 0, 0...|            3004|
|76561198015115520|[0, 1, 1, 0, 0, 0...|            5266|
|76561198015115080|[0, 0, 0, 0, 0, 0...|              33|
|76561198015115080|[0, 0, 0, 0, 0, 0...|             958|
|76561198015115030|[0, 0, 0, 0, 0, 1...|               9|
|76561198015115030|[0, 0, 0, 0, 0, 0...|             197|
|76561198015114670|[0, 1, 0, 0, 0, 0...|          125969|
|76561198015114350|[0, 1, 0, 0, 0, 0...|            9549|
|76561198015113950|[0, 1, 0, 0, 0, 0...|               7|
|7656119801511

In [11]:
# Collapse to one row per player, gathering that player's genre vectors and
# playtimes into two list columns.
# NOTE(review): the two lists come from separate collect_list aggregates;
# confirm that downstream code may rely on them being index-aligned.
playtime_data = (
    playtime_data
    .groupBy("steamid")
    .agg(
        collect_list("one_hot_vector").alias("game_vector_list"),
        collect_list("playtime_forever").alias("playtime_list"),
    )
)


In [13]:
# Convert the collected playtimes to an array of integers. The CSV was read
# without a schema, so the values are presumably strings at this point.
playtime_data = playtime_data.withColumn(
    "playtime_list",
    col("playtime_list").cast("array<int>"),
)

In [11]:
# Preview the per-player frame (first 20 rows, long cells truncated).
# NOTE(review): the output stored with this cell still lists the
# pre-aggregation columns (one_hot_vector, playtime_forever); restart the
# kernel and run all cells to refresh it.
playtime_data.show(n=20, truncate=True)

+-----------------+--------------------+----------------+
|          steamid|      one_hot_vector|playtime_forever|
+-----------------+--------------------+----------------+
|76561198015117260|[0, 1, 0, 0, 0, 0...|             715|
|76561198015116850|[0, 1, 0, 0, 0, 0...|               7|
|76561198015116840|[0, 0, 0, 0, 0, 0...|               6|
|76561198015116840|[0, 1, 1, 0, 0, 0...|             479|
|76561198015116840|[0, 1, 1, 0, 0, 0...|              77|
|76561198015116600|[0, 1, 0, 0, 0, 0...|            3004|
|76561198015115520|[0, 1, 1, 0, 0, 0...|            5266|
|76561198015115080|[0, 0, 0, 0, 0, 0...|              33|
|76561198015115080|[0, 0, 0, 0, 0, 0...|             958|
|76561198015115030|[0, 0, 0, 0, 0, 1...|               9|
|76561198015115030|[0, 0, 0, 0, 0, 0...|             197|
|76561198015114670|[0, 1, 0, 0, 0, 0...|          125969|
|76561198015114350|[0, 1, 0, 0, 0, 0...|            9549|
|76561198015113950|[0, 1, 0, 0, 0, 0...|               7|
|7656119801511