In [17]:
from pyspark.sql.functions import asc, first, mean, row_number, when, udf, stddev_pop, col, lit, collect_list, array
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
from pyspark.ml.linalg import DenseVector

## Load Dataset
The dataset is loaded from MariaDB into a dataframe.

In [18]:
spark = SparkSession.builder.appName('ReadMariaDB') \
.config("spark.driver.memory", "32g") \
.config("spark.sql.pivotMaxValues", "1000000") \
.getOrCreate()


sql = "select * from 01_sampled_games_2v2 WHERE playtime_forever IS NOT NULL AND playtime_forever > 0"
database = "steam"
user = "root"
password = "example"
server = "192.168.2.62"
port = 3306
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
jdbc_driver = "org.mariadb.jdbc.Driver"

# Create a data frame by reading data from Oracle via JDBC
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("query", sql) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", jdbc_driver) \
    .load()

df = df.drop("playtime_2weeks", "dateretrieved")

In [19]:
# Count the number of rows in the DataFrame
row_count = df.count()

# Print the row count
print("The DataFrame has", row_count, "rows.")

df.show()

The DataFrame has 88349 rows.
+-----------------+-----+----------------+
|          steamid|appid|playtime_forever|
+-----------------+-----+----------------+
|76561197960268000|  300|             109|
|76561197960268000| 1300|              94|
|76561197960268000| 2100|             110|
|76561197960268000| 4000|             152|
|76561197960268000| 2600|              59|
|76561197960268000| 9000|               2|
|76561197960268000| 2300|              40|
|76561197960268000| 2200|             210|
|76561197960268000| 4500|            1002|
|76561197960268000|  400|             380|
|76561197960268000|17300|            4312|
|76561197960268000|  500|             198|
|76561197960268000|22300|             425|
|76561197960268000|19900|             111|
|76561197960268000|18500|            1160|
|76561197960268000|22200|              20|
|76561197960268000|35700|             433|
|76561197960268000|36000|              27|
|76561197960268000|39800|             510|
|76561197960268000|32400

## Pearson Correlation Matrix
Since the dataset contains at most ~4500 games, we can expect a 4500^2=81,000,000 sized matrix. Each float entry takes 4 bytes in memory. Therefore, the pearson correlation matrix would take up 324 MB of memory.

Since the memory used is relatively small, we will pre-compute the person correlation matrix and store it for use later in the algorithm.

In [20]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import collect_list, udf, size
from pyspark.sql.types import ArrayType, DoubleType

# Compute the maximum length of the lists of playtime_forever values
max_len = df.filter("playtime_forever IS NOT NULL") \
    .groupBy('appid').agg(size(collect_list('playtime_forever')).alias('num_playtimes')) \
    .agg({'num_playtimes': 'max'}).collect()[0][0]

# Define a UDF to pad lists with zeros
pad_zeros = udf(lambda x: x + [0.0]*(max_len-len(x)), ArrayType(DoubleType()))

# Create playtime vectors for each game
list_to_dense = udf(lambda l: Vectors.dense(l), VectorUDT())
vectors = df.filter("playtime_forever IS NOT NULL")\
    .groupBy('appid').agg(collect_list('playtime_forever'))\
        .withColumn('padded_features', pad_zeros('collect_list(playtime_forever)')) \
        .withColumn('features', list_to_dense('padded_features'))
vectors.show()

+-----+------------------------------+--------------------+--------------------+
|appid|collect_list(playtime_forever)|     padded_features|            features|
+-----+------------------------------+--------------------+--------------------+
|  100|          [14, 57, 209, 46,...|[null, null, null...|[14.0,57.0,209.0,...|
|  300|          [712, 5, 18, 19, ...|[null, null, null...|[712.0,5.0,18.0,1...|
|  400|          [110, 467, 35, 22...|[null, null, null...|[110.0,467.0,35.0...|
|  500|          [33, 349, 311, 15...|[null, null, null...|[33.0,349.0,311.0...|
| 1200|          [10, 387, 172, 20...|[null, null, null...|[10.0,387.0,172.0...|
| 1300|          [7, 2, 270, 91, 2...|[null, null, null...|[7.0,2.0,270.0,91...|
| 1500|          [8, 39, 24, 12, 2...|[null, null, null...|[8.0,39.0,24.0,12...|
| 1600|          [12, 3, 23, 1527,...|[null, null, null...|[12.0,3.0,23.0,15...|
| 1700|          [383, 11, 257, 14...|[null, null, null...|[383.0,11.0,257.0...|
| 1900|          [1, 898, 58

In [21]:
pearson_matrix = Correlation.corr(vectors, "features", "pearson")
corr_array = pearson_matrix.head()[0].toArray()
print(corr_array)

                                                                                

[[ 1.          0.13624647  0.31365756 ... -0.00613546 -0.00613546
  -0.00613546]
 [ 0.13624647  1.          0.26861489 ... -0.00189201 -0.00189201
  -0.00189201]
 [ 0.31365756  0.26861489  1.         ... -0.0054461  -0.0054461
  -0.0054461 ]
 ...
 [-0.00613546 -0.00189201 -0.0054461  ...  1.          1.
   1.        ]
 [-0.00613546 -0.00189201 -0.0054461  ...  1.          1.
   1.        ]
 [-0.00613546 -0.00189201 -0.0054461  ...  1.          1.
   1.        ]]


## Per-Game 1-5 Rating Normalization
For each game, we calculate the mean and standard deviation. We then create buckets for each rating:
### Cut points
* Cut point 1: mean - std_dev*0.5 if > 0, else 0
* Cut point 2: mean
* Cut point 3: mean + std_dev*0.5
* Cut point 4: mean + std_dev
### Ratings
* Rating 1: 0 < x < cut point 1
* Rating 2: cut point 1 < x < cut point 2
* Rating 3: cut point 2 < x < cut point 3
* Rating 4: cut point 3 < x < cut point 4
* Rating 5: cut point 5 < x < inf

In [22]:

# Calculate the per-game mean and standard deviation of the playtime column
game_stats = df.filter(col("playtime_forever") > 0).groupBy("appid").agg(
    mean("playtime_forever").alias("game_mean_playtime"),
    stddev_pop("playtime_forever").alias("game_stddev_playtime")
)
df = df.join(game_stats, "appid")

df = df.withColumn("cut_point_1", when(col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5) > 0, col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5)).otherwise(0))
df = df.withColumn("cut_point_2", col("game_mean_playtime"))
df = df.withColumn("cut_point_3", col("game_mean_playtime") + (col("game_stddev_playtime") * 0.5))
df = df.withColumn("cut_point_4", col("game_mean_playtime") + col("game_stddev_playtime"))

df = df.withColumn(
    "ratings",
    when(col("playtime_forever") <= col("cut_point_1"), lit(1))
    .when((col("playtime_forever") > col("cut_point_1")) & (col("playtime_forever") <= col("cut_point_2")), lit(2))
    .when((col("playtime_forever") > col("cut_point_2")) & (col("playtime_forever") <= col("cut_point_3")), lit(3))
    .when((col("playtime_forever") > col("cut_point_3")) & (col("playtime_forever") <= col("cut_point_4")), lit(4))
    .otherwise(lit(5))
)

df.show()

+------+-----------------+----------------+------------------+--------------------+-----------------+-----------+------------------+------------------+-------+
| appid|          steamid|playtime_forever|game_mean_playtime|game_stddev_playtime|      cut_point_1|cut_point_2|       cut_point_3|       cut_point_4|ratings|
+------+-----------------+----------------+------------------+--------------------+-----------------+-----------+------------------+------------------+-------+
|263700|76561198071863000|              34|           55.7500|   75.15442435412568|18.17278782293716|    55.7500| 93.32721217706285| 130.9044243541257|      2|
|263700|76561198055182000|               3|           55.7500|   75.15442435412568|18.17278782293716|    55.7500| 93.32721217706285| 130.9044243541257|      1|
|263700|76561197968825000|               2|           55.7500|   75.15442435412568|18.17278782293716|    55.7500| 93.32721217706285| 130.9044243541257|      1|
|263700|76561197968595000|             1