In [34]:
from pyspark.sql.functions import asc, first, mean, row_number, when, udf, stddev_pop, col, lit, collect_list, array
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
from pyspark.ml.linalg import DenseVector

## Load Dataset
The dataset is loaded from MariaDB into a dataframe.

In [35]:
spark = SparkSession.builder.appName('ReadMariaDB') \
.config("spark.driver.memory", "32g") \
.config("spark.sql.pivotMaxValues", "1000000") \
.getOrCreate()


sql = "select * from 01_sampled_games_2v2 WHERE playtime_forever IS NOT NULL AND playtime_forever > 0"
database = "steam"
user = "root"
password = "example"
server = "192.168.2.62"
port = 3306
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
jdbc_driver = "org.mariadb.jdbc.Driver"

# Create a data frame by reading data from Oracle via JDBC
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("query", sql) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", jdbc_driver) \
    .load()

df = df.drop("playtime_2weeks", "dateretrieved")

In [36]:
# Count the number of rows in the DataFrame
row_count = df.count()

# Print the row count
print("The DataFrame has", row_count, "rows.")

df.show()

The DataFrame has 88349 rows.
+-----------------+-----+----------------+
|          steamid|appid|playtime_forever|
+-----------------+-----+----------------+
|76561197960268000|  300|             109|
|76561197960268000| 1300|              94|
|76561197960268000| 2100|             110|
|76561197960268000| 4000|             152|
|76561197960268000| 2600|              59|
|76561197960268000| 9000|               2|
|76561197960268000| 2300|              40|
|76561197960268000| 2200|             210|
|76561197960268000| 4500|            1002|
|76561197960268000|  400|             380|
|76561197960268000|17300|            4312|
|76561197960268000|  500|             198|
|76561197960268000|22300|             425|
|76561197960268000|19900|             111|
|76561197960268000|18500|            1160|
|76561197960268000|22200|              20|
|76561197960268000|35700|             433|
|76561197960268000|36000|              27|
|76561197960268000|39800|             510|
|76561197960268000|32400

## Split the dataset

In [37]:
# Randomly split the data into 70% training and 30% test data
training, test = df.randomSplit([0.7, 0.3], seed=1234)

## Pearson Correlation Matrix
Since the dataset contains at most ~4500 games, we can expect a 4500^2=81,000,000 sized matrix. Each float entry takes 4 bytes in memory. Therefore, the pearson correlation matrix would take up 324 MB of memory.

Since the memory used is relatively small, we will pre-compute the person correlation matrix and store it for use later in the algorithm.

In [38]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import collect_list, udf, size
from pyspark.sql.types import ArrayType, DoubleType

# Compute the maximum length of the lists of playtime_forever values
max_len = training.filter("playtime_forever IS NOT NULL") \
    .groupBy('appid').agg(size(collect_list('playtime_forever')).alias('num_playtimes')) \
    .agg({'num_playtimes': 'max'}).collect()[0][0]

# Define a UDF to pad lists with zeros
pad_zeros = udf(lambda x: x + [0.0]*(max_len-len(x)), ArrayType(DoubleType()))

# Create playtime vectors for each game
list_to_dense = udf(lambda l: Vectors.dense(l), VectorUDT())
vectors = training.filter("playtime_forever IS NOT NULL")\
    .groupBy('appid').agg(collect_list('playtime_forever'))\
        .withColumn('padded_features', pad_zeros('collect_list(playtime_forever)')) \
        .withColumn('features', list_to_dense('padded_features'))
# vectors = vectors.rdd.zipWithIndex().toDF(['appid', 'features', 'index'])
# vectors = vectors.withColumn("id", vectors["index"].cast("int")).drop("index")
vectors.show()

+-----+------------------------------+--------------------+--------------------+
|appid|collect_list(playtime_forever)|     padded_features|            features|
+-----+------------------------------+--------------------+--------------------+
|  100|          [14, 209, 46, 133...|[null, null, null...|[14.0,209.0,46.0,...|
|  300|          [5, 18, 19, 2373,...|[null, null, null...|[5.0,18.0,19.0,23...|
|  400|          [223, 157, 1, 77,...|[null, null, null...|[223.0,157.0,1.0,...|
|  500|          [349, 311, 151, 4...|[null, null, null...|[349.0,311.0,151....|
| 1200|          [10, 387, 172, 20...|[null, null, null...|[10.0,387.0,172.0...|
| 1300|          [7, 91, 11, 1, 10...|[null, null, null...|[7.0,91.0,11.0,1....|
| 1500|          [8, 39, 24, 28, 1...|[null, null, null...|[8.0,39.0,24.0,28...|
| 1600|          [12, 1527, 3, 102...|[null, null, null...|[12.0,1527.0,3.0,...|
| 1700|          [383, 257, 14, 40...|[null, null, null...|[383.0,257.0,14.0...|
| 1900|          [1, 432, 24

In [39]:
# Add a row number column to the game matrix
windowSpec = Window.orderBy("appid")
vectors = vectors.withColumn("row_num", row_number().over(windowSpec))
vectors.show()

23/04/04 14:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[Stage 187:>                                                        (0 + 1) / 1]

+-----+------------------------------+--------------------+--------------------+-------+
|appid|collect_list(playtime_forever)|     padded_features|            features|row_num|
+-----+------------------------------+--------------------+--------------------+-------+
|  100|          [14, 209, 46, 133...|[null, null, null...|[14.0,209.0,46.0,...|      1|
|  300|          [5, 18, 19, 2373,...|[null, null, null...|[5.0,18.0,19.0,23...|      2|
|  400|          [223, 157, 1, 77,...|[null, null, null...|[223.0,157.0,1.0,...|      3|
|  500|          [349, 311, 151, 4...|[null, null, null...|[349.0,311.0,151....|      4|
| 1200|          [10, 387, 172, 20...|[null, null, null...|[10.0,387.0,172.0...|      5|
| 1300|          [7, 91, 11, 1, 10...|[null, null, null...|[7.0,91.0,11.0,1....|      6|
| 1500|          [8, 39, 24, 28, 1...|[null, null, null...|[8.0,39.0,24.0,28...|      7|
| 1600|          [12, 1527, 3, 102...|[null, null, null...|[12.0,1527.0,3.0,...|      8|
| 1700|          [383

                                                                                

In [40]:
pearson_matrix = Correlation.corr(vectors.orderBy("row_num"), "features", "pearson")
corr_array = pearson_matrix.head()[0].toArray()
print(corr_array)

23/04/04 14:16:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:16:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


                                                                                

[[ 1.          0.23675416  0.13102298 ... -0.00332081 -0.00332081
  -0.00332081]
 [ 0.23675416  1.          0.17286696 ... -0.00427187 -0.00427187
  -0.00427187]
 [ 0.13102298  0.17286696  1.         ... -0.00875646 -0.00875646
  -0.00875646]
 ...
 [-0.00332081 -0.00427187 -0.00875646 ...  1.          1.
   1.        ]
 [-0.00332081 -0.00427187 -0.00875646 ...  1.          1.
   1.        ]
 [-0.00332081 -0.00427187 -0.00875646 ...  1.          1.
   1.        ]]


## Per-Game 1-5 Rating Normalization
For each game, we calculate the mean and standard deviation. We then create buckets for each rating:
### Cut points
* Cut point 1: mean - std_dev*0.5 if > 0, else 0
* Cut point 2: mean
* Cut point 3: mean + std_dev*0.5
* Cut point 4: mean + std_dev
### Ratings
* Rating 1: 0 < x < cut point 1
* Rating 2: cut point 1 < x < cut point 2
* Rating 3: cut point 2 < x < cut point 3
* Rating 4: cut point 3 < x < cut point 4
* Rating 5: cut point 5 < x < inf

In [41]:
# Calculate the per-game mean and standard deviation of the playtime column
game_stats = training.filter(col("playtime_forever") > 0).groupBy("appid").agg(
    mean("playtime_forever").alias("game_mean_playtime"),
    stddev_pop("playtime_forever").alias("game_stddev_playtime")
)
training = training.join(game_stats, "appid")

training = training.withColumn("cut_point_1", when(col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5) > 0, col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5)).otherwise(0))
training = training.withColumn("cut_point_2", col("game_mean_playtime"))
training = training.withColumn("cut_point_3", col("game_mean_playtime") + (col("game_stddev_playtime") * 0.5))
training = training.withColumn("cut_point_4", col("game_mean_playtime") + col("game_stddev_playtime"))

training = training.withColumn(
    "ratings",
    when(col("playtime_forever") <= col("cut_point_1"), lit(1))
    .when((col("playtime_forever") > col("cut_point_1")) & (col("playtime_forever") <= col("cut_point_2")), lit(2))
    .when((col("playtime_forever") > col("cut_point_2")) & (col("playtime_forever") <= col("cut_point_3")), lit(3))
    .when((col("playtime_forever") > col("cut_point_3")) & (col("playtime_forever") <= col("cut_point_4")), lit(4))
    .otherwise(lit(5))
)

training.show()

+------+-----------------+----------------+------------------+--------------------+-----------------+-----------+-----------------+------------------+-------+
| appid|          steamid|playtime_forever|game_mean_playtime|game_stddev_playtime|      cut_point_1|cut_point_2|      cut_point_3|       cut_point_4|ratings|
+------+-----------------+----------------+------------------+--------------------+-----------------+-----------+-----------------+------------------+-------+
|263700|76561198071863000|              34|          109.0000|                75.0|             71.5|   109.0000|            146.5|             184.0|      1|
|263700|76561197968595000|             184|          109.0000|                75.0|             71.5|   109.0000|            146.5|             184.0|      4|
| 49900|76561198062015000|              23|           99.9355|  138.69656727242017|30.58721636378992|    99.9355|169.2837836362101|238.63206727242016|      1|
| 49900|76561198049923000|              32|   

In [42]:
# Get unique app IDs in the dataset
(training_ids, test_ids) = df.select("steamid").distinct().randomSplit([0.8, 0.2], seed=123)

## Rating Prediction

In [72]:
# Create a dictionary of appids and row numbers
all_row_num = vectors.select("appid", "row_num").rdd.collectAsMap()

def predict_rating(steamid, appid):
    # Get all the user's ratings
    user_ratings = training.filter(col("steamid") == steamid).select("appid", "ratings")

    # Get appid row number from the vectors dataframe
    appid_row_num = vectors.filter(col("appid") == appid).select("row_num").collect()[0][0]

    # Get a list of correlations between the appid and all other games
    corr = corr_array[appid_row_num]

    # Create a dictionary of appids and ratings for the user
    user_ratings_dict = user_ratings.rdd.collectAsMap()

    # Set the correlation to 0 for appids that the user has not rated
    for appid in all_row_num:
        if appid not in user_ratings_dict:
            corr[all_row_num[appid]] = 0

    # Make a list of tuples of (appid, correlation, rating)
    corr_list = []
    for appid in all_row_num:
        if appid in user_ratings_dict:
            row_num = all_row_num[appid]
            corr_list.append((appid, corr[row_num], user_ratings_dict[appid]))

    # Sort the list by correlation
    corr_list.sort(key=lambda x: x[1], reverse=True)

    # Get the top 10 most similar appids
    top_10 = corr_list[1:11]

    # Calculate the weighted average of the top 10 appids
    numerator = 0
    denominator = 0
    for appid, corr, rating in top_10:
        numerator += corr * rating
        denominator += corr
    if denominator != 0:
        return numerator / denominator
    else:
        return 0
    
print(predict_rating(76561198023872000, 500))

23/04/04 14:34:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:34:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:34:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:34:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:34:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 14:34:04 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/04/04 1