In [None]:
import os

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import Row, Window
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, stddev_pop
from pyspark.sql.functions import desc
from pyspark.sql.functions import udf, lit
from pyspark.sql.functions import when, col, lit
from pyspark.sql.types import DecimalType

# Create (or reuse) the Spark session used by the whole notebook.
spark = (
    SparkSession.builder.appName('ReadMariaDB')
    .config("spark.driver.memory", "32g")
    .config("spark.sql.pivotMaxValues", "1000000")
    .getOrCreate()
)

# Keep every row with a known playtime, including zero-playtime rows.
# To also exclude zero-playtime rows, append `AND playtime_forever > 0`.
sql = "select * from 01_sampled_games_2 WHERE playtime_forever IS NOT NULL"

# Connection settings can be overridden through environment variables so that
# real credentials never have to be stored in the notebook. The fallbacks are
# the original development values, so behavior is unchanged when nothing is set.
database = os.environ.get("STEAM_DB_NAME", "steam")
user = os.environ.get("STEAM_DB_USER", "root")
password = os.environ.get("STEAM_DB_PASSWORD", "example")
server = os.environ.get("STEAM_DB_HOST", "192.168.2.62")
port = int(os.environ.get("STEAM_DB_PORT", "3306"))
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
jdbc_driver = "org.mariadb.jdbc.Driver"

# Create a data frame by reading the query result from MariaDB via JDBC.
df = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("query", sql)
    .option("user", user)
    .option("password", password)
    .option("driver", jdbc_driver)
    .load()
)

## Z-Score Normalization

In [None]:
from pyspark.sql.functions import mean, stddev_pop, col

# Per-game mean and population standard deviation of playtime, computed only
# from rows with a strictly positive playtime.
game_stats = df.filter(col("playtime_forever") > 0).groupBy("appid").agg(
    mean("playtime_forever").alias("game_mean_playtime"),
    stddev_pop("playtime_forever").alias("game_stddev_playtime")
)

# Attach the per-game statistics to every row. The default inner join drops
# games that have no positive-playtime rows (they have no statistics).
df = df.join(game_stats, "appid")

# Z-score per game. A game whose positive playtimes are all identical has a
# standard deviation of 0; guard the division explicitly so the z-score is
# NULL in that case instead of relying on divide-by-zero semantics (which
# raise an error when ANSI SQL mode is enabled).
df = df.withColumn(
    "z_score",
    when(
        col("game_stddev_playtime") > 0,
        (col("playtime_forever") - col("game_mean_playtime")) / col("game_stddev_playtime"),
    ),
)

# Rows without a usable z-score (NULL/NaN) are removed.
df = df.dropna(subset=["z_score"])

## Per-Game 1-5 Rank Normalization

First, we calculate the mean and standard deviation of the playtime for each game. Then, we dynamically define the cut points for the ratings from 1 to 5 on a per-game basis.

In [None]:
# Handles for the per-game statistics columns used by every cut point.
mean_playtime = col("game_mean_playtime")
stddev_playtime = col("game_stddev_playtime")
half_stddev = stddev_playtime * 0.5
lower_bound = mean_playtime - half_stddev

df = (
    df
    # mean - 0.5 * stddev, replaced by 0 whenever that value is not positive
    .withColumn("cut_point_1", when(lower_bound > 0, lower_bound).otherwise(0))
    # the per-game mean itself
    .withColumn("cut_point_2", mean_playtime)
    # mean + 0.5 * stddev
    .withColumn("cut_point_3", mean_playtime + half_stddev)
    # mean + 1 * stddev
    .withColumn("cut_point_4", mean_playtime + stddev_playtime)
    # mean + 1.5 * stddev
    .withColumn("cut_point_5", mean_playtime + (stddev_playtime * 1.5))
)

Afterwards, we assign ratings for each playtime record based on the cut points.

In [None]:
# Build the rating expression incrementally: rank 1 is everything up to the
# first cut point, ranks 2-4 are the half-open intervals
# (cut_point_{k-1}, cut_point_k], and everything else is rank 5.
playtime = col("playtime_forever")
rank_expr = when(playtime <= col("cut_point_1"), lit(1))
for rank_value in (2, 3, 4):
    lower_cut = col(f"cut_point_{rank_value - 1}")
    upper_cut = col(f"cut_point_{rank_value}")
    rank_expr = rank_expr.when((playtime > lower_cut) & (playtime <= upper_cut), lit(rank_value))

df = df.withColumn("ranks", rank_expr.otherwise(lit(5)))

In [None]:
# Preview the first 20 rows (long values truncated) to sanity-check the
# derived columns.
df.show(n=20, truncate=True)

In [None]:
# ALS needs numeric user/item ids, so map the raw ids to index columns.
steamidIndexer = StringIndexer().setInputCol("steamid").setOutputCol("steamidIndex")
appidIndexer = StringIndexer().setInputCol("appid").setOutputCol("appidIndex")

# Index users first, then games; `model` ends up holding the fitted appid indexer.
df = steamidIndexer.fit(df).transform(df)
model = appidIndexer.fit(df)
df = model.transform(df)

In [None]:
# Grid-search the ALS latent-factor rank, selecting by RMSE on a held-out split.
selected_rating_col = "ranks"
(training, test) = df.randomSplit([0.8, 0.2], seed=123)
ranks = [5, 10, 15, 20, 40, 60, 80, 100, 150, 200]
min_rmse = float('inf')
best_rank = -1

# The evaluator does not depend on the rank, so build it once.
evaluator = RegressionEvaluator(metricName="rmse", labelCol=selected_rating_col, predictionCol="prediction")

for rank in ranks:
    # Explicit-feedback ALS: the label column holds 1-5 ratings and the model
    # is scored with RMSE against those same ratings. The implicit-feedback
    # variant predicts preference scores rather than ratings, which makes an
    # RMSE comparison against the 1-5 labels meaningless.
    als = ALS(maxIter=10, rank=rank, regParam=0.01, coldStartStrategy='drop', userCol='steamidIndex', itemCol='appidIndex', ratingCol=selected_rating_col, implicitPrefs=False)
    als.setSeed(123)
    model = als.fit(training)

    # coldStartStrategy='drop' removes test rows whose user or item was not
    # seen during training, so the metric is computed on valid predictions only.
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)

    print("Rank: " + str(rank) + " RMSE: " + str(rmse))

    # Track the rank with the lowest held-out RMSE.
    if rmse < min_rmse:
        min_rmse = rmse
        best_rank = rank

print("The best model was with rank " + str(best_rank))

In [None]:
# Report the lowest RMSE found by the rank search; `rmse` on its own only
# holds the value from the last rank evaluated in the loop.
print(min_rmse)