Imports

In [None]:
from pyspark.sql import SparkSession
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql.window import Window #for ranking
from pyspark.sql.functions import lit, mean, stddev_pop
from pyspark.sql.functions import collect_set, collect_list
from pyspark.sql.functions import struct
from pyspark.sql.functions import slice
from pyspark.sql.functions import col
from pyspark.sql.functions import desc
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import DecimalType, ArrayType, IntegerType, FloatType
import pyspark.sql.functions as F
from pyspark.sql.functions import avg, broadcast, when

Define cosine similarity and weighted avg functions

In [None]:
# cosine similarity function
def cosine_similarity_udf(a, b):
    dot_product = sum([x * y for x, y in zip(a, b)])
    norm_a = sum([x**2 for x in a])**0.5
    norm_b = sum([x**2 for x in b])**0.5
    return dot_product / (norm_a * norm_b)


# weighted average features function
def weighted_avg_features(ratings, features):
    if not ratings or not features:
        return []

    weighted_sum = [0] * len(features[0])
    total_weight = 0

    for rating, feature in zip(ratings, features):
        weight = float(rating)
        total_weight += weight
        weighted_sum = [ws + weight * f for ws, f in zip(weighted_sum, feature)]

    if total_weight == 0:
        return weighted_sum

    return [ws / total_weight for ws in weighted_sum]

Load Dataset

In [16]:
spark = SparkSession.builder.appName('ReadMySQL') \
.config("spark.driver.memory", "32g") \
.config("spark.sql.pivotMaxValues", "1000000") \
.config("spark.jars", "C:\\Program Files (x86)\\MySQL\\Connector J 8.0\\mysql-connector-j-8.0.32.jar") \
.getOrCreate()

# sql = "select * from 01_sampled_games_2v2 WHERE playtime_forever IS NOT NULL AND playtime_forever > 0"
sql = """
SELECT p.steamid, p.appid, p.playtime_2weeks, p.playtime_forever, p.dateretrieved, g.genre
FROM 01_sampled_games_2v2 AS p
JOIN games_genres AS g ON p.appid = g.appid
WHERE p.playtime_forever IS NOT NULL AND p.playtime_forever > 0
"""
database = "steam"
user = "root"
password = "root"
server = "127.0.0.1"
port = 3307
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}"
jdbc_driver = "com.mysql.cj.jdbc.Driver"

# Create a data frame by reading data from Oracle via JDBC
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("query", sql) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", jdbc_driver) \
    .load()

df = df.drop("playtime_2weeks", "dateretrieved")

In [17]:
# count number of rows in the dataframe
row_count = df.count()
# print the row count
print("Dataframe has", row_count, " rows.")
df.show()

Dataframe has 177826  rows.
+-----------------+-----+----------------+----------+
|          steamid|appid|playtime_forever|     genre|
+-----------------+-----+----------------+----------+
|76561197960268000|  300|             109|    Action|
|76561197960268000| 1300|              94|    Action|
|76561197960268000| 2100|             110|    Action|
|76561197960268000| 2100|             110|       RPG|
|76561197960268000| 4000|             152|     Indie|
|76561197960268000| 4000|             152|Simulation|
|76561197960268000| 2600|              59|    Action|
|76561197960268000| 9000|               2|    Action|
|76561197960268000| 2300|              40|    Action|
|76561197960268000| 2200|             210|    Action|
|76561197960268000| 4500|            1002|    Action|
|76561197960268000| 4500|            1002|       RPG|
|76561197960268000|  400|             380|    Action|
|76561197960268000|17300|            4312|    Action|
|76561197960268000|  500|             198|    Action|


Build item profiles

In [18]:
# build the item profiles
# Group the data by 'appid' and collect the genres for each game into a list
games_genres_df = df.groupBy("appid").agg(collect_set("genre").alias("genres"))

# Create a list of unique genres
unique_genres = sorted(df.select("genre").distinct().rdd.flatMap(lambda x: x).collect())

# Define a UDF to create a binary vector for each game's genres
@udf(returnType=ArrayType(IntegerType()))
def genre_vector(genres):
    return [1 if genre in genres else 0 for genre in unique_genres]
# Add a new column 'genre_vector' to the DataFrame
# the genre vector will now have a 1 for each genre that the game belongs to
games_genres_df = games_genres_df.withColumn("genre_vector", genre_vector("genres"))

# games_genres_df.show(truncate=False)
# Join the main DataFrame with the games_genres_df on appid to include the genre_vector
df = df.join(broadcast(games_genres_df.select("appid", "genre_vector")), on="appid")
df.show()

+-----+-----------------+----------------+----------+--------------------+
|appid|          steamid|playtime_forever|     genre|        genre_vector|
+-----+-----------------+----------------+----------+--------------------+
|  300|76561197960268000|             109|    Action|[1, 0, 0, 0, 0, 0...|
| 1300|76561197960268000|              94|    Action|[1, 0, 0, 0, 0, 0...|
| 2100|76561197960268000|             110|    Action|[1, 0, 0, 0, 0, 0...|
| 2100|76561197960268000|             110|       RPG|[1, 0, 0, 0, 0, 0...|
| 4000|76561197960268000|             152|     Indie|[0, 0, 0, 0, 0, 0...|
| 4000|76561197960268000|             152|Simulation|[0, 0, 0, 0, 0, 0...|
| 2600|76561197960268000|              59|    Action|[1, 0, 0, 0, 0, 0...|
| 9000|76561197960268000|               2|    Action|[1, 0, 0, 0, 0, 0...|
| 2300|76561197960268000|              40|    Action|[1, 0, 0, 0, 0, 0...|
| 2200|76561197960268000|             210|    Action|[1, 0, 0, 0, 0, 0...|
| 4500|76561197960268000|

Build the user profile

In [19]:
# Calculate the per-game mean and standard deviation of the playtime column
game_stats = df.filter(col("playtime_forever") > 0).groupBy("appid").agg(
    mean("playtime_forever").alias("game_mean_playtime"),
    stddev_pop("playtime_forever").alias("game_stddev_playtime")
)

# Rename the columns in game_stats dataframe
game_stats = game_stats.withColumnRenamed("game_mean_playtime", "game_mean_playtime_new")
game_stats = game_stats.withColumnRenamed("game_stddev_playtime", "game_stddev_playtime_new")

# Join the game_stats dataframe with the main dataframe
df = df.join(game_stats, "appid")

# Drop the ambiguous columns after the join operation
df = df.drop("game_mean_playtime", "game_stddev_playtime")

# Calculate the cut points and assign ratings
df = df.withColumn("cut_point_1", when(col("game_mean_playtime_new") - (col("game_stddev_playtime_new") * 0.5) > 0, col("game_mean_playtime_new") - (col("game_stddev_playtime_new") * 0.5)).otherwise(0))
df = df.withColumn("cut_point_2", col("game_mean_playtime_new"))
df = df.withColumn("cut_point_3", col("game_mean_playtime_new") + (col("game_stddev_playtime_new") * 0.5))
df = df.withColumn("cut_point_4", col("game_mean_playtime_new") + col("game_stddev_playtime_new"))

df = df.withColumn(
    "ratings",
    when(col("playtime_forever") <= col("cut_point_1"), lit(1))
    .when((col("playtime_forever") > col("cut_point_1")) & (col("playtime_forever") <= col("cut_point_2")), lit(2))
    .when((col("playtime_forever") > col("cut_point_2")) & (col("playtime_forever") <= col("cut_point_3")), lit(3))
    .when((col("playtime_forever") > col("cut_point_3")) & (col("playtime_forever") <= col("cut_point_4")), lit(4))
    .otherwise(lit(5))
)

# Update the user profile calculation to use the new ratings column
user_aggregated_data = df.groupBy("steamid").agg(
    collect_list("genre_vector").alias("genres_list"),
    collect_list("ratings").alias("ratings_list")
)
df.show()

+-----+-----------------+----------------+----------+--------------------+----------------------+------------------------+------------------+------------------+------------------+------------------+-------+
|appid|          steamid|playtime_forever|     genre|        genre_vector|game_mean_playtime_new|game_stddev_playtime_new|       cut_point_1|       cut_point_2|       cut_point_3|       cut_point_4|ratings|
+-----+-----------------+----------------+----------+--------------------+----------------------+------------------------+------------------+------------------+------------------+------------------+-------+
|  300|76561197960268000|             109|    Action|[1, 0, 0, 0, 0, 0...|    2007.2104545454545|       8925.191214339735|               0.0|2007.2104545454545| 6469.806061715322| 10932.40166888519|      2|
| 1300|76561197960268000|              94|    Action|[1, 0, 0, 0, 0, 0...|    214.26760563380282|       331.9995739750567| 48.26781864627446|214.26760563380282|380.26739262

Build user profile off this

In [23]:
# Update the user profile calculation to use the new ratings column
user_aggregated_data = df.groupBy("steamid").agg(
    collect_list("genre_vector").alias("genres_list"),
    collect_list("ratings").alias("ratings_list")
)

# Define a UDF to calculate the weighted average of genre vectors
weighted_avg_features_udf = udf(weighted_avg_features, ArrayType(FloatType()))

# Calculate the user profile as the weighted average of rated item profiles (genre vectors)
user_profiles = user_aggregated_data.withColumn("user_profile", weighted_avg_features_udf("ratings_list", "genres_list"))

user_profiles.show()

+-----------------+--------------------+--------------------+--------------------+
|          steamid|         genres_list|        ratings_list|        user_profile|
+-----------------+--------------------+--------------------+--------------------+
|76561197960271000|[[1, 0, 0, 0, 0, ...|                 [2]|[1.0, 0.0, 0.0, 0...|
|76561197960292000|[[1, 0, 0, 0, 0, ...|           [2, 3, 2]|[0.5714286, 0.0, ...|
|76561197960306000|[[1, 0, 0, 0, 0, ...|[2, 2, 2, 2, 2, 2...|[0.6454918, 0.252...|
|76561197960314000|[[1, 0, 0, 0, 0, ...|              [2, 2]|[1.0, 0.0, 0.0, 0...|
|76561197960334000|[[1, 0, 0, 0, 0, ...|[4, 2, 2, 2, 2, 1...|[1.0, 0.2, 0.0, 0...|
|76561197960338000|[[1, 0, 0, 0, 0, ...|                 [2]|[1.0, 0.0, 0.0, 0...|
|76561197960342000|[[1, 0, 0, 0, 0, ...|[2, 2, 2, 2, 2, 2...|[1.0, 0.5555556, ...|
|76561197960346000|[[1, 0, 0, 0, 0, ...|                 [2]|[1.0, 0.0, 0.0, 0...|
|76561197960394000|[[0, 0, 0, 0, 0, ...|              [2, 2]|[0.0, 0.0, 0.0, 0...|
|765

Cosine similarity

In [24]:
# prediction heuristics
# calculate cosine distance of an item and user profile

# 1. create udf for cosine similarity
cosine_similarity = udf(cosine_similarity_udf, FloatType())
# cross join the game_genres_df with the user_profiles
cross_joined = games_genres_df.crossJoin(user_profiles)

# calculate the cosine similarity between each item and user
recommendations = cross_joined.withColumn(
    "similarity", cosine_similarity("genre_vector", "user_profile")
)

# sort based on similarity score
sorted_recommendations = recommendations.sort(desc("similarity"))

sorted_recommendations.show(10)

+-----+--------+--------------------+-----------------+--------------------+------------+--------------------+----------+
|appid|  genres|        genre_vector|          steamid|         genres_list|ratings_list|        user_profile|similarity|
+-----+--------+--------------------+-----------------+--------------------+------------+--------------------+----------+
| 2200|[Action]|[1, 0, 0, 0, 0, 0...|76561197960271000|[[1, 0, 0, 0, 0, ...|         [2]|[1.0, 0.0, 0.0, 0...|       1.0|
| 6000|[Action]|[1, 0, 0, 0, 0, 0...|76561197960271000|[[1, 0, 0, 0, 0, ...|         [2]|[1.0, 0.0, 0.0, 0...|       1.0|
| 2300|[Action]|[1, 0, 0, 0, 0, 0...|76561197960271000|[[1, 0, 0, 0, 0, ...|         [2]|[1.0, 0.0, 0.0, 0...|       1.0|
|  300|[Action]|[1, 0, 0, 0, 0, 0...|76561197960271000|[[1, 0, 0, 0, 0, ...|         [2]|[1.0, 0.0, 0.0, 0...|       1.0|
| 2500|[Action]|[1, 0, 0, 0, 0, 0...|76561197960271000|[[1, 0, 0, 0, 0, ...|         [2]|[1.0, 0.0, 0.0, 0...|       1.0|
| 2600|[Action]|[1, 0, 0

recommender

In [25]:
# Create a window by steamid and similarity to get ranking
window_spec = Window.partitionBy("steamid").orderBy(desc("similarity"))

ranked_recommendations = sorted_recommendations.withColumn("rank", F.row_number().over(window_spec))

top_10_recommendations = ranked_recommendations.filter(ranked_recommendations.rank <= 10)
top_10_recommendations.show(10)

+------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+----------+----+
| appid|              genres|        genre_vector|          steamid|         genres_list|        ratings_list|        user_profile|similarity|rank|
+------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+----------+----+
|  2400|[RPG, Action, Indie]|[1, 0, 0, 0, 0, 0...|76561197960334000|[[1, 0, 0, 0, 0, ...|[4, 2, 2, 2, 2, 1...|[1.0, 0.2, 0.0, 0...|0.92450035|   1|
| 33100|[RPG, Action, Indie]|[1, 0, 0, 0, 0, 0...|76561197960334000|[[1, 0, 0, 0, 0, ...|[4, 2, 2, 2, 2, 1...|[1.0, 0.2, 0.0, 0...|0.92450035|   2|
| 37100|[RPG, Action, Indie]|[1, 0, 0, 0, 0, 0...|76561197960334000|[[1, 0, 0, 0, 0, ...|[4, 2, 2, 2, 2, 1...|[1.0, 0.2, 0.0, 0...|0.92450035|   3|
| 91700|[RPG, Action, Indie]|[1, 0, 0, 0, 0, 0...|76561197960334000|[[1, 0, 0, 0, 0, ...|[4, 2, 2, 2, 2, 1...|[1