# Content-based filtering model

In [1]:
# imports
from pyspark.sql import SparkSession
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql.window import Window #for ranking
from pyspark.sql.functions import lit, mean, stddev_pop
from pyspark.sql.functions import collect_set, collect_list
from pyspark.sql.functions import struct
from pyspark.sql.functions import slice
from pyspark.sql.functions import col
from pyspark.sql.functions import desc
from pyspark.sql.functions import udf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import DecimalType, ArrayType, IntegerType, FloatType
import pyspark.sql.functions as F
from pyspark.sql.functions import avg, broadcast, when

## Overview
1. Load dataset.
2. Create item profiles for each game using genre, developer and publisher.
3. For each user-game pair, normalize playtime into a 1-5 rating scale.
4. Create user profiles by computing a weighted average of item profiles and ratings of previously played games.
5. Given a user, compute the cosine similarity between the user's profile and each (unplayed) game's item profile.
6. Recommend the user's 10 top similar games.

### Define cosine similarity and weighted average functions

In [2]:
# cosine similarity function
def cosine_similarity_udf(a, b):
    dot_product = sum([x * y for x, y in zip(a, b)])
    norm_a = sum([x**2 for x in a])**0.5
    norm_b = sum([x**2 for x in b])**0.5
    return dot_product / (norm_a * norm_b)


def weighted_avg_features(ratings, combined_vectors):
    if not ratings or not combined_vectors:
        return []

    weighted_sum = [0] * len(combined_vectors[0])
    total_weight = 0

    for rating, combined_vector in zip(ratings, combined_vectors):
        weight = float(rating)
        total_weight += weight
        weighted_sum = [ws + weight * f for ws, f in zip(weighted_sum, combined_vector)]

    if total_weight == 0:
        return weighted_sum

    return [ws / total_weight for ws in weighted_sum]

## Load Dataset

In [3]:
spark = SparkSession.builder.appName('ReadMySQL') \
.config("spark.driver.memory", "32g") \
.config("spark.sql.pivotMaxValues", "1000000") \
.config("spark.jars", "C:\\Program Files (x86)\\MySQL\\Connector J 8.0\\mysql-connector-j-8.0.32.jar") \
.getOrCreate()

# sql = "select * from 01_sampled_games_2v2 WHERE playtime_forever IS NOT NULL AND playtime_forever > 0"
sql = sql = """
SELECT p.steamid, p.appid, p.playtime_2weeks, p.playtime_forever, p.dateretrieved, g.genre, d.Developer, pb.Publisher
FROM 01_sampled_games_2v2 AS p
JOIN games_genres AS g ON p.appid = g.appid
JOIN games_developers AS d ON p.appid = d.appid
JOIN games_publishers AS pb ON p.appid = pb.appid
WHERE p.playtime_forever IS NOT NULL AND p.playtime_forever > 0
"""
database = "steam"
user = "root"
password = "root"
server = "127.0.0.1"
port = 3306
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}"
jdbc_driver = "com.mysql.cj.jdbc.Driver"

# Create a data frame by reading data from Oracle via JDBC
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("query", sql) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", jdbc_driver) \
    .load()

df = df.drop("playtime_2weeks", "dateretrieved")

In [4]:
# count number of rows in the dataframe
row_count = df.count()
# print the row count
print("Dataframe has", row_count, " rows.")
df.show()

Dataframe has 212589  rows.
+-----------------+-----+----------------+----------+--------------------+--------------------+
|          steamid|appid|playtime_forever|     genre|           Developer|           Publisher|
+-----------------+-----+----------------+----------+--------------------+--------------------+
|76561197960268000|  300|             109|    Action|               Valve|               Valve|
|76561197960268000| 1300|              94|    Action|Ritual Entertainment|Ritual Entertainment|
|76561197960268000| 2100|             110|    Action|      Arkane Studios|             Ubisoft|
|76561197960268000| 2100|             110|       RPG|      Arkane Studios|             Ubisoft|
|76561197960268000| 4000|             152|     Indie|   Facepunch Studios|               Valve|
|76561197960268000| 4000|             152|Simulation|   Facepunch Studios|               Valve|
|76561197960268000| 2600|              59|    Action|        Troika Games|          Activision|
|76561197960

## Create item profiles

In [5]:
# build the item profiles
# Group the data by 'appid' and collect the genres for each game into a list
games_genres_df = df.groupBy("appid").agg(collect_set("genre").alias("genres"))
# Group the data by 'appid' and collect the developers for each game into a list
games_developers_df = df.groupBy("appid").agg(collect_set("Developer").alias("developers"))
# Group the data by 'appid' and collect the publishers for each game into a list
games_publishers_df = df.groupBy("appid").agg(collect_set("Publisher").alias("publishers"))

# Create a list of unique genres
unique_genres = sorted(df.select("genre").distinct().rdd.flatMap(lambda x: x).collect())
# Create a list of unique developers
unique_developers = sorted(df.select("Developer").distinct().rdd.flatMap(lambda x: x).collect())
# Create a list of unique publishers
unique_publishers = sorted(df.select("Publisher").distinct().rdd.flatMap(lambda x: x).collect())


# Define a UDF to create a binary vector for each game's genres
@udf(returnType=ArrayType(IntegerType()))
def genre_vector(genres):
    return [1 if genre in genres else 0 for genre in unique_genres]

# Define a UDF to create a binary vector for each game's developer
@udf(returnType=ArrayType(IntegerType()))
def developer_vector(developers):
    return [1 if developer in developers else 0 for developer in unique_developers]

# Define a UDF to create a binary vector for each game's publisher
@udf(returnType=ArrayType(IntegerType()))
def publisher_vector(publishers):
    return [1 if publisher in publishers else 0 for publisher in unique_publishers]


# Add a new column 'genre_vector' to the DataFrame
# the genre vector will now have a 1 for each genre that the game belongs to
games_genres_df = games_genres_df.withColumn("genre_vector", genre_vector("genres"))
# Add a new column 'developer_vector' to the DataFrame
games_developers_df = games_developers_df.withColumn("developer_vector", developer_vector("developers"))
# Add a new column 'publisher_vector' to the DataFrame
games_publishers_df = games_publishers_df.withColumn("publisher_vector", publisher_vector("publishers"))

# games_genres_df.show(truncate=False)
# Join the main DataFrame with the games_genres_df on appid to include the genre_vector
df = df.join(broadcast(games_genres_df.select("appid", "genre_vector")), on="appid")
# Join the main DataFrame with the games_developers_df
df = df.join(broadcast(games_developers_df.select("appid", "developer_vector")), on="appid")
# Join the main DataFrame with the games_publishers_df
df = df.join(broadcast(games_publishers_df.select("appid", "publisher_vector")), on="appid")

df.show()

+-----+-----------------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|appid|          steamid|playtime_forever|     genre|           Developer|           Publisher|        genre_vector|    developer_vector|    publisher_vector|
+-----+-----------------+----------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  300|76561197960268000|             109|    Action|               Valve|               Valve|[1, 0, 0, 0, 0, 0...|[0, 0, 0, 0, 0, 0...|[0, 0, 0, 0, 0, 0...|
| 1300|76561197960268000|              94|    Action|Ritual Entertainment|Ritual Entertainment|[1, 0, 0, 0, 0, 0...|[0, 0, 0, 0, 0, 0...|[0, 0, 0, 0, 0, 0...|
| 2100|76561197960268000|             110|    Action|      Arkane Studios|             Ubisoft|[1, 0, 0, 0, 0, 0...|[0, 0, 0, 0, 0, 0...|[0, 0, 0, 0, 0, 0...|
| 2100|76561197960268000|             110|    

## Per-Game 1-5 Rating Normalization
For each game, we calculate the mean and standard deviation. We then create buckets for each rating:
### Scaling factor
The cut points are scaled on a per-user basis since some users are more casual gamers while others may spend a lot more time gaming. The scaling factor is calculated as follows:

(user_playtime_average)/(global_playtime_average)

### Cut points
* Cut point 1: (mean - std_dev*0.5) * scaling_factor if > 0, else 0
* Cut point 2: mean
* Cut point 3: (mean + std_dev*0.5) * scaling_factor
* Cut point 4: (mean + std_dev) * scaling_factor
### Ratings
* Rating 1: 0 < x < cut point 1
* Rating 2: cut point 1 < x < cut point 2
* Rating 3: cut point 2 < x < cut point 3
* Rating 4: cut point 3 < x < cut point 4
* Rating 5: cut point 5 < x < inf


In [6]:
# Calculate the per-game mean and standard deviation of the playtime column
game_stats = df.filter(col("playtime_forever") > 0).groupBy("appid").agg(
    mean("playtime_forever").alias("game_mean_playtime"),
    stddev_pop("playtime_forever").alias("game_stddev_playtime")
)

# Calculate the overall playtime average
overall_playtime_avg = df.filter(col("playtime_forever") > 0).agg(avg("playtime_forever")).collect()[0][0]

# Calculate the per-steamid playtime average
user_playtime_avg = df.filter(col("playtime_forever") > 0).groupBy("steamid").agg(avg("playtime_forever")).withColumnRenamed("avg(playtime_forever)", "user_playtime_avg")

# Join the user_playtime_avg dataframe with the main dataframe
df = df.join(user_playtime_avg, "steamid")

# Join the game_stats dataframe with the main dataframe
df = df.join(game_stats, "appid")

# Calculate the scaling factor based on the ratio of user playtime to overall playtime
df = df.withColumn("scaling_factor", col("user_playtime_avg") / overall_playtime_avg)

# Calculate the adjusted cut points
df = df.withColumn("cut_point_1", when(col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5 * col("scaling_factor")) > 0, col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5 * col("scaling_factor"))).otherwise(0))
df = df.withColumn("cut_point_2", col("game_mean_playtime") * col("scaling_factor"))
df = df.withColumn("cut_point_3", col("game_mean_playtime") + (col("game_stddev_playtime") * 0.5 * col("scaling_factor")))
df = df.withColumn("cut_point_4", col("game_mean_playtime") + col("game_stddev_playtime") * col("scaling_factor"))

# Assign ratings based on adjusted cut points
df = df.withColumn(
    "ratings",
    when(col("playtime_forever") <= col("cut_point_1"), lit(1))
    .when((col("playtime_forever") > col("cut_point_1")) & (col("playtime_forever") <= col("cut_point_2")), lit(2))
    .when((col("playtime_forever") > col("cut_point_2")) & (col("playtime_forever") <= col("cut_point_3")), lit(3))
    .when((col("playtime_forever") > col("cut_point_3")) & (col("playtime_forever") <= col("cut_point_4")), lit(4))
    .otherwise(lit(5))
)

# Update the user profile calculation to use the new ratings column
user_aggregated_data = df.groupBy("steamid").agg(
    collect_list("genre_vector").alias("genres_list"),
    collect_list("ratings").alias("ratings_list")
)
# Show df with new changes(without genres)
# Drop the genre_vector and genre columns from the DataFrame
df_without_info = df.drop("genre_vector", "genre", "developer_vector", "Developer", "publisher_vector", "Publisher")

# Show the DataFrame without the genre_vector and genre columns
df_without_info.show()
#df.show()

+------+-----------------+----------------+-----------------+------------------+--------------------+-------------------+-----------------+------------------+------------------+------------------+-------+
| appid|          steamid|playtime_forever|user_playtime_avg|game_mean_playtime|game_stddev_playtime|     scaling_factor|      cut_point_1|       cut_point_2|       cut_point_3|       cut_point_4|ratings|
+------+-----------------+----------------+-----------------+------------------+--------------------+-------------------+-----------------+------------------+------------------+------------------+-------+
|285900|76561198047500000|              14| 1096.20987654321|             102.6|   77.31649241914691| 0.6218335774098507|78.56100446311014| 63.80012504225068|126.63899553688985| 150.6779910737797|      1|
|285900|76561198047500000|              14| 1096.20987654321|             102.6|   77.31649241914691| 0.6218335774098507|78.56100446311014| 63.80012504225068|126.63899553688985| 15

## Create user profiles

In [7]:
# combine vectors

# Define a UDF to combine the vectors
@udf(returnType=ArrayType(IntegerType()))
def combined_vector(genre_vector, developer_vector, publisher_vector):
    return genre_vector + developer_vector + publisher_vector

# Add 'combined_vector' to the DataFrame
df = df.withColumn("combined_vector", combined_vector("genre_vector", "developer_vector", "publisher_vector"))

# Update the user profile calculation to use the new ratings column and include developer and publisher vectors
user_aggregated_data = df.groupBy("steamid").agg(
    collect_list("combined_vector").alias("combined_vectors_list"),  
    collect_list("ratings").alias("ratings_list")
)

# Define a UDF to calculate the weighted average of genre vectors
weighted_avg_features_udf = udf(weighted_avg_features, ArrayType(FloatType()))

# Calculate the user profile as the weighted average of rated item profiles (combined genre, developer, and publisher vectors)
user_profiles = user_aggregated_data.withColumn("user_profile", weighted_avg_features_udf("ratings_list", "combined_vectors_list"))

user_profiles.show()

+-----------------+---------------------+--------------------+--------------------+
|          steamid|combined_vectors_list|        ratings_list|        user_profile|
+-----------------+---------------------+--------------------+--------------------+
|76561197978981000| [[0, 0, 0, 0, 0, ...|[1, 1, 1, 1, 1, 5...|[0.5641026, 0.243...|
|76561197979519000| [[0, 0, 0, 0, 0, ...|[1, 1, 1, 1, 1, 1...|[0.6481481, 0.0, ...|
|76561197993659000| [[0, 0, 0, 0, 0, ...|        [1, 1, 1, 1]|[0.5, 0.0, 0.0, 0...|
|76561197996265000| [[1, 0, 0, 0, 0, ...|[1, 1, 1, 1, 1, 1...|[0.8828125, 0.179...|
|76561198001305000| [[1, 0, 0, 0, 0, ...|[4, 4, 1, 1, 1, 1...|[0.85507244, 0.05...|
|76561198011370000| [[1, 0, 0, 0, 0, ...|[3, 3, 5, 5, 1, 1...|[0.6875, 0.5, 0.0...|
|76561198027827000| [[1, 0, 0, 0, 0, ...|[3, 3, 1, 1, 1, 3...|[0.5, 0.27272728,...|
|76561198040946000| [[1, 0, 0, 0, 0, ...|  [3, 3, 3, 1, 1, 1]|[1.0, 0.0, 0.0, 0...|
|76561198020739000| [[0, 0, 0, 1, 0, ...|[1, 1, 1, 1, 1, 1...|[0.58064514, 0

In [8]:
# Select a steam id and create a list of games that user has already played

In [9]:
# Filter user_profiles for steam_id "76561197960421000"
user_profiles = user_profiles.filter(user_profiles.steamid == '76561198013540000')
# filter for games with playtime
played_games = df.filter(df.steamid == '76561198013540000').select("appid", "ratings").distinct()

## Compute the cosine similarity between user and item profiles

In [10]:
# prediction heuristics
# calculate cosine distance of an item and user profile

# 1. create udf for cosine similarity
cosine_similarity = udf(cosine_similarity_udf, FloatType())

# create dataframe with combined vectors
games_combined_vectors_df = df.select("appid", "combined_vector").distinct()

# exclude the games played from games_combined
#games_combined_vectors_df = games_combined_vectors_df.join(played_games, "appid", how="left_anti")

# limit the number of rows for testing
#games_combined_vectors_df = games_combined_vectors_df.limit(20)
user_profiles = user_profiles.limit(1)

# cross join the the combined vectors with the user_profiles
cross_joined = games_combined_vectors_df.crossJoin(user_profiles)

# calculate the cosine similarity between each item and user
recommendations = cross_joined.withColumn(
    "similarity", cosine_similarity("combined_vector", "user_profile")
)

# sort based on similarity score
sorted_recommendations = recommendations.sort(desc("similarity"))

sorted_recommendations.show(10)

+------+--------------------+-----------------+---------------------+--------------------+--------------------+----------+
| appid|     combined_vector|          steamid|combined_vectors_list|        ratings_list|        user_profile|similarity|
+------+--------------------+-----------------+---------------------+--------------------+--------------------+----------+
| 23600|[0, 0, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...| 0.6995842|
| 29800|[1, 1, 0, 1, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...|0.65932465|
|235400|[0, 1, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...|0.65115124|
|107200|[1, 0, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...|  0.641776|
| 65800|[1, 0, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...|  0.641776|
|233700|[0, 0, 0

## Recommend the user's 10 top similar games

In [11]:
# Create a window by steamid and similarity to get ranking
window_spec = Window.partitionBy("steamid").orderBy(desc("similarity"))

ranked_recommendations = sorted_recommendations.withColumn("rank", F.row_number().over(window_spec))

top_10_recommendations = ranked_recommendations.filter(ranked_recommendations.rank <= 10)
top_10_recommendations.show()

+------+--------------------+-----------------+---------------------+--------------------+--------------------+----------+----+
| appid|     combined_vector|          steamid|combined_vectors_list|        ratings_list|        user_profile|similarity|rank|
+------+--------------------+-----------------+---------------------+--------------------+--------------------+----------+----+
| 23600|[0, 0, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...| 0.6995842|   1|
| 29800|[1, 1, 0, 1, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...|0.65932465|   2|
|235400|[0, 1, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...|0.65115124|   3|
|107200|[1, 0, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0.09...|  0.641776|   4|
| 65800|[1, 0, 0, 0, 0, 0...|76561198013540000| [[0, 0, 0, 0, 0, ...|[5, 5, 1, 1, 1, 1...|[0.28181818, 0

## Evaluation for steamid 76561198013540000's top 10 games

In [13]:
user_top_rated_games = played_games.orderBy(desc('ratings'))
user_top_rated_games.show()

+------+-------+
| appid|ratings|
+------+-------+
| 70400|      5|
| 80200|      5|
| 21800|      5|
|202200|      5|
| 23600|      5|
| 40300|      4|
| 21600|      4|
|224600|      3|
|112100|      2|
|  9900|      2|
| 47700|      2|
|109600|      2|
|102500|      1|
|  9200|      1|
+------+-------+



In [15]:
print(f'Precision: {"{:.2%}".format(top_10_recommendations.join(user_top_rated_games, "appid").count() / 10)}')

Precision: 10.00%
