# Item-item collaborative filtering

In [198]:
from pyspark.sql.functions import mean, row_number, when, udf, stddev_pop, col, lit, collect_list, avg, stddev, collect_list, udf, size
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, DoubleType, FloatType
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors, VectorUDT
import math

## Overview
1. Load dataset
2. Split the dataset
3. Normalize the dataset playtime into a 1-5 rating scale
4. Compute Pearson correlation matrix
5. Rating prediction by doing a weighted average of the k most similar items that the user has played before

## Load Dataset
The dataset is loaded from MariaDB into a dataframe.

In [199]:
spark = SparkSession.builder.appName('ReadMariaDB') \
.config("spark.driver.memory", "32g") \
.config("spark.sql.pivotMaxValues", "1000000") \
.getOrCreate()

spark.sparkContext.setLogLevel("ERROR")


sql = "select * from 01_sampled_games_2v2 WHERE playtime_forever IS NOT NULL AND playtime_forever > 0"
database = "steam"
user = "root"
password = "example"
server = "127.0.0.1"
port = 3306
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
jdbc_driver = "org.mariadb.jdbc.Driver"

# Create a data frame by reading data from Oracle via JDBC
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("query", sql) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", jdbc_driver) \
    .load()

df = df.drop("playtime_2weeks", "dateretrieved")

In [200]:
# Count the number of rows in the DataFrame
row_count = df.count()

# Print the row count
print("The DataFrame has", row_count, "rows.")

df.show()

The DataFrame has 408349 rows.
+-----------------+-----+----------------+
|          steamid|appid|playtime_forever|
+-----------------+-----+----------------+
|76561197960268000|   10|               8|
|76561197960268000|   20|               1|
|76561197960268000|   50|            1719|
|76561197960268000|   60|               1|
|76561197960268000|   70|            1981|
|76561197960268000|  130|             175|
|76561197960268000|  220|            3873|
|76561197960268000|  240|             221|
|76561197960268000|  320|               1|
|76561197960268000|  280|            1242|
|76561197960268000|  300|             109|
|76561197960268000|  360|               3|
|76561197960268000| 1300|              94|
|76561197960268000| 1313|             213|
|76561197960268000|  380|             944|
|76561197960268000| 2100|             110|
|76561197960268000| 4000|             152|
|76561197960268000| 3970|             586|
|76561197960268000| 2600|              59|
|76561197960268000| 691

## Split the dataset

In [201]:
# Randomly split the data into 70% training and 30% test data
training, test = df.randomSplit([0.7, 0.3], seed=1234)

## Per-Game 1-5 Rating Normalization
For each game, we calculate the mean and standard deviation. We then create buckets for each rating:
### Scaling factor
The cut points are scaled on a per-user basis since some users are more casual gamers while others may spend a lot more time gaming. The scaling factor is calculated as follows:

(user_playtime_average)/(global_playtime_average)

### Cut points
* Cut point 1: (mean - std_dev*0.5) * scaling_factor if > 0, else 0
* Cut point 2: mean
* Cut point 3: (mean + std_dev*0.5) * scaling_factor
* Cut point 4: (mean + std_dev) * scaling_factor
### Ratings
* Rating 1: 0 < x < cut point 1
* Rating 2: cut point 1 < x < cut point 2
* Rating 3: cut point 2 < x < cut point 3
* Rating 4: cut point 3 < x < cut point 4
* Rating 5: cut point 5 < x < inf

In [202]:
def normalize_dataset(df):
    # Calculate the per-game mean and standard deviation of the playtime column
    game_stats = df.filter(col("playtime_forever") > 0).groupBy("appid").agg(
        mean("playtime_forever").alias("game_mean_playtime"),
        stddev_pop("playtime_forever").alias("game_stddev_playtime")
    )

    # Calculate the overall playtime average
    overall_playtime_avg = df.filter(col("playtime_forever") > 0).agg(avg("playtime_forever")).collect()[0][0]

    # Calculate the per-steamid playtime average
    user_playtime_avg = df.filter(col("playtime_forever") > 0).groupBy("steamid").agg(avg("playtime_forever")).withColumnRenamed("avg(playtime_forever)", "user_playtime_avg")

    # Join the user_playtime_avg dataframe with the main dataframe
    df = df.join(user_playtime_avg, "steamid")

    # Join the game_stats dataframe with the main dataframe
    df = df.join(game_stats, "appid")

    # Calculate the scaling factor based on the ratio of user playtime to overall playtime
    df = df.withColumn("scaling_factor", col("user_playtime_avg") / overall_playtime_avg)

    # Calculate the adjusted cut points
    df = df.withColumn("cut_point_1", when((col("game_mean_playtime") - col("game_stddev_playtime") * 0.5) * col("scaling_factor") > 0, col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5 * col("scaling_factor"))).otherwise(0))
    df = df.withColumn("cut_point_2", col("game_mean_playtime") * col("scaling_factor"))
    df = df.withColumn("cut_point_3", (col("game_mean_playtime") + col("game_stddev_playtime") * 0.5) * col("scaling_factor"))
    df = df.withColumn("cut_point_4", (col("game_mean_playtime") + col("game_stddev_playtime")) * col("scaling_factor"))

    # Assign ratings based on adjusted cut points
    df = df.withColumn(
        "ratings",
        when(col("playtime_forever") <= col("cut_point_1"), lit(1))
        .when((col("playtime_forever") > col("cut_point_1")) & (col("playtime_forever") <= col("cut_point_2")), lit(2))
        .when((col("playtime_forever") > col("cut_point_2")) & (col("playtime_forever") <= col("cut_point_3")), lit(3))
        .when((col("playtime_forever") > col("cut_point_3")) & (col("playtime_forever") <= col("cut_point_4")), lit(4))
        .otherwise(lit(5))
    )

    # Drop the columns that are no longer needed
    df = df.drop("playtime_forever", "game_mean_playtime", "game_stddev_playtime", "user_playtime_avg", "scaling_factor", "cut_point_1", "cut_point_2", "cut_point_3", "cut_point_4")
    return df

# Normalize the training dataset
training = normalize_dataset(training)

training.show()

ERROR:root:KeyboardInterrupt while sending command.][Stage 2995:> (0 + 1) / 1]  
Traceback (most recent call last):
  File "/home/kevin/Documents/Repositories/steam-recommender/venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/kevin/Documents/Repositories/steam-recommender/venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
                                                                                

KeyboardInterrupt: 

## Pearson Correlation Matrix
Since the dataset contains at most ~4500 games, we can expect a 4500^2=81,000,000 sized matrix. Each float entry takes 4 bytes in memory. Therefore, the pearson correlation matrix would take up 324 MB of memory.

Since the memory used is relatively small, we will pre-compute the person correlation matrix and store it for use later in the algorithm.

To start off, we create a list of features for each game (appid) using the playtime_forever of all users who have played the game.

In [None]:
# Compute the maximum length of the lists of ratings values
max_len = training.filter("ratings IS NOT NULL") \
    .groupBy('appid').agg(size(collect_list('ratings')).alias('num_playtimes')) \
    .agg({'num_playtimes': 'max'}).collect()[0][0]

# Define a UDF to pad lists with zeros
pad_zeros = udf(lambda x: x + [0.0]*(max_len-len(x)), ArrayType(DoubleType()))

# Create playtime vectors for each game
list_to_dense = udf(lambda l: Vectors.dense(l), VectorUDT())
vectors = training.filter("ratings IS NOT NULL")\
    .groupBy('appid').agg(collect_list('ratings'))\
        .withColumn('padded_features', pad_zeros('collect_list(ratings)')) \
        .withColumn('features', list_to_dense('padded_features'))

                                                                                

We add a row number column to the dataframe so that we can match the appid with the row number in the correlation matrix.

In [None]:
# Add a row number column to the game matrix
windowSpec = Window.orderBy("appid")
vectors = vectors.withColumn("row_num", row_number().over(windowSpec))
vectors.show()

[Stage 2831:>                                                       (0 + 1) / 1]

+-----+---------------------+--------------------+--------------------+-------+
|appid|collect_list(ratings)|     padded_features|            features|row_num|
+-----+---------------------+--------------------+--------------------+-------+
|   10| [2, 2, 3, 2, 2, 2...|[null, null, null...|[2.0,2.0,3.0,2.0,...|      1|
|   20| [2, 3, 2, 2, 2, 3...|[null, null, null...|[2.0,3.0,2.0,2.0,...|      2|
|   30| [4, 2, 2, 3, 2, 2...|[null, null, null...|[4.0,2.0,2.0,3.0,...|      3|
|   40| [2, 2, 2, 2, 2, 5...|[null, null, null...|[2.0,2.0,2.0,2.0,...|      4|
|   50| [5, 4, 2, 2, 4, 2...|[null, null, null...|[5.0,4.0,2.0,2.0,...|      5|
|   60| [2, 2, 2, 3, 4, 2...|[null, null, null...|[2.0,2.0,2.0,3.0,...|      6|
|   70| [2, 5, 2, 5, 3, 2...|[null, null, null...|[2.0,5.0,2.0,5.0,...|      7|
|   80| [3, 2, 2, 3, 2, 2...|[null, null, null...|[3.0,2.0,2.0,3.0,...|      8|
|   92| [2, 5, 1, 1, 1, 5...|[null, null, null...|[2.0,5.0,1.0,1.0,...|      9|
|  100| [5, 2, 2, 2, 4, 2...|[null, null

                                                                                

Using the correlation method provided by Pyspark, we compute the correlation matrix.

In [None]:
pearson_matrix = Correlation.corr(vectors.orderBy("row_num"), "features", "pearson")
corr_array = pearson_matrix.head()[0].toArray()
print(corr_array)

                                                                                

[[ 1.00000000e+00 -5.28235375e-03  4.51223406e-02 ... -2.59413205e-03
  -2.59413205e-03 -2.59413205e-03]
 [-5.28235375e-03  1.00000000e+00  1.34916033e-01 ... -1.68386308e-03
  -1.68386308e-03 -1.68386308e-03]
 [ 4.51223406e-02  1.34916033e-01  1.00000000e+00 ...  7.59441798e-04
   7.59441798e-04  7.59441798e-04]
 ...
 [-2.59413205e-03 -1.68386308e-03  7.59441798e-04 ...  1.00000000e+00
   1.00000000e+00  1.00000000e+00]
 [-2.59413205e-03 -1.68386308e-03  7.59441798e-04 ...  1.00000000e+00
   1.00000000e+00  1.00000000e+00]
 [-2.59413205e-03 -1.68386308e-03  7.59441798e-04 ...  1.00000000e+00
   1.00000000e+00  1.00000000e+00]]


In [None]:
# Create a dictionary of appids and row numbers
all_row_num = vectors.select("appid", "row_num").rdd.collectAsMap()

print(all_row_num)

# Create a dictionary of appids cast to Integer and row numbers
# all_row_num = vectors.select("appid", "row_num").rdd.map(lambda x: (int(x[0]), x[1])).collectAsMap()

                                                                                

{Decimal('10'): 1, Decimal('20'): 2, Decimal('30'): 3, Decimal('40'): 4, Decimal('50'): 5, Decimal('60'): 6, Decimal('70'): 7, Decimal('80'): 8, Decimal('92'): 9, Decimal('100'): 10, Decimal('130'): 11, Decimal('211'): 12, Decimal('220'): 13, Decimal('240'): 14, Decimal('280'): 15, Decimal('300'): 16, Decimal('320'): 17, Decimal('340'): 18, Decimal('360'): 19, Decimal('380'): 20, Decimal('400'): 21, Decimal('420'): 22, Decimal('440'): 23, Decimal('500'): 24, Decimal('550'): 25, Decimal('570'): 26, Decimal('620'): 27, Decimal('630'): 28, Decimal('730'): 29, Decimal('1002'): 30, Decimal('1200'): 31, Decimal('1210'): 32, Decimal('1230'): 33, Decimal('1250'): 34, Decimal('1280'): 35, Decimal('1300'): 36, Decimal('1309'): 37, Decimal('1313'): 38, Decimal('1320'): 39, Decimal('1500'): 40, Decimal('1510'): 41, Decimal('1520'): 42, Decimal('1525'): 43, Decimal('1530'): 44, Decimal('1600'): 45, Decimal('1610'): 46, Decimal('1630'): 47, Decimal('1640'): 48, Decimal('1670'): 49, Decimal('1690'): 

## Rating Prediction


For a given user (steamid) and game (appid), we can compute the predicted rating of the game (appid) for the user (steamid) using the weighted average of the k most similar games to the game (appid) that the user (steamid) has played.

This is the same item-item collaborative filtering formula that was shown in class.

In [None]:
def predict_rating(appid, user_ratings_dict, k=3):

    # Get appid row number from the vectors dataframe
    if appid in all_row_num:
        appid_row_num = all_row_num[appid] - 1
    else:
        return 0.0

    # Get a list of correlations between the appid and all other games
    corr = corr_array[appid_row_num]

    # Create a dict of appids and correlations
    corr_dict = {}
    for appid in all_row_num:
        if appid in user_ratings_dict:
            corr_dict[appid] = corr[all_row_num[appid]]

    # Make a list of tuples of (appid, correlation, rating)
    corr_list = []
    for appid, corr in corr_dict.items():
        # Only add the appid to the list if the user has rated it
        if appid in user_ratings_dict:
            corr_list.append((appid, corr, user_ratings_dict[appid]))

    # Sort the list by correlation
    corr_list.sort(key=lambda x: x[1], reverse=True)

    # Get the top 10 most similar appids
    top_k = corr_list[1:k+1]

    # Calculate the weighted average of the top 10 appids
    numerator = 0
    denominator = 0
    for appid, corr, rating in top_k:
        numerator += corr * rating
        denominator += corr
    if denominator != 0:
        return float(numerator / denominator)
    else:
        return 0.0

def predict_single_rating(steamid, appid, training_df):
    # Get all the user's ratings
    user_ratings_dict = training_df.filter(col("steamid") == steamid).select("appid", "ratings").rdd.collectAsMap()

    # Predict the user's rating for the appid
    return predict_rating(appid, user_ratings_dict)

In [None]:
# Test the function for a given user and game
test_steamid = 76561198023872000
test_appid = 500
print(f'Predicted rating for steamid [{test_steamid}] and appid [{test_appid}]: {predict_single_rating(test_steamid, test_appid, training)}')

[Stage 2876:> (0 + 1) / 1][Stage 2877:> (0 + 1) / 1][Stage 2878:> (0 + 1) / 1]

Predicted rating for steamid [76561198023872000] and appid [500]: 3.333620801884141


                                                                                

## Recommender



We run in parallel the rating prediction for each game that the user has not played before. We then sort the games by the predicted rating and return the top k games.

In [None]:
# Get all unique appids
all_appids = vectors.select("appid").rdd.flatMap(lambda x: x).collect()

                                                                                

In [None]:
def predict_user_ratings(steamid, recommendation_count=3):
    # Get all_appids that the user has not rated
    user_ratings_dict = training.filter(col("steamid") == steamid).select("appid", "ratings").rdd.collectAsMap()
    not_rated = [appid for appid in all_appids if appid not in user_ratings_dict]
    not_rated_rdd = spark.sparkContext.parallelize(not_rated)

    # Run predict_rating for each appid and output a list of tuples of (appid, predicted rating)
    predictions = not_rated_rdd.map(lambda appid: (appid, predict_rating(appid, user_ratings_dict))).collect()

    # Sort the list by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Return top 3 appids outside of the tuple
    return [appid for appid, rating in predictions[:recommendation_count]]

print(f'Predicted top 3 games for steamid [{test_steamid}]: {predict_user_ratings(test_steamid)}')



Predicted top 3 games for steamid [76561198023872000]: [Decimal('235340'), Decimal('233840'), Decimal('229580')]


                                                                                

## RMSE

In [None]:
# Normalize the test dataset
test_normalized = normalize_dataset(test)
test_normalized.show()

[Stage 2910:> (0 + 1) / 1][Stage 2911:> (0 + 1) / 1][Stage 2912:> (0 + 1) / 1]  

+------+-----------------+-------+
| appid|          steamid|ratings|
+------+-----------------+-------+
|    80|76561197960389000|      3|
|   240|76561197960389000|      2|
|   240|76561197961433000|      2|
|   730|76561197961433000|      2|
| 10180|76561197961433000|      1|
|228200|76561197961433000|      2|
|   240|76561197962565000|      2|
|   300|76561197962565000|      2|
|  8930|76561197962565000|      2|
| 10500|76561197962565000|      3|
|202970|76561197962565000|      2|
|212910|76561197962565000|      2|
|    30|76561197967974000|      4|
|    50|76561197967974000|      2|
|   440|76561197970517000|      3|
|   500|76561197970517000|      2|
|   620|76561197970517000|      2|
|   320|76561197971279000|      3|
|  1250|76561197975983000|      3|
| 91310|76561197975983000|      5|
+------+-----------------+-------+
only showing top 20 rows



                                                                                

In [None]:
# Create dataframe with only one row per steamid, a list of appids and ratings from the training set and a list of appids and ratings from the test set
steamid_ratings = training.groupBy("steamid").agg(collect_list("appid").alias("train_appids"), collect_list("ratings").alias("train_ratings"))
steamid_ratings = steamid_ratings.join(test_normalized.groupBy("steamid").agg(collect_list("appid").alias("test_appids"), collect_list("ratings").alias("test_ratings")), on="steamid", how="inner")


def predict_target_ratings(train_appids, train_ratings, test_appids):
    # If the test_appids list is empty, return an empty list
    if len(test_appids) == 0:
        return []
    # Get dictionary of appids and ratings from train_appids and train_ratings
    train_ratings_dict = dict(zip(train_appids, train_ratings))

    # For each appid in test_appids, do predict_rating and return a list of tuples of (appid, predicted rating)
    predictions = [predict_rating(appid, train_ratings_dict) for appid in test_appids]
    return predictions

# Create a udf for predict_target_ratings
predict_target_ratings_udf = udf(predict_target_ratings, ArrayType(FloatType()))

# For each row in the steamid_ratings dataframe, run predict_target_ratings and add the predictions list into a new column
steamid_ratings = steamid_ratings.withColumn("predictions", predict_target_ratings_udf("train_appids", "train_ratings", "test_appids"))

# For each column in the predictions column, calculate the rmse
def calculate_rmse(predictions, test_ratings):
    # If the predictions list is empty, return 0.0
    if len(predictions) == 0:
        return 0.0
    # Calculate the rmse for the predictions list
    rmse = 0
    prediction_num = 0
    for prediction, test_rating in zip(predictions, test_ratings):
        # Check if prediction is not 0.0
        if prediction != 0.0:
            rmse += (prediction - test_rating) ** 2
            prediction_num += 1
    # Do square root
    rmse = math.sqrt(rmse / len(predictions))
    return rmse

# Create a udf for calculate_rmse
calculate_rmse_udf = udf(calculate_rmse, FloatType())

# For each row in the steamid_ratings dataframe, run calculate_rmse and add the rmse into a new column
steamid_ratings = steamid_ratings.withColumn("rmse", calculate_rmse_udf("predictions", "test_ratings"))

# Remove all rows where rmse is 0.0
steamid_ratings = steamid_ratings.filter(col("rmse") != 0.0)

# Get the rmse for the table
rmse = steamid_ratings.select(avg("rmse")).collect()[0][0]
print(f'RMSE: {rmse}')

# Calculate the standard deviation of the ratings
ratings_std = test_normalized.select(stddev("ratings")).collect()[0][0]
print(f'Standard deviation of ratings: {ratings_std}')

                                                                                

+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          steamid|        train_appids|       train_ratings|         test_appids|        test_ratings|         predictions|
+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|76561197960271000|       [10, 30, 400]|           [2, 3, 4]|               [440]|                 [2]|         [2.7207913]|
|76561197960291000|                [30]|                 [3]|            [10, 20]|              [2, 4]|          [0.0, 0.0]|
|76561197960294000|                [10]|                 [2]|               [240]|                 [2]|               [0.0]|
|76561197960334000|[70, 80, 220, 240...|[3, 3, 1, 2, 5, 4...|[400, 730, 13240,...|        [1, 2, 1, 1]|[3.5767725, 2.340...|
|76561197960338000| [50, 380, 400, 420]|        [4, 5, 5, 1]|      [10, 240, 630]|           [2, 2, 5]|[4.767692, 3.2707...|


                                                                                

+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|          steamid|        train_appids|       train_ratings|         test_appids|        test_ratings|         predictions|      rmse|
+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|76561197960271000|       [10, 30, 400]|           [2, 3, 4]|               [440]|                 [2]|         [2.7207913]|0.72079134|
|76561197960291000|                [30]|                 [3]|            [10, 20]|              [2, 4]|          [0.0, 0.0]|       0.0|
|76561197960294000|                [10]|                 [2]|               [240]|                 [2]|               [0.0]|       0.0|
|76561197960334000|[70, 80, 220, 240...|[3, 3, 1, 2, 5, 4...|[400, 730, 13240,...|        [1, 2, 1, 1]|[3.5767725, 2.340...| 1.7201945|
|76561197960338000| [50, 380, 400, 420]|        

                                                                                

RMSE: 1.1087727828081508


                                                                                

Standard deviation of ratings: 1.408466228662225
