# Item-item collaborative filtering

In [17]:
from pyspark.sql.functions import mean, when, udf, stddev_pop, col, lit, collect_list, avg, stddev, collect_list, udf, first
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType
import math
from pyspark.ml.feature import VectorAssembler

## Overview
1. Load dataset
2. Split the dataset
3. Normalize the dataset playtime into a 1-5 rating scale
4. Compute Pearson correlation matrix
5. Rating prediction by doing a weighted average of the k most similar items that the user has played before

## Load Dataset
The dataset is loaded from MariaDB into a dataframe.

In [18]:
spark = SparkSession.builder.appName('ReadMariaDB') \
.config("spark.driver.memory", "32g") \
.config("spark.sql.pivotMaxValues", "1000000") \
.config('spark.sql.codegen.wholeStage', 'false')\
.getOrCreate()

spark.sparkContext.setLogLevel("ERROR")


sql = "select * from 01_sampled_games_2 WHERE playtime_forever IS NOT NULL AND playtime_forever > 0"
database = "steam"
user = "root"
password = "example"
server = "127.0.0.1"
port = 3306
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
jdbc_driver = "org.mariadb.jdbc.Driver"

# Create a data frame by reading data from Oracle via JDBC
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("query", sql) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", jdbc_driver) \
    .load()

df = df.drop("playtime_2weeks", "dateretrieved")

In [19]:
# Count the number of rows in the DataFrame
row_count = df.count()

# Print the row count
print("The DataFrame has", row_count, "rows.")

df.show()

The DataFrame has 408349 rows.
+-----------------+-----+----------------+
|          steamid|appid|playtime_forever|
+-----------------+-----+----------------+
|76561197960268000|   10|               8|
|76561197960268000|   20|               1|
|76561197960268000|   50|            1719|
|76561197960268000|   60|               1|
|76561197960268000|   70|            1981|
|76561197960268000|  130|             175|
|76561197960268000|  220|            3873|
|76561197960268000|  240|             221|
|76561197960268000|  320|               1|
|76561197960268000|  280|            1242|
|76561197960268000|  300|             109|
|76561197960268000|  360|               3|
|76561197960268000| 1300|              94|
|76561197960268000| 1313|             213|
|76561197960268000|  380|             944|
|76561197960268000| 2100|             110|
|76561197960268000| 4000|             152|
|76561197960268000| 3970|             586|
|76561197960268000| 2600|              59|
|76561197960268000| 691

## Split the dataset

## Per-Game 1-5 Rating Normalization
For each game, we calculate the mean and standard deviation. We then create buckets for each rating:
### Scaling factor
The cut points are scaled on a per-user basis since some users are more casual gamers while others may spend a lot more time gaming. The scaling factor is calculated as follows:

(user_playtime_average)/(global_playtime_average)

### Cut points
* Cut point 1: (mean - std_dev*0.5) * scaling_factor if > 0, else 0
* Cut point 2: mean
* Cut point 3: (mean + std_dev*0.5) * scaling_factor
* Cut point 4: (mean + std_dev) * scaling_factor
### Ratings
* Rating 1: 0 < x < cut point 1
* Rating 2: cut point 1 < x < cut point 2
* Rating 3: cut point 2 < x < cut point 3
* Rating 4: cut point 3 < x < cut point 4
* Rating 5: cut point 5 < x < inf

In [20]:
def normalize_dataset(df):
    # Calculate the per-game mean and standard deviation of the playtime column
    game_stats = df.filter(col("playtime_forever") > 0).groupBy("appid").agg(
        mean("playtime_forever").alias("game_mean_playtime"),
        stddev_pop("playtime_forever").alias("game_stddev_playtime")
    )

    # Calculate the overall playtime average
    overall_playtime_avg = df.filter(col("playtime_forever") > 0).agg(avg("playtime_forever")).collect()[0][0]

    # Calculate the per-steamid playtime average
    user_playtime_avg = df.filter(col("playtime_forever") > 0).groupBy("steamid").agg(avg("playtime_forever")).withColumnRenamed("avg(playtime_forever)", "user_playtime_avg")

    # Join the user_playtime_avg dataframe with the main dataframe
    df = df.join(user_playtime_avg, "steamid")

    # Join the game_stats dataframe with the main dataframe
    df = df.join(game_stats, "appid")

    # Calculate the scaling factor based on the ratio of user playtime to overall playtime
    df = df.withColumn("scaling_factor", col("user_playtime_avg") / overall_playtime_avg)

    # Calculate the adjusted cut points
    df = df.withColumn("cut_point_1", when((col("game_mean_playtime") - col("game_stddev_playtime") * 0.5) * col("scaling_factor") > 0, col("game_mean_playtime") - (col("game_stddev_playtime") * 0.5 * col("scaling_factor"))).otherwise(0))
    df = df.withColumn("cut_point_2", col("game_mean_playtime") * col("scaling_factor"))
    df = df.withColumn("cut_point_3", (col("game_mean_playtime") + col("game_stddev_playtime") * 0.5) * col("scaling_factor"))
    df = df.withColumn("cut_point_4", (col("game_mean_playtime") + col("game_stddev_playtime")) * col("scaling_factor"))

    # Assign ratings based on adjusted cut points
    df = df.withColumn(
        "ratings",
        when(col("playtime_forever") <= col("cut_point_1"), lit(1))
        .when((col("playtime_forever") > col("cut_point_1")) & (col("playtime_forever") <= col("cut_point_2")), lit(2))
        .when((col("playtime_forever") > col("cut_point_2")) & (col("playtime_forever") <= col("cut_point_3")), lit(3))
        .when((col("playtime_forever") > col("cut_point_3")) & (col("playtime_forever") <= col("cut_point_4")), lit(4))
        .otherwise(lit(5))
    )

    # Drop the columns that are no longer needed
    df = df.drop("playtime_forever", "game_mean_playtime", "game_stddev_playtime", "user_playtime_avg", "scaling_factor", "cut_point_1", "cut_point_2", "cut_point_3", "cut_point_4")
    return df

df = normalize_dataset(df)

In [21]:
# Randomly split the data into 70% training and 30% test data
training, test = df.randomSplit([0.9, 0.1], seed=2313)

## Pearson Correlation Matrix
Since the dataset contains at most ~4500 games, we can expect a 4500^2=81,000,000 sized matrix. Each float entry takes 4 bytes in memory. Therefore, the pearson correlation matrix would take up 324 MB of memory.

Since the memory used is relatively small, we will pre-compute the person correlation matrix and store it for use later in the algorithm.

To start off, we create a list of features for each game (appid) using the playtime_forever of all users who have played the game.

In [22]:
training_matrix = training.groupBy("steamid").pivot("appid").agg(first("ratings")).na.fill(0)

                                                                                

In [23]:
# Create features vector
assembler = VectorAssembler(inputCols=training_matrix.columns[1:], outputCol="features")
vector = assembler.transform(training_matrix).select("features")

# Calculate the correlation matrix
pearson_matrix = Correlation.corr(vector, "features", "pearson")
corr_array = pearson_matrix.head()[0].toArray()

                                                                                

In [24]:
# Create a dict of appid to index
appid_index = {}
for i, appid in enumerate(training_matrix.columns[1:]):
    appid_index[int(appid)] = i

## Rating Prediction


For a given user (steamid) and game (appid), we can compute the predicted rating of the game (appid) for the user (steamid) using the weighted average of the k most similar games to the game (appid) that the user (steamid) has played.

This is the same item-item collaborative filtering formula that was shown in class.

In [25]:
global_avg = training.select(avg("ratings")).collect()[0][0]

def predict_rating(appid, user_ratings_dict, k=3, min_k=1):

    # Get appid row number from the vectors dataframe
    if appid in appid_index:
        appid_row_num = appid_index[appid]
    else:
        return 0.0

    # Get a list of correlations between the appid and all other games
    corr = corr_array[appid_row_num]

    # Create a dict of appids and correlations
    corr_dict = {}
    for appid in appid_index:
        if appid in user_ratings_dict:
            corr_dict[appid] = corr[appid_index[appid]]

    # Make a list of tuples of (appid, correlation, rating)
    corr_list = []
    for appid, corr in corr_dict.items():
        # Only add the appid to the list if the user has rated it
        if appid in user_ratings_dict:
            corr_list.append((appid, corr, user_ratings_dict[appid]))

    # Sort the list by correlation
    corr_list.sort(key=lambda x: x[1], reverse=True)

    # Get the top k most similar appids
    top_k = corr_list[1:k+1]

    # Remove negative correlations
    top_k = [x for x in top_k if x[1] > 0]

    # If there are not enough similar appids, return the global average
    if len(top_k) < min_k:
        return global_avg

    # Calculate the weighted average of the top 10 appids
    numerator = 0.0
    denominator = 0.0
    for appid, corr, rating in top_k:
        numerator += corr * rating
        denominator += corr
    if denominator != 0:
        return float(numerator / denominator)
    else:
        return 0.0

def predict_single_rating(steamid, appid, training_df):
    # Get all the user's ratings
    user_ratings_dict = training_df.filter(col("steamid") == steamid).select("appid", "ratings").rdd.collectAsMap()

    # Predict the user's rating for the appid
    return predict_rating(appid, user_ratings_dict)

In [26]:
# Test the function for a given user and game
test_steamid = 76561198023872000
test_appid = 500
print(f'Predicted rating for steamid [{test_steamid}] and appid [{test_appid}]: {predict_single_rating(test_steamid, test_appid, training)}')

                                                                                

Predicted rating for steamid [76561198023872000] and appid [500]: 2.299157668319233


## Recommender



We run in parallel the rating prediction for each game that the user has not played before. We then sort the games by the predicted rating and return the top k games.

In [27]:
# Get all unique appids
all_appids = training.select("appid").distinct().rdd.flatMap(lambda x: x).collect()

                                                                                

In [28]:
def predict_user_ratings(steamid, recommendation_count=3):
    # Get all_appids that the user has not rated
    user_ratings_dict = training.filter(col("steamid") == steamid).select("appid", "ratings").rdd.collectAsMap()
    not_rated = [appid for appid in all_appids if appid not in user_ratings_dict]
    not_rated_rdd = spark.sparkContext.parallelize(not_rated)

    # Run predict_rating for each appid and output a list of tuples of (appid, predicted rating)
    predictions = not_rated_rdd.map(lambda appid: (appid, predict_rating(appid, user_ratings_dict))).collect()

    # Sort the list by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Return top 3 appids outside of the tuple
    return [appid for appid, rating in predictions[:recommendation_count]]

print(f'Predicted top 3 games for steamid [{test_steamid}]: {predict_user_ratings(test_steamid)}')

Predicted top 3 games for steamid [76561198023872000]: [Decimal('46600'), Decimal('299440'), Decimal('266550')]


                                                                                

## RMSE

In [29]:
# Normalize the test dataset
test_normalized = test

In [30]:
# Create dataframe with only one row per steamid, a list of appids and ratings from the training set and a list of appids and ratings from the test set
from pyspark.sql.functions import size


steamid_ratings = training.groupBy("steamid").agg(collect_list("appid").alias("train_appids"), collect_list("ratings").alias("train_ratings"))
steamid_ratings = steamid_ratings.join(test_normalized.groupBy("steamid").agg(collect_list("appid").alias("test_appids"), collect_list("ratings").alias("test_ratings")), on="steamid", how="inner")


def predict_target_ratings(train_appids, train_ratings, test_appids):
    # If the test_appids list is empty, return an empty list
    if len(test_appids) == 0:
        return []
    # Get dictionary of appids and ratings from train_appids and train_ratings
    train_ratings_dict = dict(zip(train_appids, train_ratings))

    # For each appid in test_appids, do predict_rating and return a list of tuples of (appid, predicted rating)
    predictions = [predict_rating(appid, train_ratings_dict) for appid in test_appids]
    return predictions

# Create a udf for predict_target_ratings
predict_target_ratings_udf = udf(predict_target_ratings, ArrayType(FloatType()))

# For each row in the steamid_ratings dataframe, run predict_target_ratings and add the predictions list into a new column
steamid_ratings = steamid_ratings.withColumn("predictions", predict_target_ratings_udf("train_appids", "train_ratings", "test_appids"))

# For each column in the predictions column, calculate the rmse
def calculate_rmse(predictions, test_ratings):
    # If the predictions list is empty, return 0.0
    if len(predictions) == 0:
        return 0.0
    # Calculate the rmse for the predictions list
    rmse = 0
    prediction_num = 0
    for prediction, test_rating in zip(predictions, test_ratings):
        # Check if prediction is not 0.0
        if prediction != 0.0:
            rmse += (prediction - test_rating) ** 2
            prediction_num += 1
    # Do square root
    if prediction_num == 0:
        return 0.0
    else:
        return rmse/prediction_num

# Create a udf for calculate_rmse
calculate_rmse_udf = udf(calculate_rmse, FloatType())

# For each row in the steamid_ratings dataframe, run calculate_rmse and add the rmse into a new column
steamid_ratings = steamid_ratings.withColumn("rmse", calculate_rmse_udf("predictions", "test_ratings"))

# Remove all rows where rmse is 0.0
steamid_ratings = steamid_ratings.filter(col("rmse") != 0.0)

# Sort the dataframe by rmse in descending order
# steamid_ratings = steamid_ratings.sort(col("rmse").desc())

# Filter columns where train_appids is less than 3
steamid_ratings = steamid_ratings.filter(size("train_appids") > 3)

steamid_ratings.show()

# Get the rmse for the table
rmse = steamid_ratings.select(avg("rmse")).collect()[0][0]

# Calculate the root mean squared error
rmse = math.sqrt(rmse)

print(f'RMSE: {rmse}')

# Calculate the standard deviation of the ratings
ratings_std = test_normalized.select(stddev("ratings")).collect()[0][0]
print(f'Standard deviation of ratings: {ratings_std}')

# Calculate the percent difference between the rmse and the standard deviation
percent_diff = (rmse - ratings_std) / ratings_std * 100
print(f'Percent difference: {percent_diff}')

# Calculate the standard deviation of rmse
rmse_std = steamid_ratings.select(stddev("rmse")).collect()[0][0]
print(f'Standard deviation of rmse: {rmse_std}')

                                                                                

+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|          steamid|        train_appids|       train_ratings|         test_appids|        test_ratings|         predictions|      rmse|
+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|76561197960334000|[70, 80, 220, 240...|[3, 4, 1, 2, 4, 2...|       [2310, 35700]|              [5, 1]|[3.3010597, 2.103...|  2.052434|
|76561197960342000|[20, 30, 70, 240,...|[2, 3, 2, 2, 2, 2...|     [24200, 108710]|              [2, 1]|[2.6842768, 2.352...| 1.1489708|
|76561197960816000|[570, 41010, 4268...|[2, 1, 3, 2, 5, 5...|    [220440, 226320]|              [1, 2]|[2.8471413, 2.207...| 1.7274506|
|76561197960884000|[240, 440, 630, 7...|[2, 2, 2, 2, 2, 5...|[24200, 49520, 21...|        [3, 1, 1, 2]|[2.3153758, 1.310...| 2.5755517|
|76561197961366000|[30, 60, 70, 211,...|[3, 2, 2

                                                                                

RMSE: 1.4075714808907511


                                                                                

Standard deviation of ratings: 1.2920454355471376
Percent difference: 8.941329938191544




Standard deviation of rmse: 2.1222401181322894


                                                                                

In [31]:
# Compute the global average of the training dataset
global_avg = training.select(avg("ratings")).collect()[0][0]

def global_avg_ratings(test_appids):
    # If the test_appids list is empty, return an empty list
    if len(test_appids) == 0:
        return []

    # For each appid in test_appids, do predict_rating and return a list of tuples of (appid, predicted rating)
    predictions = [global_avg for appid in test_appids]
    return predictions

# Create a udf for predict_target_ratings
global_avg_ratings_udf = udf(global_avg_ratings, ArrayType(FloatType()))

# For each row in the steamid_ratings dataframe, run predict_target_ratings and add the predictions list into a new column
steamid_ratings = steamid_ratings.withColumn("global_avg_predictions", global_avg_ratings_udf("test_appids"))

# For each row in the steamid_ratings dataframe, run calculate_rmse and add the rmse into a new column
steamid_ratings = steamid_ratings.withColumn("rmse_avg", calculate_rmse_udf("global_avg_predictions", "test_ratings"))

# Remove all rows where rmse is 0.0
steamid_ratings = steamid_ratings.filter(col("rmse_avg") != 0.0)

# Sort the dataframe by rmse in descending order
# steamid_ratings = steamid_ratings.sort(col("rmse").desc())

steamid_ratings.show()

# Get the rmse for the table
rmse_avg = steamid_ratings.select(avg("rmse_avg")).collect()[0][0]

# Calculate the root mean squared error
rmse_avg = math.sqrt(rmse_avg)

print(f'RMSE: {rmse_avg}')


                                                                                

+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+----------------------+----------+
|          steamid|        train_appids|       train_ratings|         test_appids|        test_ratings|         predictions|       rmse|global_avg_predictions|  rmse_avg|
+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+----------------------+----------+
|76561197960334000|[70, 80, 220, 240...|[3, 4, 1, 2, 4, 2...|[2200, 8870, 241410]|           [5, 1, 1]|[2.0729234, 2.409...|  4.3366933|  [2.477214, 2.4772...| 3.5762572|
|76561197960338000| [240, 380, 400, 10]|        [2, 5, 5, 2]|      [420, 630, 50]|           [5, 3, 4]|[4.2034855, 3.280...| 0.46569654|  [2.477214, 2.4772...| 2.9855435|
|76561197960342000|[20, 70, 240, 320...|[2, 2, 2, 2, 2, 4...|        [304930, 30]|              [3, 3]|[3.077104, 2.5227...|0.116869435|  [2.4772



RMSE: 1.2615426999557364


                                                                                

In [32]:
# Use predict_rating for appid 400 and user 76561198025744000 and print it nicely
print(f'Predicted rating for steamid [{76561198025744000}] and appid [{400}]: {predict_single_rating(76561198025744000, 400, training)}')

                                                                                

Predicted rating for steamid [76561198025744000] and appid [400]: 3.9459605833771616
