# Item-item collaborative filtering

In [17]:
from pyspark.sql.functions import mean, when, udf, stddev_pop, col, lit, collect_list, avg, stddev, collect_list, udf, first
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, FloatType
import math
from pyspark.ml.feature import VectorAssembler

## Overview
1. Load dataset
2. Split the dataset
3. Normalize the dataset playtime into a 1-5 rating scale
4. Compute Pearson correlation matrix
5. Rating prediction by doing a weighted average of the k most similar items that the user has played before

## Load Dataset
The dataset is loaded from MariaDB into a dataframe.

In [18]:
spark = SparkSession.builder.appName('ReadMariaDB') \
.config("spark.driver.memory", "32g") \
.config("spark.sql.pivotMaxValues", "1000000") \
.config('spark.sql.codegen.wholeStage', 'false')\
.getOrCreate()

spark.sparkContext.setLogLevel("ERROR")


sql = "select * from 01_sampled_games_2 WHERE playtime_forever IS NOT NULL"
database = "steam"
user = "root"
password = "example"
server = "127.0.0.1"
port = 3306
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
jdbc_driver = "org.mariadb.jdbc.Driver"

# Create a data frame by reading data from Oracle via JDBC
df = spark.read.format("jdbc") \
    .option("url", jdbc_url) \
    .option("query", sql) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", jdbc_driver) \
    .load()

df = df.drop("playtime_2weeks", "dateretrieved")

In [19]:
# Count the number of rows in the DataFrame
row_count = df.count()

# Print the row count
print("The DataFrame has", row_count, "rows.")

df.show()

The DataFrame has 712806 rows.
+-----------------+-----+----------------+
|          steamid|appid|playtime_forever|
+-----------------+-----+----------------+
|76561197960268000|   10|               8|
|76561197960268000|   20|               1|
|76561197960268000|   30|               0|
|76561197960268000|   40|               0|
|76561197960268000|   50|            1719|
|76561197960268000|   60|               1|
|76561197960268000|   70|            1981|
|76561197960268000|  130|             175|
|76561197960268000|  220|            3873|
|76561197960268000|  240|             221|
|76561197960268000|  320|               1|
|76561197960268000|  340|               0|
|76561197960268000|   80|               0|
|76561197960268000|  100|               0|
|76561197960268000|  280|            1242|
|76561197960268000|  300|             109|
|76561197960268000|  360|               3|
|76561197960268000| 1300|              94|
|76561197960268000| 1309|               0|
|76561197960268000| 131

## Per-Game 1-5 Rating Normalization
For each game, we calculate the mean and standard deviation. We then create buckets for each rating:
### Scaling factor
The cut points are scaled on a per-user basis since some users are more casual gamers while others may spend a lot more time gaming. The scaling factor is calculated as follows:

(user_playtime_average)/(global_playtime_average)

### Cut points
* Cut point 1: 30
* Cut point 2: 100
* Cut point 3: 200
* Cut point 4: 500

### Ratings
* Rating 1: 0 < x < cut point 1
* Rating 2: cut point 1 < x < cut point 2
* Rating 3: cut point 2 < x < cut point 3
* Rating 4: cut point 3 < x < cut point 4
* Rating 5: cut point 5 < x < inf

In [21]:
def normalize_dataset_v2(df):
    # Bucketize the playtime_forever column with intervals 0-30, 30-70, 70-200, 200-500, 500-inf
    return df.withColumn(
        "ratings",
        when(col("playtime_forever") <= 30, lit(1))
        .when((col("playtime_forever") > 30) & (col("playtime_forever") <= 100), lit(2))
        .when((col("playtime_forever") > 100) & (col("playtime_forever") <= 200), lit(3))
        .when((col("playtime_forever") > 200) & (col("playtime_forever") <= 500), lit(4))
        .otherwise(lit(5))
    )

df = normalize_dataset_v2(df)

## Split the dataset

In [22]:
# Randomly split the data into 70% training and 30% test data
training, test = df.randomSplit([0.9, 0.1], seed=2313)

## Pearson Correlation Matrix
Since the dataset contains at most ~4500 games, we can expect a 4500^2=81,000,000 sized matrix. Each float entry takes 4 bytes in memory. Therefore, the pearson correlation matrix would take up 324 MB of memory.

Since the memory used is relatively small, we will pre-compute the person correlation matrix and store it for use later in the algorithm.

To start off, we create a list of features for each game (appid) using the playtime_forever of all users who have played the game.

In [23]:
training_matrix = training.groupBy("steamid").pivot("appid").agg(first("ratings")).na.fill(0)

                                                                                

In [24]:
# Create features vector
assembler = VectorAssembler(inputCols=training_matrix.columns[1:], outputCol="features")
vector = assembler.transform(training_matrix).select("features")

# Calculate the correlation matrix
pearson_matrix = Correlation.corr(vector, "features", "pearson")
corr_array = pearson_matrix.head()[0].toArray()

                                                                                

In [25]:
# Create a dict of appid to index
appid_index = {}
for i, appid in enumerate(training_matrix.columns[1:]):
    appid_index[int(appid)] = i

## Rating Prediction


For a given user (steamid) and game (appid), we can compute the predicted rating of the game (appid) for the user (steamid) using the weighted average of the k most similar games to the game (appid) that the user (steamid) has played.

This is the same item-item collaborative filtering formula that was shown in class.

In [26]:
global_avg = training.select(avg("ratings")).collect()[0][0]

def predict_rating(appid, user_ratings_dict, k=3, min_k=1):

    # Get appid row number from the vectors dataframe
    if appid in appid_index:
        appid_row_num = appid_index[appid]
    else:
        return 0.0

    # Get a list of correlations between the appid and all other games
    corr = corr_array[appid_row_num]

    # Create a dict of appids and correlations
    corr_dict = {}
    for appid in appid_index:
        if appid in user_ratings_dict:
            corr_dict[appid] = corr[appid_index[appid]]

    # Make a list of tuples of (appid, correlation, rating)
    corr_list = []
    for appid, corr in corr_dict.items():
        # Only add the appid to the list if the user has rated it
        if appid in user_ratings_dict:
            corr_list.append((appid, corr, user_ratings_dict[appid]))

    # Sort the list by correlation
    corr_list.sort(key=lambda x: x[1], reverse=True)

    # Get the top k most similar appids
    top_k = corr_list[1:k+1]

    # Remove negative correlations
    top_k = [x for x in top_k if x[1] > 0]

    # If there are not enough similar appids, return the global average
    if len(top_k) < min_k:
        return global_avg

    # Calculate the weighted average of the top 10 appids
    numerator = 0.0
    denominator = 0.0
    for appid, corr, rating in top_k:
        numerator += corr * rating
        denominator += corr
    if denominator != 0:
        return float(numerator / denominator)
    else:
        return 0.0

def predict_single_rating(steamid, appid, training_df):
    # Get all the user's ratings
    user_ratings_dict = training_df.filter(col("steamid") == steamid).select("appid", "ratings").rdd.collectAsMap()

    # Predict the user's rating for the appid
    return predict_rating(appid, user_ratings_dict)

                                                                                

In [27]:
# Test the function for a given user and game
test_steamid = 76561198023872000
test_appid = 500
print(f'Predicted rating for steamid [{test_steamid}] and appid [{test_appid}]: {predict_single_rating(test_steamid, test_appid, training)}')

[Stage 128:>                                                        (0 + 1) / 1]

Predicted rating for steamid [76561198023872000] and appid [500]: 3.1813717637670162


                                                                                

## Recommender



We run in parallel the rating prediction for each game that the user has not played before. We then sort the games by the predicted rating and return the top k games.

In [28]:
# Get all unique appids
all_appids = training.select("appid").distinct().rdd.flatMap(lambda x: x).collect()

                                                                                

In [29]:
def predict_user_ratings(steamid, recommendation_count=3):
    # Get all_appids that the user has not rated
    user_ratings_dict = training.filter(col("steamid") == steamid).select("appid", "ratings").rdd.collectAsMap()
    not_rated = [appid for appid in all_appids if appid not in user_ratings_dict]
    not_rated_rdd = spark.sparkContext.parallelize(not_rated)

    # Run predict_rating for each appid and output a list of tuples of (appid, predicted rating)
    predictions = not_rated_rdd.map(lambda appid: (appid, predict_rating(appid, user_ratings_dict))).collect()

    # Sort the list by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Return top 3 appids outside of the tuple
    return [appid for appid, rating in predictions[:recommendation_count]]

print(f'Predicted top 3 games for steamid [{test_steamid}]: {predict_user_ratings(test_steamid)}')



Predicted top 3 games for steamid [76561198023872000]: [Decimal('216150'), Decimal('298600'), Decimal('267220')]


                                                                                

## RMSE

In [30]:
# Normalize the test dataset
test_normalized = test

In [31]:
# Create dataframe with only one row per steamid, a list of appids and ratings from the training set and a list of appids and ratings from the test set
from pyspark.sql.functions import size, sum

# Put training in a single partition
training = training.repartition(1)

# Put test_normalized in a single partition
test_normalized = test_normalized.repartition(1)


steamid_ratings = training.groupBy("steamid").agg(collect_list("appid").alias("train_appids"), collect_list("ratings").alias("train_ratings"))
steamid_ratings = steamid_ratings.join(test_normalized.groupBy("steamid").agg(collect_list("appid").alias("test_appids"), collect_list("ratings").alias("test_ratings")), on="steamid", how="inner")


def predict_target_ratings(train_appids, train_ratings, test_appids):
    # If the test_appids list is empty, return an empty list
    if len(test_appids) == 0:
        return []
    # Get dictionary of appids and ratings from train_appids and train_ratings
    train_ratings_dict = dict(zip(train_appids, train_ratings))

    # For each appid in test_appids, do predict_rating and return a list of tuples of (appid, predicted rating)
    predictions = [predict_rating(appid, train_ratings_dict) for appid in test_appids]
    return predictions

# Create a udf for predict_target_ratings
predict_target_ratings_udf = udf(predict_target_ratings, ArrayType(FloatType()))

# For each row in the steamid_ratings dataframe, run predict_target_ratings and add the predictions list into a new column
steamid_ratings = steamid_ratings.withColumn("predictions", predict_target_ratings_udf("train_appids", "train_ratings", "test_appids"))

# For each column in the predictions column, calculate the rmse
def calculate_rmse(predictions, test_ratings):
    # If the predictions list is empty, return 0.0
    if len(predictions) == 0:
        return 0.0
    # Calculate the rmse for the predictions list
    meansquare = 0.0
    prediction_num = 0.0
    for prediction, test_rating in zip(predictions, test_ratings):
        # Check if prediction is not 0.0
        if prediction != 0.0:
            meansquare += (prediction - test_rating) ** 2
            prediction_num += 1
    return [meansquare, prediction_num]

# Create a udf for calculate_rmse
calculate_rmse_udf = udf(calculate_rmse, ArrayType(FloatType()))

# For each row in the steamid_ratings dataframe, run calculate_rmse and add the rmse into a new column
steamid_ratings = steamid_ratings.withColumn("meansquare", calculate_rmse_udf("predictions", "test_ratings"))

# Split the meansquare column into two columns
steamid_ratings = steamid_ratings.withColumn("ms", col("meansquare")[0])
steamid_ratings = steamid_ratings.withColumn("prediction_num", col("meansquare")[1])
steamid_ratings = steamid_ratings.drop("meansquare")

# Sort steamid_ratings by ms
steamid_ratings = steamid_ratings.sort(col("ms").desc())

# Get the sum of the ms column
ms_sum = steamid_ratings.select(sum("ms")).collect()[0][0]

# Get the sum of the prediction_num column
prediction_num_sum = steamid_ratings.select(sum("prediction_num")).collect()[0][0]

rmse = math.sqrt(ms_sum / prediction_num_sum)

# Print the rmse
print(f'RMSE: {rmse}')

# Calculate the standard deviation of the ratings
ratings_std = test_normalized.select(stddev("ratings")).collect()[0][0]
print(f'Standard deviation of ratings: {ratings_std}')

# Calculate the percent difference between the rmse and the standard deviation
percent_diff = (rmse - ratings_std) / ratings_std * 100
print(f'Percent difference: {percent_diff}')

                                                                                

RMSE: 1.6294799711267


[Stage 164:>                                                        (0 + 1) / 1]

Standard deviation of ratings: 1.6670507946114101
Percent difference: -2.2537299766842347


                                                                                

## Precision

In [32]:
# Create rdd of all appids
from typing import List


all_appids_rdd = spark.sparkContext.parallelize(all_appids)

def predict_all_user_ratings(train_appids, train_ratings, test_appids, test_ratings, k=10):
    # Return 0 if any of the lists are empty
    if len(train_appids) == 0 or len(train_ratings) == 0 or len(test_appids) == 0 or len(test_ratings) == 0:
        return 0.0
    
    all_appids = [*train_appids, *test_appids]
    all_ratings = [*train_ratings, *test_ratings]

    # If the length of all_appids is less than k, return -1.0
    if len(all_appids) < k:
        return -1.0

    # Create list of tuples of (appid, rating)
    appid_ratings = list(zip(all_appids, all_ratings))

    # Sort tuple list by rating in descending order
    appid_ratings.sort(key=lambda x: x[1], reverse=True)

    # Get the top k appids
    top_k_appids = [appid for appid, rating in appid_ratings[:k]]

    # Create dictionary of appids and ratings
    user_ratings_dict = dict(appid_ratings)

    # For each appid in test_appids, do predict_rating and return a list of tuples of (appid, predicted rating)
    predictions = [(appid, predict_rating(appid, user_ratings_dict)) for appid in all_appids]
    
    # Sort the list by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Return top k appids outside of the tuple
    k_recommendations = [appid for appid, rating in predictions[:k]]

    # Calculate the proportion of recommendations that are in the top k appids
    precision = len(set(top_k_appids).intersection(set(k_recommendations))) / k

    return precision

# Create udf for predict_all_user_ratings
predict_all_user_ratings_udf = udf(predict_all_user_ratings, FloatType())

# For each row in the steamid_ratings dataframe, run predict_all_user_ratings and add the predictions list into a new column
precision_df = steamid_ratings.withColumn("precision", predict_all_user_ratings_udf("train_appids", "train_ratings", "test_appids", "test_ratings"))

# Remove rows where precision is -1.0
precision_df = precision_df.filter(precision_df.precision != -1.0)

# Average the precision column
precision = precision_df.select(avg("precision")).collect()[0][0]
print(f'Precision: {precision}')



Precision: 0.5136604333185304


                                                                                