In [1]:
from pyspark.sql.functions import asc, first, mean, row_number, when, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

In [2]:
# Start the Spark session used by every cell below; getOrCreate() reuses an
# already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .appName("GameMatrix")
    .getOrCreate()
)

23/03/30 03:27:04 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.122.1 instead (on interface virbr0)
23/03/30 03:27:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/30 03:27:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/30 03:27:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Toy input: one (steamid, appid, playtime_forever) record per user/game pair.
# user3 has no record for game2, which yields the only null in the pivoted matrix.
data = [
    ("user1", "game1", 10),
    ("user2", "game2", 20),
    ("user1", "game2", 15),
    ("user2", "game1", 5),
    ("user1", "game3", 30),
    ("user2", "game3", 25),
    ("user3", "game3", 15),
    ("user3", "game1", 10),
]
df = spark.createDataFrame(data, schema=["steamid", "appid", "playtime_forever"])

In [4]:
# Create the game matrix: one row per appid, one column per steamid
game_matrix = (
    df.groupBy("appid")
    .pivot("steamid")
    .agg(first("playtime_forever"))
    .orderBy(asc("appid"))
)
game_matrix.show()

# The first column is the game id; every remaining column is one user's playtimes
id_col = game_matrix.columns[0]
user_cols = game_matrix.columns[1:]

# Cast every user column to double and replace null values with 0.
# This is done in ONE projection: calling withColumn() once per column in a loop adds
# a projection per call and can produce very large query plans on wide pivots.
game_matrix = game_matrix.select(
    id_col,
    *[game_matrix[user].cast(DoubleType()).alias(user) for user in user_cols],
).fillna(0.0, subset=user_cols)

# Compute the mean of each column
# NOTE(review): the nulls were already replaced by 0 above, so missing entries pull
# each user's mean towards 0 - confirm this is intended rather than the mean of the
# observed playtimes only.
col_means = game_matrix.agg(*[mean(user).alias(user) for user in user_cols]).collect()[0]

# Subtract the mean from each non-zero cell in the game matrix (again in one projection).
# NOTE(review): 0 doubles as the "missing" marker, so a genuine playtime of 0 is left
# uncentred, and a playtime exactly equal to the mean becomes 0 after centring.
game_matrix = game_matrix.select(
    id_col,
    *[
        when(game_matrix[user] != 0, game_matrix[user] - col_means[user])
        .otherwise(0)
        .alias(user)
        for user in user_cols
    ],
)

                                                                                

+-----+-----+-----+-----+
|appid|user1|user2|user3|
+-----+-----+-----+-----+
|game1|   10|    5|   10|
|game2|   15|   20| null|
|game3|   30|   25|   15|
+-----+-----+-----+-----+



In [5]:
# Extract the columns with numerical values (one column per user)
numeric_cols = game_matrix.columns[1:]

# Correlation.corr() correlates the COMPONENTS of the feature vector with each other,
# treating every DataFrame row as one observation. Assembling the user columns directly
# would therefore yield a user x user matrix. The prediction UDF looks rows up by game
# index, so the matrix has to be game x game: transpose first, so that each game is a
# vector component and each user is an observation.
game_rows = game_matrix.orderBy(asc("appid")).select(*numeric_cols).collect()
game_features = ["game_{}".format(position) for position in range(len(game_rows))]
user_observations = spark.createDataFrame(
    [tuple(float(row[user]) for row in game_rows) for user in numeric_cols],
    game_features,
)

# Assemble one vector per user whose components are that user's values for every game
assembler = VectorAssembler(inputCols=game_features, outputCol="features")
vector_matrix = assembler.transform(user_observations).select("features")

# Compute the Pearson correlation matrix between the rows (games) of the game matrix.
# Row/column i of corr_array is the i-th game in ascending appid order.
pearson_matrix = Correlation.corr(vector_matrix, "features", "pearson")
corr_array = pearson_matrix.head()[0].toArray()

In [6]:
# Add a 1-based index column to the game matrix, numbering the games in appid order.
# The window must not be partitioned by appid: the matrix holds exactly one row per
# appid, so row_number() would restart in every partition and return 1 for every game.
# An unpartitioned window moves all rows to a single partition (Spark logs a warning
# about it); that is what produces one global numbering.
windowSpec = Window.orderBy("appid")
game_matrix = game_matrix.withColumn("index", row_number().over(windowSpec))
game_matrix.show()

23/03/30 03:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 0

In [7]:
# Define the UDF closure to compute the weighted average
def udf_compute_weighted_average(user_playtimes, k):
    """Build a Spark UDF that predicts the missing playtimes of one user.

    Args:
        user_playtimes: The user's playtime for every game, as a plain list. Entry i is
            paired with entry i of a ``corr_array`` row, so the list is assumed to be in
            the same game order as ``corr_array`` (not checked here).
        k: Maximum number of neighbouring games that contribute to one prediction.

    Returns:
        A UDF ``(playtime, index) -> double``. For a cell whose playtime is 0.0 it
        returns the sum of ``correlation * playtime`` over the (at most) ``k`` games with
        the highest correlation to game ``index`` for which the user has a non-zero
        playtime. For every other cell it returns 0.0.

    The UDF reads the module-level ``corr_array`` and uses ``index`` as a 1-based row
    number into it.
    """
    def compute_weighted_average(playtime, index):
        # Only cells marked as missing (0.0) are predicted; all others are reported as 0.0
        if playtime != 0.0:
            return 0.0

        correlations = corr_array[index - 1]
        # Keep only the games this user has a playtime for. Correlations that are NaN
        # (e.g. Pearson against a constant vector) are dropped as well: NaN sort keys
        # make the ordering below undefined and would turn the prediction into NaN.
        # `weight == weight` is False only for NaN.
        neighbours = [
            (weight, value)
            for weight, value in zip(correlations, user_playtimes)
            if value != 0.0 and weight == weight
        ]
        # Most similar games first; the sort is stable, so ties keep their input order
        neighbours.sort(key=lambda pair: pair[0], reverse=True)

        # NOTE(review): despite the name this is a plain weighted sum - it is not
        # divided by the sum of the weights. Confirm whether normalisation is wanted.
        prediction = 0.0
        for weight, value in neighbours[:max(k, 0)]:
            prediction = prediction + weight * value
        return float(prediction)

    return udf(compute_weighted_average, DoubleType())

In [8]:
# Apply the UDF to each user column
k = 2

# The first column identifies the game and "index" is the helper row number;
# every other column holds one user's playtimes.
key_col = game_matrix.columns[0]
user_columns = [name for name in game_matrix.columns[1:] if name != "index"]

# Fetch all user columns in a single job instead of one job per user. The rows are
# sorted explicitly because collect() gives no ordering guarantee on its own, and the
# UDF pairs list position i with entry i of a corr_array row.
ordered_rows = game_matrix.orderBy(key_col).select(*user_columns).collect()
playtimes_by_user = {
    user: [row[user] for row in ordered_rows] for user in user_columns
}

# Build every prediction column in one projection (a withColumn() loop adds one
# projection per user); the helper "index" column is simply not selected.
predictions_df = game_matrix.select(
    key_col,
    *[
        udf_compute_weighted_average(playtimes_by_user[user], k)(
            game_matrix[user], game_matrix["index"]
        ).alias(user)
        for user in user_columns
    ],
)
predictions_df.show()

23/03/30 03:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 03:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/30 0