In [71]:
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType

In [72]:
# Get (or create) the Spark session used by this notebook, registered under
# the application name "GameMatrix".
spark = (
    SparkSession.builder
    .appName("GameMatrix")
    .getOrCreate()
)

# Toy input: one (steamid, appid, playtime_forever) record per user/game pair.
# user3 has no record for game2, so that cell is missing after the pivot below.
data = [
    ("user1", "game1", 10),
    ("user2", "game2", 20),
    ("user1", "game2", 15),
    ("user2", "game1", 5),
    ("user1", "game3", 30),
    ("user2", "game3", 25),
    ("user3", "game3", 15),
    ("user3", "game1", 10),
]
df = spark.createDataFrame(data, schema=["steamid", "appid", "playtime_forever"])

In [73]:
# Build the game matrix: one row per game (appid), one column per user
# (steamid), each cell holding that user's playtime_forever for that game.
game_matrix = df.groupBy("appid").pivot("steamid").agg(first("playtime_forever"))

# First column is the groupBy key; every remaining column is a pivoted user.
# The per-column variable is deliberately NOT called `col`: a top-level
# `for col in ...` loop would rebind the `col` function brought in by
# `from pyspark.sql.functions import *` for the rest of the kernel session.
key_col, *user_cols = game_matrix.columns

# Cast every user column to double and replace missing cells (user never
# played the game) with 0.0. A single select() plus a single fillna() is used
# instead of calling withColumn()/fillna() once per column, which grows the
# query plan with every iteration.
game_matrix = game_matrix.select(
    key_col,
    *(game_matrix[user_col].cast(DoubleType()).alias(user_col) for user_col in user_cols),
).fillna(0.0, subset=user_cols)
game_matrix.show()

# Compute the mean of each user column; the result is a single Row keyed by
# column name.
# NOTE(review): the means are taken AFTER the zero fill, so missing cells count
# as 0 playtime (user3's mean is (15 + 0 + 10) / 3), while the subtraction below
# skips zero cells. Confirm whether the mean should cover observed cells only.
col_means = game_matrix.agg(
    *(mean(user_col).alias(user_col) for user_col in user_cols)
).collect()[0]

# Subtract the column mean from each non-zero cell; zero cells stay 0.
# (A genuine playtime of 0 is indistinguishable from a filled-in missing cell
# here and is left untouched as well.)
game_matrix = game_matrix.select(
    key_col,
    *(
        when(game_matrix[user_col] != 0, game_matrix[user_col] - col_means[user_col])
        .otherwise(0.0)
        .alias(user_col)
        for user_col in user_cols
    ),
)

game_matrix.show()

+-----+-----+-----+-----+
|appid|user1|user2|user3|
+-----+-----+-----+-----+
|game3| 30.0| 25.0| 15.0|
|game2| 15.0| 20.0|  0.0|
|game1| 10.0|  5.0| 10.0|
+-----+-----+-----+-----+

+-----+------------------+-------------------+-----------------+
|appid|             user1|              user2|            user3|
+-----+------------------+-------------------+-----------------+
|game3|11.666666666666668|  8.333333333333332|6.666666666666666|
|game2|-3.333333333333332|  3.333333333333332|              0.0|
|game1|-8.333333333333332|-11.666666666666668|1.666666666666666|
+-----+------------------+-------------------+-----------------+



In [74]:
# Every column after the first (the appid key) is numeric; pack them into a
# single vector column named "features", one vector per game row.
numeric_cols = game_matrix.columns[1:]
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")
assembled = assembler.transform(game_matrix)
vector_matrix = assembled.select("features")

# Pearson correlation matrix of the vector components. Each component is one
# user column, so entry (i, j) is the correlation between user i and user j
# taken across the game rows -- not a correlation between the rows themselves.
# NOTE(review): if game-to-game similarity was intended, the matrix would need
# to be pivoted the other way round before this step -- confirm.
pearson_matrix = Correlation.corr(vector_matrix, column="features", method="pearson")

# The result DataFrame has one row with one cell holding the matrix; convert it
# to a numpy array and print it.
corr_row = pearson_matrix.head()
corr_array = corr_row[0].toArray()
print(corr_array)

[[1.         0.84615385 0.88461538]
 [0.84615385 1.         0.5       ]
 [0.88461538 0.5        1.        ]]
