In [None]:
import os

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, first, mean, row_number, when, udf
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

In [None]:
# Start (or reuse) the Spark session. pivotMaxValues is raised above Spark's
# default limit so that pivoting on steamid further down is not rejected.
spark = (
    SparkSession.builder.appName('ReadMariaDB')
    .config("spark.driver.memory", "32g")
    .config("spark.sql.pivotMaxValues", "1000000")
    .getOrCreate()
)

# Only keep rows where the user actually played the game.
sql = "select * from 01_sampled_games_2v2 WHERE playtime_forever IS NOT NULL AND playtime_forever > 0"

# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be committed with the notebook;
# the defaults are the local development values.
database = os.environ.get("STEAM_DB_NAME", "steam")
user = os.environ.get("STEAM_DB_USER", "root")
password = os.environ.get("STEAM_DB_PASSWORD", "example")
server = os.environ.get("STEAM_DB_HOST", "127.0.0.1")
port = int(os.environ.get("STEAM_DB_PORT", "3306"))
jdbc_url = f"jdbc:mysql://{server}:{port}/{database}?permitMysqlScheme"
jdbc_driver = "org.mariadb.jdbc.Driver"

# Create a data frame by reading the query result from MariaDB via JDBC
df = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("query", sql)
    .option("user", user)
    .option("password", password)
    .option("driver", jdbc_driver)
    .load()
)

# These two columns are not used by any later cell.
df = df.drop("playtime_2weeks", "dateretrieved")

In [None]:
# Run the query and count how many rows came back
row_count = df.count()

# Report the count (print joins the parts with single spaces)
message_parts = ("The DataFrame has", row_count, "rows.")
print(*message_parts)

In [None]:
# Sample data for quick local testing without the database (uncomment to override `df`)
# data = [("user1", "game1", 10), ("user2", "game2", 20), ("user1", "game2", 15), ("user2", "game1", 5), 
#         ("user1", "game3", 30), ("user2", "game3", 25), ("user3", "game3", 15), ("user3", "game1", 10)]
# df = spark.createDataFrame(data, ["steamid", "appid", "playtime_forever"])

In [None]:
# Build the game matrix: one row per appid, one column per steamid, each cell
# holding the first playtime_forever value found for that (appid, steamid)
# pair. Pairs with no source row come out as null. Rows are sorted by appid.
game_matrix = (
    df.groupBy("appid")
    .pivot("steamid")
    .agg(first("playtime_forever"))
    .orderBy(asc("appid"))
)

In [6]:
# Cast every user column to double and replace null values with 0.
# This is done in ONE select + ONE fillna: calling withColumn/fillna once per
# column adds a projection to the query plan for every user, which makes plan
# analysis extremely slow (or overflow the stack) on a wide pivoted matrix.
user_cols = game_matrix.columns[1:]  # everything after the leading appid column
game_matrix = game_matrix.select(
    game_matrix.columns[0],
    *[game_matrix[user_col].cast(DoubleType()).alias(user_col) for user_col in user_cols],
).fillna(0.0, subset=user_cols)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/kevin/Documents/Repositories/steam-recommender/venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/kevin/Documents/Repositories/steam-recommender/venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Compute the mean of each user column in a single aggregation job.
# NOTE(review): zero cells are included in this mean. If zeros stand for
# "not played", the mean over played games only may be what is wanted - confirm.
user_cols = game_matrix.columns[1:]  # everything after the leading appid column
col_means = game_matrix.agg(*(mean(user_col).alias(user_col) for user_col in user_cols)).collect()[0]

# Subtract the column mean from each non-zero cell; zero cells stay 0.
# All columns are rewritten in one select so the query plan stays flat instead
# of growing by one projection per user column.
game_matrix = game_matrix.select(
    game_matrix.columns[0],
    *[
        when(game_matrix[user_col] != 0, game_matrix[user_col] - col_means[user_col])
        .otherwise(0)
        .alias(user_col)
        for user_col in user_cols
    ],
)

In [None]:
# Compute the Pearson correlation between the ROWS (games) of the game matrix.
#
# Correlation.corr correlates the components of the feature vector with each
# other. Assembling the user columns directly would therefore produce a
# user x user matrix, while the prediction step looks up corr_array by game row.
# The matrix is transposed first so that every game becomes a vector component.
numeric_cols = [name for name in game_matrix.columns[1:] if name != "index"]

# Bring the matrix to the driver in appid order, so that row/column i of the
# resulting correlation matrix corresponds to the i-th game by ascending appid.
game_rows = game_matrix.orderBy(asc("appid")).select("appid", *numeric_cols).collect()
game_ids = [str(game_row[0]) for game_row in game_rows]

# Transpose: one tuple per user, holding that user's value for every game.
user_rows = [
    tuple(float(game_row[position]) for game_row in game_rows)
    for position in range(1, len(numeric_cols) + 1)
]
user_matrix = spark.createDataFrame(user_rows, schema=game_ids)

# Assemble the per-game values of each user into a single vector column
assembler = VectorAssembler(inputCols=game_ids, outputCol="features")
vector_matrix = assembler.transform(user_matrix).select("features")

# game x game Pearson correlation matrix as a dense array on the driver
pearson_matrix = Correlation.corr(vector_matrix, "features", "pearson")
corr_array = pearson_matrix.head()[0].toArray()

In [None]:
# Add a 1-based index column that numbers the games in ascending appid order.
# The window must NOT be partitioned by appid: the matrix has exactly one row
# per appid, so each partition would hold a single row and row_number() would
# return 1 for every game. A global ordered window moves all rows into one
# partition, which is acceptable here because there is only one row per game.
windowSpec = Window.orderBy("appid")
game_matrix = game_matrix.withColumn("index", row_number().over(windowSpec))
game_matrix.show()

In [None]:
# Define the UDF closure to compute the correlation-weighted prediction
def udf_compute_weighted_average(user_playtimes, k):
    """Build a Spark UDF that predicts one user's value for games with playtime 0.

    Args:
        user_playtimes: The user's values, one per game row; position i is
            paired with entry i of the correlation row that is looked up.
        k: Maximum number of most-correlated games to use for a prediction.

    Returns:
        A UDF returning DoubleType that takes ``(playtime, index)``, where
        ``index`` is the 1-based row number used to look up ``corr_array``.
        For a cell whose playtime is 0 it returns the sum of
        ``correlation * playtime`` over the (at most) ``k`` most correlated
        games for which the user has a non-zero value. The sum is not divided
        by the sum of the weights. Any other cell yields 0.0.
    """
    def compute_weighted_average(playtime, index):
        # Only cells with playtime 0 receive a prediction.
        if playtime != 0.0:
            return 0.0

        correlations = corr_array[index - 1]  # index is 1-based

        # Keep games the user has a non-zero value for. NaN correlations
        # (Pearson is undefined for constant data) are skipped: they would make
        # the sort order unpredictable and turn the whole sum into NaN.
        # `correlation == correlation` is False only for NaN.
        candidates = [
            (correlation, other_playtime)
            for correlation, other_playtime in zip(correlations, user_playtimes)
            if other_playtime != 0.0 and correlation == correlation
        ]
        candidates.sort(key=lambda pair: pair[0], reverse=True)

        # max(k, 0) keeps a negative k from slicing from the end of the list.
        top_k = candidates[:max(k, 0)]
        return float(sum(correlation * other_playtime for correlation, other_playtime in top_k))
    return udf(compute_weighted_average, DoubleType())

In [None]:
# Apply the UDF to each user column
k = 2  # number of most-correlated games used per prediction

# User columns are everything except the game id and the helper index column.
user_cols = [name for name in game_matrix.columns if name not in ("appid", "index")]

# Fetch all user values with ONE job instead of one collect() per user column.
# Rows are read in appid order so list positions line up with the game order.
collected_rows = game_matrix.orderBy(asc("appid")).select(*user_cols).collect()
playtimes_by_user = {
    user: [row[position] for row in collected_rows]
    for position, user in enumerate(user_cols)
}

# Build every prediction column in a single select (the query plan stays flat)
# and leave the helper index column out of the result.
predictions_df = game_matrix.select(
    "appid",
    *[
        udf_compute_weighted_average(playtimes_by_user[user], k)(
            game_matrix[user], game_matrix["index"]
        ).alias(user)
        for user in user_cols
    ],
)
predictions_df.show()