In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col

In [2]:
spark = (
    SparkSession.builder 
    .appName("ALSDataPrep") 
    .master("local[*]")  # Run Spark locally on all available cores within the container
    .config("spark.driver.memory", "4g")  # Adjust driver memory if needed
    .config("spark.executor.memory", "4g") # Adjust executor memory if needed
    .config("spark.sql.shuffle.partitions", "200") # Adjust shuffle partitions if needed
    .getOrCreate()
)

print("Spark Session created successfully")

Spark Session created successfully


In [3]:
file_path = "training_data.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

print("Data loaded successfully")
df.printSchema()
df.show(5)

Data loaded successfully
root
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- city: string (nullable = true)
 |-- split: string (nullable = true)

+--------------------+--------------------+-----+-------------------+---------------+-----+
|             user_id|         business_id|stars|               date|           city|split|
+--------------------+--------------------+-----+-------------------+---------------+-----+
|---2PmXbF47D870st...|pgO-fORYt4nb5Tj0x...|    5|2012-10-24 13:33:39|        Trinity|train|
|---2PmXbF47D870st...|1An4DxtMmvvSe0HX4...|    5|2012-10-28 17:16:13|New Port Richey|train|
|---2PmXbF47D870st...|eR7ieJD12PUzsYrP8...|    5|2012-11-02 00:30:24|  Pinellas Park|train|
|---2PmXbF47D870st...|HpWi2CRJlxVCYKd8k...|    5|2013-02-17 15:13:21|      Dade City|train|
|---2PmXbF47D870st...|igC3UWYb9RF5CXOQO...|    5|2013-04-03 19:06:00|     Clearwater|train|
+

In [4]:
user_id_mapping = df.select("user_id").distinct().withColumn("userCol", monotonically_increasing_id())
business_id_mapping = df.select("business_id").distinct().withColumn("itemCol", monotonically_increasing_id())

als_input_df = df.join(user_id_mapping, "user_id", "inner").drop("user_id")
als_input_df = als_input_df.join(business_id_mapping, "business_id", "inner").drop("business_id")

als_input_df = als_input_df.select(
    col("userCol").cast("int"),
    col("itemCol").cast("int"),
    col("stars").cast("float").alias("ratingCol")
)

als_input_df.printSchema()
als_input_df.show(5)

root
 |-- userCol: integer (nullable = false)
 |-- itemCol: integer (nullable = false)
 |-- ratingCol: float (nullable = true)

+-------+-------+---------+
|userCol|itemCol|ratingCol|
+-------+-------+---------+
|   2459|     61|      2.0|
|   2459|     61|      3.0|
|  11482|     61|      4.0|
|   9153|     61|      5.0|
|  12488|     61|      1.0|
+-------+-------+---------+
only showing top 5 rows



In [5]:
# Persist the mappings
output_path_user = "/home/jovyan/work/id_mappings/user_id_mapping"
user_id_mapping.write.mode("overwrite").parquet(output_path_user)
print(f"User ID mapping saved to: {output_path_user}")

output_path_business = "/home/jovyan/work/id_mappings/business_id_mapping"
business_id_mapping.write.mode("overwrite").parquet(output_path_business)
print(f"Business ID mapping saved to: {output_path_business}")

User ID mapping saved to: /home/jovyan/work/id_mappings/user_id_mapping
Business ID mapping saved to: /home/jovyan/work/id_mappings/business_id_mapping
