In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col

In [2]:
spark = (
    SparkSession.builder 
    .appName("ALSDataPrep") 
    .master("local[*]")  # Run Spark locally on all available cores within the container
    .config("spark.driver.memory", "4g")  # Adjust driver memory if needed
    .config("spark.executor.memory", "4g") # Adjust executor memory if needed
    .config("spark.sql.shuffle.partitions", "200") # Adjust shuffle partitions if needed
    .getOrCreate()
)

print("Spark Session created successfully")

Spark Session created successfully


In [3]:
file_path = "yelp_train.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

print("Data loaded successfully")
df.printSchema()
df.show(5)

Data loaded successfully
root
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- city: string (nullable = true)

+--------------------+--------------------+-----+-------------------+------------+
|             user_id|         business_id|stars|               date|        city|
+--------------------+--------------------+-----+-------------------+------------+
|bu1mtWwxuayzV6b3V...|UnQriKTiNsYnyPE3R...|    3|2014-04-23 18:33:20| New Orleans|
|RqqG8mtD0pqhkKQks...|Ns8vJxhsc5ygiX7Vf...|    5|2016-06-18 15:40:31|   Nashville|
|x1GKrdGbQjaW1dcFB...|UFCN0bYdHroPKu6KV...|    5|2019-10-04 17:45:11|Indianapolis|
|suRoi6fJjOSdzBMWz...|p1257iwsyKj00D_14...|    3|2011-04-12 15:44:07|Philadelphia|
|UD7ZLOi6yY8K5Hi4L...|E8aDjN4OzvY4BZ4NJ...|    3|2017-04-25 11:20:06|Indianapolis|
+--------------------+--------------------+-----+-------------------+------------+
only showing top 5 rows



In [4]:
user_id_mapping = df.select("user_id").distinct().withColumn("userCol", monotonically_increasing_id())
business_id_mapping = df.select("business_id").distinct().withColumn("itemCol", monotonically_increasing_id())

als_input_df = df.join(user_id_mapping, "user_id", "inner").drop("user_id")
als_input_df = als_input_df.join(business_id_mapping, "business_id", "inner").drop("business_id")

als_input_df = als_input_df.select(
    col("userCol").cast("int"),
    col("itemCol").cast("int"),
    col("stars").cast("float").alias("ratingCol")
)

als_input_df.printSchema()
als_input_df.show(5)

root
 |-- userCol: integer (nullable = false)
 |-- itemCol: integer (nullable = false)
 |-- ratingCol: float (nullable = true)

+-------+-------+---------+
|userCol|itemCol|ratingCol|
+-------+-------+---------+
|    615|     65|      5.0|
|    411|     65|      2.0|
|    411|     65|      3.0|
|   2083|     65|      4.0|
|   1330|     65|      5.0|
+-------+-------+---------+
only showing top 5 rows



In [5]:
# Persist the mappings
output_path_user = "/home/jovyan/work/id_mappings/user_id_mapping"
user_id_mapping.write.mode("overwrite").parquet(output_path_user)
print(f"User ID mapping saved to: {output_path_user}")

output_path_business = "/home/jovyan/work/id_mappings/business_id_mapping"
business_id_mapping.write.mode("overwrite").parquet(output_path_business)
print(f"Business ID mapping saved to: {output_path_business}")

User ID mapping saved to: /home/jovyan/work/id_mappings/user_id_mapping
Business ID mapping saved to: /home/jovyan/work/id_mappings/business_id_mapping
