In [29]:
# Import necessary libraries
import os

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

In [30]:
# Step 1: Start (or reuse) the Spark session that every later cell runs against.
# getOrCreate() returns the already-active session if one exists.
spark = (
    SparkSession.builder
    .appName("MovieRecommendationSystem")
    .getOrCreate()
)

24/08/19 10:21:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/19 10:21:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [31]:
# Step 2: Load the dataset
# The reviews file location can be overridden with the MOVIES_JSON_PATH
# environment variable so the notebook is not tied to a single machine;
# when the variable is unset the original path is used unchanged.
file_path = os.environ.get("MOVIES_JSON_PATH", "/home/lplab/Documents/movies.json")
df = spark.read.json(file_path)

# Print the inferred schema and preview the first five rows
df.printSchema()
df.show(5)

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This m

In [32]:
# Step 3: Data Preprocessing
# Keep only the three columns ALS needs, renaming product_id -> item_id.
ratings_raw = df.select(
    F.col("user_id"),
    F.col("product_id").alias("item_id"),
    F.col("score"),
)

# Drop incomplete rows BEFORE hashing. Spark's hash() does not propagate
# NULL: a NULL input yields the hash seed (42), so running dropna() after
# hashing would keep rows with a missing user/product id and silently merge
# them all into one phantom user/item.
ratings_complete = ratings_raw.dropna()

# ALS requires integer ids, so the string ids are mapped to 32-bit ints by
# hashing. F.hash is used (instead of importing `hash` by name) so Python's
# builtin hash() is not shadowed for the rest of the notebook.
# NOTE(review): a 32-bit hash can map distinct ids to the same integer;
# pyspark.ml.feature.StringIndexer is a collision-free alternative if that
# matters for this dataset.
df = (
    ratings_complete
    .withColumn("user_id", F.hash(F.col("user_id")).cast("integer"))
    .withColumn("item_id", F.hash(F.col("item_id")).cast("integer"))
)

In [33]:
# Step 4: Split the data into training (80%) and test (20%) sets.
# A fixed seed makes the split -- and therefore the reported RMSE and the
# recommendations below -- repeatable across kernel restarts.
(train_data, test_data) = df.randomSplit([0.8, 0.2], seed=42)

In [34]:
# Step 5: Train the ALS model
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="user_id",
    itemCol="item_id",
    ratingCol="score",
    coldStartStrategy="drop",  # Drops rows whose prediction would be NaN
    seed=42,  # ALS initialises its factors randomly; pin the seed for repeatable results
)

# Fit the latent-factor model on the training split only.
model = als.fit(train_data)

In [35]:
# Step 6: Score the held-out ratings with the fitted model.
# transform() appends a `prediction` column next to the actual `score`.
predictions = model.transform(test_data)

# Preview the first ten scored rows
predictions.show(n=10)

+-----------+----------+-----+----------+
|    user_id|   item_id|score|prediction|
+-----------+----------+-----+----------+
|  196017723|-665333942|  5.0|  4.056804|
|-2028851759|-665333942|  4.0| 4.0376124|
| 2079407529|-115414658|  5.0|0.28418383|
|-1389191369| 517765186|  3.0| 2.8997114|
|-2129226941| 326629311|  3.0| 3.3773108|
|    6595260| 517765186|  4.0| 3.8662815|
| -774785816|1650418923|  3.0| 0.7135387|
|-2000191253|1409269258|  5.0|  4.057887|
|-1657497823| 517765186|  2.0| 1.9331408|
|-1246849886| 517765186|  3.0| 2.8997114|
+-----------+----------+-----+----------+
only showing top 10 rows



In [36]:
# Step 7: Evaluate the model
# Compare the predicted rating against the actual `score` using RMSE.
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")

rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE): {rmse}")

Root-mean-square error (RMSE): 1.9013335557267244


In [37]:
# Step 8: Print the top product recommendations for a specific user
user_id = 196017723  # Replace with the hashed user_id you want to get recommendations for

# Ask the model for this one user only. recommendForAllUsers() would score
# every user against every item just to throw away all rows but one.
target_users = spark.createDataFrame([(user_id,)], schema="user_id int")
user_recs = model.recommendForUserSubset(target_users, 10)

# One row (user_id, recommendations); empty if the user was not in the training data.
user_recs.show(truncate=False)

                                                                                

+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id  |recommendations                                                                                                                                                                                                                                        |
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|196017723|[{-683828119, 6.621094}, {-1153537597, 6.5081396}, {-239574863, 6.020622}, {-453338225, 5.9918475}, {-910607049, 5.950754}, {-1681753400, 5.574095}, {-1820840231, 5.545782}, {-784450753, 5.5292044}, {-64133841

In [38]:
# Step 9: Print the top user recommendations for a specific product
item_id = 517765186  # Replace with the hashed item_id you want to get recommendations for

# Ask the model for this one item only. recommendForAllItems() would score
# every item against every user just to throw away all rows but one.
target_items = spark.createDataFrame([(item_id,)], schema="item_id int")
item_recs = model.recommendForItemSubset(target_items, 10)

# One row (item_id, recommendations); empty if the item was not in the training data.
item_recs.show(truncate=False)

# Stop the Spark session (show() above has already run, so nothing is pending)
spark.stop()

+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|item_id  |recommendations                                                                                                                                                                                                                              |
+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|517765186|[{916287897, 4.893393}, {1424393811, 4.832852}, {1382091941, 4.832852}, {1251825974, 4.832852}, {1205080985, 4.832852}, {1197234795, 4.832852}, {1079193887, 4.832852}, {956841423, 4.832852}, {693021486, 4.832852}, {-474778592, 4.832852}]|
