In [30]:
!java --version
!python --version

openjdk 11.0.18 2023-01-17
OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1)
OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)
Python 3.9.16


In [31]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
# Import Apache Spark SQL
from pyspark.sql import SparkSession

# Create Spark Session/Context
# We are using local machine with all the CPU cores [*]
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Hello Pyspark") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [33]:
# Check spark session
print(spark)

<pyspark.sql.session.SparkSession object at 0x7fb0ef76a940>


In [34]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [35]:
lines = spark.read.text("/content/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# maxIter = 5

## regParam = 0.1

In [36]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [37]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.0146066361214776


## regParam = 0.5

In [38]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als1 = ALS(maxIter=5, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model1 = als1.fit(training)

In [39]:
# Evaluate the model by computing the RMSE on the test data
predictions1 = model1.transform(test)
evaluator1 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse1 = evaluator.evaluate(predictions1)
print("Root-mean-square error = " + str(rmse1))

Root-mean-square error = 1.2146362150064365


## regParam = 1.0

In [40]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als2 = ALS(maxIter=5, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model2 = als2.fit(training)

In [41]:
# Evaluate the model by computing the RMSE on the test data
predictions2 = model2.transform(test)
evaluator2 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse2 = evaluator.evaluate(predictions2)
print("Root-mean-square error = " + str(rmse2))

Root-mean-square error = 1.4815544500971018


# maxIter = 10

## regParam = 0.1

In [42]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als3 = ALS(maxIter=10, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model3 = als3.fit(training)

In [43]:
# Evaluate the model by computing the RMSE on the test data
predictions3 = model3.transform(test)
evaluator3 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse3 = evaluator.evaluate(predictions3)
print("Root-mean-square error = " + str(rmse3))

Root-mean-square error = 0.9640166302412116


## regParam = 0.5

In [44]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als4 = ALS(maxIter=10, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model4 = als4.fit(training)

In [45]:
# Evaluate the model by computing the RMSE on the test data
predictions4 = model4.transform(test)
evaluator4 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse4 = evaluator.evaluate(predictions4)
print("Root-mean-square error = " + str(rmse4))

Root-mean-square error = 1.2113918395225025


## regParam = 1.0

In [46]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als5 = ALS(maxIter=10, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model5 = als5.fit(training)

In [47]:
# Evaluate the model by computing the RMSE on the test data
predictions5 = model5.transform(test)
evaluator5 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse5 = evaluator.evaluate(predictions5)
print("Root-mean-square error = " + str(rmse5))

Root-mean-square error = 1.4815608073452533


# maxIter = 20

## regParam = 0.1

In [48]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als6 = ALS(maxIter=20, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model6 = als6.fit(training)

In [49]:
# Evaluate the model by computing the RMSE on the test data
predictions6 = model6.transform(test)
evaluator6 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse6 = evaluator.evaluate(predictions6)
print("Root-mean-square error = " + str(rmse6))

Root-mean-square error = 0.941851252725291


## regParam = 0.5

In [50]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als7 = ALS(maxIter=20, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model7 = als7.fit(training)

In [51]:
# Evaluate the model by computing the RMSE on the test data
predictions7 = model7.transform(test)
evaluator7 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse7 = evaluator.evaluate(predictions7)
print("Root-mean-square error = " + str(rmse7))

Root-mean-square error = 1.211892353780886


## regParam = 1.0

In [52]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als8 = ALS(maxIter=20, regParam=1.0, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model8 = als8.fit(training)

In [53]:
# Evaluate the model by computing the RMSE on the test data
predictions8 = model8.transform(test)
evaluator8 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse8 = evaluator.evaluate(predictions8)
print("Root-mean-square error = " + str(rmse8))

Root-mean-square error = 1.48156082108903


# Comparation

## maxIter = 5

In [56]:
print("Root-mean-square error (regParam = 0.1) = " + str(rmse))
print("Root-mean-square error (regParam = 0.5) = " + str(rmse1))
print("Root-mean-square error (regParam = 1.0) = " + str(rmse2))

Root-mean-square error (regParam = 0.1) = 1.0146066361214776
Root-mean-square error (regParam = 0.5) = 1.2146362150064365
Root-mean-square error (regParam = 1.0) = 1.4815544500971018


## maxIter = 10

In [57]:
print("Root-mean-square error (regParam = 0.1) = " + str(rmse3))
print("Root-mean-square error (regParam = 0.5) = " + str(rmse4))
print("Root-mean-square error (regParam = 1.0) = " + str(rmse5))

Root-mean-square error (regParam = 0.1) = 0.9640166302412116
Root-mean-square error (regParam = 0.5) = 1.2113918395225025
Root-mean-square error (regParam = 1.0) = 1.4815608073452533


## maxIter = 20

In [58]:
print("Root-mean-square error (regParam = 0.1) = " + str(rmse6))
print("Root-mean-square error (regParam = 0.5) = " + str(rmse7))
print("Root-mean-square error (regParam = 1.0) = " + str(rmse8))

Root-mean-square error (regParam = 0.1) = 0.941851252725291
Root-mean-square error (regParam = 0.5) = 1.211892353780886
Root-mean-square error (regParam = 1.0) = 1.48156082108903


# Conclusion

The Root-mean-square error (RMSE) appears to decrease when the `maxIter `parameter value increases for all values of `regParam`. Increases in `maxIter` from 5 to 10 or 20 specifically led to lower values of RMSE for each value of regParam.
Additionally, regardless of the value of the `maxIter` parameter, it appears that raising the `regParam` parameter often results in greater values of RMSE. But in this case `maxIter = 20` and `regParam = 0.1` has lowest RMSE than other, that indicates a better fit of the regression model to the data.

# Generate to 10 movie

In [54]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

In [55]:
# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)