# Collaborative Filtering

In [None]:
# Attach Google Drive to the Colab filesystem so the ratings file stored in
# Drive can be read by later cells.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
%%capture
!sudo apt-get update --fix-missing

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
#!wget -q https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

!mv spark-3.0.0-bin-hadoop3.2.tgz sparkkk
!tar xf sparkkk
!pip install -q findspark
!pip install pyspark

In [None]:
import os

# Point Spark at the JDK and the Spark distribution installed by the setup cell.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

# findspark.init() must run before pyspark is imported: it puts the pyspark
# that ships inside SPARK_HOME on sys.path.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build the session exactly once. Previously a first getOrCreate() created the
# session and a second builder tried to set the app name afterwards; because
# getOrCreate() returns the already-running session, that name never reached
# the underlying SparkContext. Master and app name are now set together.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('collaborative-filtering')
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Absolute path under the Drive mount point, so the read does not depend on
# the kernel's current working directory.
RATINGS_PATH = "/content/gdrive/MyDrive/Colab Notebooks/BIG DATA/sample_movielens_ratings.txt"
# Fixed seed so the train/test split (and therefore every RMSE computed from
# it) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Each line of the file has the form userId::movieId::rating::timestamp.
lines = spark.read.text(RATINGS_PATH).rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

# 80% of the ratings for fitting, 20% held out for evaluation.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SPLIT_SEED)


### MaxIter = 5, RegParam = 0.1

In [None]:
# Grid point 0: ALS with maxIter=5 and regParam=0.1.
als0 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=5,
    regParam=0.1,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als0.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse0 = evaluator.evaluate(predictions)

### MaxIter = 5, RegParam = 0.5

In [None]:
# Grid point 1: ALS with maxIter=5 and regParam=0.5.
als1 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=5,
    regParam=0.5,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als1.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse1 = evaluator.evaluate(predictions)

### MaxIter = 5, RegParam = 1

In [None]:
# Grid point 2: ALS with maxIter=5 and regParam=1.
als2 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=5,
    regParam=1,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als2.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse2 = evaluator.evaluate(predictions)

### MaxIter = 10, RegParam = 0.1

In [None]:
# Grid point 3: ALS with maxIter=10 and regParam=0.1.
als3 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als3.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse3 = evaluator.evaluate(predictions)

### MaxIter = 10, RegParam = 0.5

In [None]:
# Grid point 4: ALS with maxIter=10 and regParam=0.5.
als4 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.5,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als4.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse4 = evaluator.evaluate(predictions)

### MaxIter = 10, RegParam = 1

In [None]:
# Grid point 5: ALS with maxIter=10 and regParam=1.
als5 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,
    regParam=1,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als5.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse5 = evaluator.evaluate(predictions)

### MaxIter = 20, RegParam = 0.1

In [None]:
# Grid point 6: ALS with maxIter=20 and regParam=0.1.
als6 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=20,
    regParam=0.1,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als6.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse6 = evaluator.evaluate(predictions)

### MaxIter = 20, RegParam = 0.5

In [None]:
# Grid point 7: ALS with maxIter=20 and regParam=0.5.
als7 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=20,
    regParam=0.5,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als7.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse7 = evaluator.evaluate(predictions)

### MaxIter = 20, RegParam = 1

In [None]:
# Grid point 8: ALS with maxIter=20 and regParam=1.
als8 = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=20,
    regParam=1,
    coldStartStrategy="drop",
)

# Fit on the training split, then score the held-out test split.
model = als8.fit(training)
predictions = model.transform(test)

# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse8 = evaluator.evaluate(predictions)

### RMSE Results

In [None]:
# Report the test-set RMSE of every (MaxIter, RegParam) combination, in the
# order the models were trained. Each entry pairs the printed label with the
# score measured for that configuration.
rmse_report = [
    ("(MaxIter = 5) && (RegParam = 0.1) ==> ", rmse0),
    ("(MaxIter = 5) && (RegParam = 0.5) ==> ", rmse1),
    ("(MaxIter = 5) && (RegParam = 1) ==> ", rmse2),
    ("(MaxIter = 10) && (RegParam = 0.1) ==> ", rmse3),
    ("(MaxIter = 10) && (RegParam = 0.5) ==> ", rmse4),
    ("(MaxIter = 10) && (RegParam = 1) ==> ", rmse5),
    ("(MaxIter = 20) && (RegParam = 0.1) ==> ", rmse6),
    ("(MaxIter = 20) && (RegParam = 0.5) ==> ", rmse7),
    ("(MaxIter = 20) && (RegParam = 1) ==> ", rmse8),
]

print("Root-mean-square error")
for label, score in rmse_report:
    print(label + str(score))

Root-mean-square error
(MaxIter = 5) && (RegParam = 0.1) ==> 1.054592946710491
(MaxIter = 5) && (RegParam = 0.5) ==> 1.2828161971379672
(MaxIter = 5) && (RegParam = 1) ==> 1.5561797582804586
(MaxIter = 10) && (RegParam = 0.1) ==> 1.0392105854174971
(MaxIter = 10) && (RegParam = 0.5) ==> 1.2823122304669128
(MaxIter = 10) && (RegParam = 1) ==> 1.556180714385883
(MaxIter = 20) && (RegParam = 0.1) ==> 1.0496152235443992
(MaxIter = 20) && (RegParam = 0.5) ==> 1.282917999285915
(MaxIter = 20) && (RegParam = 1) ==> 1.5561807186493415


### Conclusion

In [None]:
# The lower the RMSE, the more accurately the collaborative-filtering model
# predicts held-out ratings; a high RMSE means the model needs refinement.
#
# The best configuration is chosen from the measured scores instead of being
# hard-coded. The recorded results above rank MaxIter = 10 / RegParam = 0.1
# (als3, RMSE ~1.039) ahead of MaxIter = 20 / RegParam = 0.1 (als6, RMSE ~1.050),
# so always refitting als6 did not match the numbers it was justified by.
candidates = [
    (rmse0, als0), (rmse1, als1), (rmse2, als2),
    (rmse3, als3), (rmse4, als4), (rmse5, als5),
    (rmse6, als6), (rmse7, als7), (rmse8, als8),
]
best_rmse, best_als = min(candidates, key=lambda pair: pair[0])
print("Selected MaxIter = " + str(best_als.getMaxIter())
      + ", RegParam = " + str(best_als.getRegParam())
      + " (RMSE = " + str(best_rmse) + ")")

# Refit the winning estimator; `model` was overwritten by each grid cell and
# would otherwise still hold the last configuration that was trained.
model = best_als.fit(training)

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(best_als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(best_als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)