<a href="https://colab.research.google.com/github/raihanhd12/BIG-DATA/blob/main/MLlib/ml_recommender_scala.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
os.system("rm -rf /content/ml-predictions.csv")
os.system("rm -rf /content/mymodel")

0

In [4]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, isnull

spark = SparkSession.builder.getOrCreate()

# Define the Rating class
class Rating:
    def __init__(self, userId, movieId, rating, timestamp):
        self.userId = userId
        self.movieId = movieId
        self.rating = rating
        self.timestamp = timestamp

# Define the parseRating function
def parseRating(str):
    fields = str.split("::")
    assert len(fields) == 4
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]), int(fields[3]))

# Test the parseRating function
parseRating("1::1193::5::978300760")

# Load the raw data
raw = spark.sparkContext.textFile("/content/drive/MyDrive/Kuliah/Semester 6/Big Data/ml-1m/ratings.dat")

# Check one record
raw.take(1)

# Parse the ratings and create a DataFrame
ratings = raw.map(parseRating).toDF()

# Check the ratings DataFrame
ratings.show(5)

# Split the data into training and test sets
training, test = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")
model = als.fit(training)

# Save the model
model.save("mymodel")

# Generate predictions on the test set
predictions = model.transform(test)

# Calculate the root mean square error (RMSE)
squared_error = predictions.withColumn("error", (predictions["rating"] - predictions["prediction"])**2)


valid_squared_error = squared_error.filter(~isnull(squared_error["error"]) & ~isnan(squared_error["error"]))

rmse = valid_squared_error.select("error").agg({"error": "mean"}).collect()[0][0] ** 0.5
print("RMSE: ", rmse)

# Show the first 10 predictions
predictions.show(10)

# Save the predictions as a CSV file
predictions.select("userId", "movieId", "rating", "prediction").write.csv("ml-predictions.csv")

+-------+------+---------+------+
|movieId|rating|timestamp|userId|
+-------+------+---------+------+
|   1193|   5.0|978300760|     1|
|    661|   3.0|978302109|     1|
|    914|   3.0|978301968|     1|
|   3408|   4.0|978300275|     1|
|   2355|   5.0|978824291|     1|
+-------+------+---------+------+
only showing top 5 rows

RMSE:  0.891689624277305
+-------+------+---------+------+----------+
|movieId|rating|timestamp|userId|prediction|
+-------+------+---------+------+----------+
|      1|   3.0|973969929|  2564| 4.0736675|
|      1|   3.0|972882514|  2796| 3.1728563|
|      1|   2.0|975730859|   980| 3.1530662|
|      1|   3.0|974242794|  2418| 3.0192087|
|      1|   3.0|970268071|  3036|  4.190822|
|      1|   2.0|977530792|   136|  2.269647|
|      1|   3.0|972055240|  2889| 4.4077973|
|      1|   2.0|972785865|  2806| 3.1014705|
|      1|   2.0|992133251|   151|   4.27976|
|      1|   3.0|972931817|  3041| 3.4233935|
+-------+------+---------+------+----------+
only showing t