- numBlocks : parallelization of a computer 
(-1 imply auto-config)
- rank : number of latent factors in the model
- iterations ( number of iterations that have to run
- lambda: specify regularization
- implicitPref specify feedback for the ALS
- alpha

In [5]:
!pip install pyspark py4j

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=94dc84fd82fb1f046cd56389b92e079510ddb387caf27377fa405e7280b589ca
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [8]:
spark = SparkSession.builder.appName("movielens").getOrCreate()

In [9]:
df = spark.read.csv("/content/sample_data/movielens_rating.csv", inferSchema=True, header=True)

In [10]:
df.show()


+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [11]:
df.describe().show()

+-------+------------------+----------------+------------------+
|summary|            userId|         movieId|            rating|
+-------+------------------+----------------+------------------+
|  count|            100836|          100836|            100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|
|    min|                 1|               1|               0.5|
|    max|               610|          193609|               5.0|
+-------+------------------+----------------+------------------+



In [12]:
(train, test) = df.randomSplit([0.7, 0.3], seed=42)

In [13]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")

In [14]:
model = als.fit(train)

In [15]:
predict = model.transform(test)

In [16]:
predict.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|    804|   4.0| 3.7133982|
|     1|    362|   5.0|   4.86761|
|     1|    673|   3.0| 3.6269677|
|     1|    593|   4.0| 4.6645975|
|     1|    101|   5.0| 3.5708053|
|     1|    596|   5.0| 4.4156804|
|     1|    157|   5.0|  3.925525|
|     1|      6|   4.0| 4.3128166|
|     1|    923|   5.0| 4.8532896|
|     1|    235|   4.0| 4.0579624|
|     1|    780|   3.0|  4.271947|
|     1|    590|   4.0| 4.1777043|
|     1|    231|   5.0| 3.3686543|
|     1|    552|   4.0| 3.9265242|
|     1|    423|   3.0|  2.946643|
|     1|    441|   4.0| 4.4370317|
|     1|    543|   4.0|   4.61752|
|     1|    527|   5.0| 3.9532664|
|     1|    151|   5.0| 4.0183005|
|     1|    260|   5.0|  4.707357|
+------+-------+------+----------+
only showing top 20 rows



In [18]:
eval = RegressionEvaluator(metricName = "rmse", labelCol="rating",predictionCol="prediction")

In [20]:
rmse= eval.evaluate(predict)
print(f"RMSE: {rmse}")

RMSE: nan


In [21]:
user_1 = test.filter(test['userId'] == 1).select(['movieId', "userId"])

In [22]:
user_1.show()

+-------+------+
|movieId|userId|
+-------+------+
|      6|     1|
|    101|     1|
|    151|     1|
|    157|     1|
|    231|     1|
|    235|     1|
|    260|     1|
|    349|     1|
|    362|     1|
|    423|     1|
|    441|     1|
|    527|     1|
|    543|     1|
|    552|     1|
|    590|     1|
|    593|     1|
|    596|     1|
|    673|     1|
|    780|     1|
|    804|     1|
+-------+------+
only showing top 20 rows



In [23]:
recommend = model.transform(user_1)

In [24]:
recommend.orderBy("prediction", ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|    940|     1|  5.626531|
|   1031|     1|  5.452162|
|   2478|     1|  5.323901|
|   2959|     1|  5.057724|
|   1396|     1| 5.0410304|
|   1258|     1| 4.9596305|
|   2005|     1| 4.8964596|
|    362|     1|   4.86761|
|    923|     1| 4.8532896|
|   2529|     1|  4.851065|
|   2000|     1|  4.743887|
|    260|     1|  4.707357|
|   1198|     1| 4.6853414|
|    593|     1| 4.6645975|
|   1197|     1| 4.6285534|
|    543|     1|   4.61752|
|   1220|     1| 4.5622234|
|   1214|     1| 4.4850707|
|    441|     1| 4.4370317|
|   1805|     1|  4.432716|
+-------+------+----------+
only showing top 20 rows

