In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=a13eec4dd30fa60ecba20ac3be39fe5b32ff64add9b154db0b66a4c555fd5b13
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, cast, sum

In [None]:
# create a SparkSession
spark = SparkSession.builder.appName("CollaborativeFiltering").getOrCreate()

In [None]:
# Read in the first CSV file
movies = spark.read.csv("/content/movies.csv", header=True, inferSchema=True)

# Read in the second CSV file
ratings = spark.read.csv("/content/ratings.csv", header=True, inferSchema=True)

# Join the two DataFrames on a common column
#data = df1.join(df2, df1.movieId == df2.movieId).drop(df2.movieId)
data = movies.join(ratings, on = 'movieId')

# Do something with the joined DataFrame
data.show()

+-------+--------------------+--------------------+------+------+---------+
|movieId|               title|              genres|userId|rating|timestamp|
+-------+--------------------+--------------------+------+------+---------+
|      1|    Toy Story (1995)|Adventure|Animati...|     1|   4.0|964982703|
|      3|Grumpier Old Men ...|      Comedy|Romance|     1|   4.0|964981247|
|      6|         Heat (1995)|Action|Crime|Thri...|     1|   4.0|964982224|
|     47|Seven (a.k.a. Se7...|    Mystery|Thriller|     1|   5.0|964983815|
|     50|Usual Suspects, T...|Crime|Mystery|Thr...|     1|   5.0|964982931|
|     70|From Dusk Till Da...|Action|Comedy|Hor...|     1|   3.0|964982400|
|    101|Bottle Rocket (1996)|Adventure|Comedy|...|     1|   5.0|964980868|
|    110|   Braveheart (1995)|    Action|Drama|War|     1|   4.0|964982176|
|    151|      Rob Roy (1995)|Action|Drama|Roma...|     1|   5.0|964984041|
|    157|Canadian Bacon (1...|          Comedy|War|     1|   5.0|964984100|
|    163|   

In [None]:
data.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [None]:
# handle NULL and NaN values in the data
data = data.na.drop()

In [None]:
# check if there are still any NULL or NaN values in the data
null_count = data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).collect()
print(null_count)

[Row(movieId=0, title=0, genres=0, userId=0, rating=0, timestamp=0)]


In [None]:
# split the data into training and test sets
(training, test) = data.randomSplit([0.8, 0.2])

In [None]:
test.show(20,False)

+-------+----------------+-------------------------------------------+------+------+----------+
|movieId|title           |genres                                     |userId|rating|timestamp |
+-------+----------------+-------------------------------------------+------+------+----------+
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|18    |3.5   |1455209816|
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|19    |4.0   |965705637 |
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|21    |3.5   |1407618878|
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|31    |5.0   |850466616 |
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|50    |3.0   |1514238116|
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|71    |5.0   |864737933 |
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|78    |4.0   |1252575124|
|1      |Toy Story (1995)|Adventure|Anim

In [None]:
# create the ALS model
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
          ratingCol="rating")

In [None]:
# fit the model to the training data
model = als.fit(training)

In [51]:
# make predictions on the test data
predictions = model.transform(test).filter("prediction >= 0")

In [46]:
# evaluate the predictions
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.068018670373856


In [53]:
#user the model to predict
single_user = test.filter(test["userId"] == 2)

recommendations = model.transform(single_user)
recommendations.show()

+-------+--------------------+--------------------+------+------+----------+----------+
|movieId|               title|              genres|userId|rating| timestamp|prediction|
+-------+--------------------+--------------------+------+------+----------+----------+
|    318|Shawshank Redempt...|         Crime|Drama|     2|   3.0|1445714835| 4.0287337|
|  48516|Departed, The (2006)|Crime|Drama|Thriller|     2|   4.0|1445715064| 3.6442614|
|  68157|Inglourious Baste...|    Action|Drama|War|     2|   4.5|1445715154|  3.525458|
| 109487| Interstellar (2014)|         Sci-Fi|IMAX|     2|   3.0|1445715145|  4.221439|
| 112552|     Whiplash (2014)|               Drama|     2|   4.0|1445714882| 3.5430045|
+-------+--------------------+--------------------+------+------+----------+----------+

