In [2]:
%pip install numpy

Collecting numpy
  Downloading numpy-2.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Downloading numpy-2.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-2.2.5
[0mNote: you may need to restart the kernel to use updated packages.


In [14]:
from pyspark.sql import SparkSession

In [15]:
# One SparkSession for the whole notebook; getOrCreate() hands back an already
# running session when there is one.
# To pin the default filesystem explicitly, add before getOrCreate():
#   .config("spark.hadoop.fs.defaultFS", "hdfs://bigdata-node:8088")
spark = (
    SparkSession.builder
    .appName("MovieLensAnalysis")
    # JVM heap requested for the driver and for the executors
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    # off-heap memory switched on, sized at 2g
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    # shuffle parallelism and unified-memory tuning
    .config("spark.sql.shuffle.partitions", "20")
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.3")
    .getOrCreate()
)

In [16]:
# Load the ratings file from HDFS, using its first line as column names.
# No schema is supplied, so the columns arrive as strings and are cast later.
ratings_df = spark.read.csv("hdfs:///data/rating.csv", header=True)

In [17]:
# Peek at the first five rating rows to confirm the header row was picked up.
ratings_df.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



In [18]:
# Load the movie catalogue from HDFS, using its first line as column names.
movies_df = spark.read.csv("hdfs:///data/movie.csv", header=True)

In [19]:
# Peek at the first five catalogue rows (long strings are truncated by show()).
movies_df.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [20]:
# Attach each movie's title to its ratings. Only movieId and title are taken
# from the catalogue; the left join keeps every rating row even when the
# catalogue has no matching movieId.
df_data = ratings_df.join(movies_df.select("movieId", "title"), "movieId", "left")

In [21]:
# Check the join result: the rating columns plus the movie title.
df_data.show(5)

+-------+------+------+-------------------+--------------------+
|movieId|userId|rating|          timestamp|               title|
+-------+------+------+-------------------+--------------------+
|      2|     1|   3.5|2005-04-02 23:53:47|      Jumanji (1995)|
|     29|     1|   3.5|2005-04-02 23:31:16|City of Lost Chil...|
|     32|     1|   3.5|2005-04-02 23:33:39|Twelve Monkeys (a...|
|     47|     1|   3.5|2005-04-02 23:32:07|Seven (a.k.a. Se7...|
|     50|     1|   3.5|2005-04-02 23:29:40|Usual Suspects, T...|
+-------+------+------+-------------------+--------------------+
only showing top 5 rows



In [22]:
# Shorten the id column names; the ALS estimator further down is configured
# with userCol="user" and itemCol="movie".
df_data = df_data.withColumnsRenamed({"userId": "user", "movieId": "movie"})

In [23]:
# Confirm the id columns now appear as `movie` and `user`.
df_data.show(5)

+-----+----+------+-------------------+--------------------+
|movie|user|rating|          timestamp|               title|
+-----+----+------+-------------------+--------------------+
|    2|   1|   3.5|2005-04-02 23:53:47|      Jumanji (1995)|
|   29|   1|   3.5|2005-04-02 23:31:16|City of Lost Chil...|
|   32|   1|   3.5|2005-04-02 23:33:39|Twelve Monkeys (a...|
|   47|   1|   3.5|2005-04-02 23:32:07|Seven (a.k.a. Se7...|
|   50|   1|   3.5|2005-04-02 23:29:40|Usual Suspects, T...|
+-----+----+------+-------------------+--------------------+
only showing top 5 rows



In [24]:
from pyspark.sql.functions import col

# Convert the string columns read from CSV into the types used for modelling.
#
# `timestamp` holds "yyyy-MM-dd HH:mm:ss" text. Casting that text straight to
# an integer yields NULL for every row (or raises under ANSI mode), so it is
# parsed as a timestamp first and then converted to epoch seconds. The epoch
# value is interpreted in the Spark session time zone.
df_data = (
    df_data
    .withColumn("user", col("user").cast("int"))
    .withColumn("movie", col("movie").cast("int"))
    .withColumn("rating", col("rating").cast("float"))
    .withColumn("timestamp", col("timestamp").cast("timestamp").cast("long"))
    .withColumn("title", col("title").cast("string"))
)

In [25]:
# 80/20 train/test split with a fixed seed.
df_data_train, df_data_test = df_data.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# ALS collaborative-filtering estimator from Spark ML
from pyspark.ml.recommendation import ALS

# Configure the estimator:
#   - rank-10 factors, 10 iterations, L2 regularisation 0.1
#   - nonnegative=True constrains the factors to be non-negative
#   - coldStartStrategy="drop" removes rows whose prediction would be NaN
#     (users or movies not seen during fitting)
#   - seed is fixed so that factor initialisation, and therefore the fitted
#     model, is repeatable across runs; it matches the seed used for the split
als = ALS(maxIter=10, regParam=0.1, rank=10, userCol="user", itemCol="movie",
          ratingCol="rating", coldStartStrategy="drop", nonnegative=True,
          seed=42)

# Fit on the training split only
als_model = als.fit(df_data_train)

[Stage 17:====>                                                   (1 + 11) / 12]

In [None]:
# Score the held-out split with the fitted model. Because the estimator was
# built with coldStartStrategy="drop", rows that cannot be scored are omitted.
dfs_preds = als_model.transform(df_data_test)

In [None]:
# Show five scored rows from the test split.
dfs_preds.show(5)

In [None]:
# Persist the fitted model to HDFS. A plain save() fails when the target path
# already exists, which breaks every re-run of the notebook; going through the
# writer with overwrite() replaces the previous copy instead.
als_model.write().overwrite().save("hdfs:///data/als_prediction_rating")

In [None]:
from pyspark.ml.recommendation import ALSModel

# Read the model back from the same HDFS path it was saved to, to check that
# the persisted copy is usable.
als_model_loaded = ALSModel.load("hdfs:///data/als_prediction_rating")

In [None]:
# Score the same test split with the reloaded model.
dfs_preds_loaded = als_model_loaded.transform(df_data_test)

In [None]:
# Show five rows scored by the reloaded model.
dfs_preds_loaded.show(5)

In [None]:
# Inspect the raw test split (show() with no argument uses its default row limit).
df_data_test.show()

In [None]:
# Collect the ratings submitted by user 8. This reads from ratings_df, which
# still carries the original `userId` column name (the rename was applied to
# df_data only).
user_ratings = ratings_df.where(ratings_df["userId"] == 8)
user_ratings.show()

In [None]:
# Leave the DataFrame as the cell's last expression so the notebook displays it.
user_ratings