In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

In [2]:
# Obtain the Spark session for this notebook (reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("MovieLensRatingsDistribution")
    .getOrCreate()
)

In [3]:
# Load the tab-separated ratings file into a DataFrame.
# The file is read without a header row, so column names and types are
# declared up front. An explicit schema avoids the extra pass over the data
# that inferSchema=True requires and keeps column types independent of the
# file's contents.
# NOTE(review): assumes integer ids/ratings and Unix-second timestamps
# (BIGINT so large values are never nulled) — confirm against the data file.
data_path = "movielens.txt"
ratings_schema = "user_id INT, movie_id INT, rating INT, timestamp BIGINT"
movies_df = spark.read.csv(data_path, sep="\t", schema=ratings_schema)

In [4]:
# Count the non-null ratings for every (movie_id, rating) pair,
# ordered by movie and then by rating value
ratings_per_value = count(col("rating")).alias("rating_count")
rating_distribution = (
    movies_df
    .groupBy("movie_id", "rating")
    .agg(ratings_per_value)
    .orderBy("movie_id", "rating")
)

In [5]:
# Print the first 20 rows of the distribution (long cell values are truncated)
rating_distribution.show(n=20, truncate=True)

+--------+------+------------+
|movie_id|rating|rating_count|
+--------+------+------------+
|       1|     1|           8|
|       1|     2|          27|
|       1|     3|          96|
|       1|     4|         202|
|       1|     5|         119|
|       2|     1|           8|
|       2|     2|          17|
|       2|     3|          55|
|       2|     4|          42|
|       2|     5|           9|
|       3|     1|          11|
|       3|     2|          20|
|       3|     3|          25|
|       3|     4|          23|
|       3|     5|          11|
|       4|     1|           6|
|       4|     2|          24|
|       4|     3|          57|
|       4|     4|          93|
|       4|     5|          29|
+--------+------+------------+
only showing top 20 rows

