In [62]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from faker  import Faker
import random
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name "MovieRatingsAnalysis".
spark = (
    SparkSession.builder
    .appName("MovieRatingsAnalysis")
    .getOrCreate()
)

# Faker instance used by generate_data() to produce Unix timestamps.
fake = Faker()

In [63]:

def generate_data(num_records=100000):
    """Build a list of synthetic movie-rating records.

    Each record is a tuple ``(movie_id, user_id, rating, timestamp)``:

    * ``movie_id``  - int drawn from ``random.randrange(1, 51)`` (1..50).
    * ``user_id``   - int drawn from ``random.randrange(1, 201)`` (1..200).
    * ``rating``    - float drawn uniformly from [1.0, 5.0], rounded to
      one decimal place.
    * ``timestamp`` - int Unix timestamp from the module-level ``fake``
      instance, requested with ``start_datetime="-2y"``.

    Args:
        num_records: Number of tuples to generate.

    Returns:
        A list of ``num_records`` tuples in generation order.
    """
    records = []
    for _ in range(num_records):
        # Keep the draw order (movie, user, rating, timestamp) stable so a
        # seeded run consumes the random stream in the same sequence.
        movie_id = random.randrange(1, 51)
        user_id = random.randrange(1, 201)
        rating = round(random.uniform(1.0, 5.0), 1)
        timestamp = int(fake.unix_time(start_datetime="-2y"))
        records.append((movie_id, user_id, rating, timestamp))
    return records
# Explicit schema for the synthetic ratings table. All columns are nullable.
# Rating is a 32-bit FloatType, which is why aggregated averages further down
# display float32 rounding noise (e.g. 3.0437500029802322).
schema = StructType([
    StructField("MovieID", IntegerType(), True),
    StructField("UserID", IntegerType(), True),
    StructField("Rating", FloatType(), True),
    StructField("Timestamp", IntegerType(), True)
])

# Seed both random sources before generating data so that a fresh
# Restart & Run All yields the same MovieID / UserID / Rating values.
# Timestamps are requested relative to the current time ("-2y"), so they
# may still vary between runs made at different times.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

data = generate_data(1000)
df = spark.createDataFrame(data, schema=schema)
df.show(5)


+-------+------+------+----------+
|MovieID|UserID|Rating| Timestamp|
+-------+------+------+----------+
|     49|   105|   4.0|1729434063|
|      5|   159|   2.6|1705842893|
|     22|   118|   3.5|1735913134|
|      6|   124|   3.6|1718814364|
|     12|   138|   3.3|1739770119|
+-------+------+------+----------+
only showing top 5 rows



In [58]:
from pyspark.sql import functions as F
# Mean rating per movie: one output row per distinct MovieID.
average = (
    df.groupBy("MovieID")
    .agg(F.avg("Rating").alias("Average_Rating"))
)
average.show(5)


+-------+------------------+
|MovieID|    Average_Rating|
+-------+------------------+
|     31|3.0437500029802322|
|     34| 2.921212116877238|
|     28| 3.015789458626195|
|     26| 3.054999989271164|
|     27|2.8578947155099166|
+-------+------------------+
only showing top 5 rows



In [59]:

# Number of ratings submitted by each user.
# Use F.count (already imported above as `F`) rather than a bare `count`,
# which is only imported in a later cell and would raise NameError when the
# notebook is run top to bottom on a fresh kernel.
tot_count = df.groupBy("UserID").agg(F.count("UserID").alias("MoviesRated"))

# Keep only users with more than 5 ratings.
users = tot_count.filter(F.col("MoviesRated") > 5)
users.show(5)


+------+-----------+
|UserID|MoviesRated|
+------+-----------+
|   148|          6|
|   133|          7|
|   108|          9|
|   193|          6|
|   126|          9|
+------+-----------+
only showing top 5 rows



In [60]:

# Average rating per movie. The grouping key uses the schema's exact column
# name ("MovieID"): the previous "MovieId" spelling only resolved because
# Spark's column resolution is case-insensitive by default, and it produced
# an output column whose name did not match the rest of the notebook.
avg_ratings = df.groupBy("MovieID").agg(F.avg("Rating").alias("AvgRating"))

# Five movies with the highest average rating, best first.
top5 = avg_ratings.orderBy(F.col("AvgRating").desc()).limit(5)
top5.show()


+-------+------------------+
|MovieId|         AvgRating|
+-------+------------------+
|     33|3.5666666598547074|
|     41|3.5157894774487146|
|     16|3.4857142936615717|
|     42| 3.477777812216017|
|     37|3.3437500074505806|
+-------+------------------+



In [61]:
from pyspark.sql.functions import count
import plotly.express as px
# Count how many ratings each user submitted, then bring the (small)
# per-user table to pandas for plotting.
user_counts = df.groupBy("UserID").agg(count("MovieID").alias("MoviesRated"))
user_counts_pd = user_counts.toPandas()

# Distribution of users by number of movies rated.
fig = px.histogram(
    user_counts_pd,
    x="MoviesRated",
    nbins=20,
    title="Users vs. Number of Movies Rated",
    labels={"MoviesRated": "Number of Movies Rated"},
    opacity=0.75,
)
# The histogram's y axis is a computed count, not a data column, so the
# `labels` mapping is not applied to it; set the axis title explicitly.
fig.update_layout(yaxis_title="Number of Users")
fig.show()
