In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, hour, dayofyear, count, sum

In [2]:
# Obtain the Spark session (created on first use, reused if one already exists)
# under the application name "Databases II"; every later cell goes through it.
spark = SparkSession.builder.appName("Databases II").getOrCreate()

In [3]:
# Read rating.csv as a DataFrame: the first row supplies the column names,
# fields are comma-separated, and Spark infers each column's type.
csv_reader = spark.read.format("csv")
rating = csv_reader.options(
    header='true',
    delimiter=",",
    inferSchema="true",
).load("rating.csv")

In [4]:
# Keep userId, movieId and rating unchanged and cast 'timestamp' to Spark's
# timestamp type, stored back under the same column name.
rating = rating.select(
    'userId',
    'movieId',
    'rating',
    col('timestamp').cast('timestamp').alias('timestamp'),
)

In [5]:
# Extend 'rating' with three columns derived from 'timestamp':
#   year        - the calendar year
#   day_of_year - the day number within that year
#   hour        - the hour of the day
rating = rating.select(
    'userId',
    'movieId',
    'rating',
    'timestamp',
    year('timestamp').alias('year'),
    dayofyear('timestamp').alias('day_of_year'),
    hour('timestamp').alias('hour'),
)

In [6]:
# Build 'same_time': one row per (movieId, year, day_of_year, hour) combination,
# with 'viewers' holding how many userId entries of 'rating' fall into that group,
# i.e. how many ratings the movie received within the same hour.
same_time = (
    rating
    .groupBy('movieId', 'year', 'day_of_year', 'hour')
    .agg(count('userId').alias('viewers'))
)

In [7]:
# From 'same_time' keep only the groups that have more than one viewer and
# add up the viewers of those groups into a single 'total_viewers' value.
#
# show() only prints the table and returns None, so the aggregated DataFrame is
# bound to query9 first and displayed in a separate step; query9 stays usable.
# `sum` here is pyspark.sql.functions.sum (imported in the first cell), not the
# Python builtin.
query9 = (
    same_time
    .filter(same_time["viewers"] > 1)
    .agg(sum('viewers').alias('total_viewers'))
)
query9.show()

+-------------+
|total_viewers|
+-------------+
|      4281178|
+-------------+



In [8]:
# Stop the Spark session held in `spark`; it must be re-created before any
# further Spark work in this notebook.
spark.stop()