In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, avg, row_number, rank
from pyspark.sql.window import Window

In [2]:
# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Databases II")
    .getOrCreate()
)

In [3]:
# Read movie.csv as a comma-delimited file whose first line is a header,
# asking Spark to infer the column types from the data.
movie = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("movie.csv")
)

In [4]:
# Read rating.csv as a comma-delimited file whose first line is a header,
# asking Spark to infer the column types from the data.
rating = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("rating.csv")
)

In [5]:
# Keep the four rating columns, re-typing 'timestamp' as a Spark timestamp
# (the original notebook expects it to have been loaded as a string).
timestamp_typed = col("timestamp").cast("timestamp").alias("timestamp")
rating = rating.select("userId", "movieId", "rating", timestamp_typed)

In [6]:
# Re-project 'rating' with one extra column, 'yearNum', holding the calendar
# year extracted from each row's 'timestamp'.
rating = rating.select(
    "userId",
    "movieId",
    "rating",
    "timestamp",
    year(col("timestamp")).alias("yearNum"),
)

In [7]:
# Inner-join ratings to movies on their shared 'movieId' key; ratings with no
# matching movie (and movies with no ratings) are dropped.
joined = rating.join(movie, on="movieId", how="inner")

In [8]:
# Build 'top': one row per (title, yearNum) pair carrying the mean of all
# ratings that title received in that year.
# NOTE(review): the grouping key is 'title', not 'movieId', so distinct movies
# that happen to share a title would be averaged together -- confirm titles
# are unique in movie.csv.
top = (
    joined
    .groupBy("title", "yearNum")
    .agg(avg("rating").alias("average_rating"))
)

In [9]:
# Window spec: one partition per year, rows ordered from highest to lowest
# average rating, with the title (ascending) breaking ties deterministically.
window = (
    Window
    .partitionBy("yearNum")
    .orderBy(col("average_rating").desc(), col("title").asc())
)

In [10]:
# Number the rows of each yearly partition 1, 2, 3, ... in window order and
# keep only positions 1 through 10, i.e. at most ten titles per year.
top = (
    top
    .select("*", row_number().over(window).alias("rank"))
    .where(col("rank") <= 10)
)

In [11]:
# Order the results first by ascending yearNum and then by ascending rank,
# keep the sorted DataFrame in 'query4', and print up to 1000 rows of it.
# DataFrame.show() only prints and returns None, so it must not be chained
# onto the assignment -- doing so would leave 'query4' bound to None.
query4 = top.orderBy(top["yearNum"].asc(), top["rank"].asc())
query4.show(1000)

+--------------------+-------+------------------+----+
|               title|yearNum|    average_rating|rank|
+--------------------+-------+------------------+----+
|Seven (a.k.a. Se7...|   1995|               5.0|   1|
|Double Life of Ve...|   1995|               4.0|   2|
|Fish Called Wanda...|   1995|               3.0|   3|
|   Get Shorty (1995)|   1995|               3.0|   4|
|Substance of Fire...|   1996| 4.708333333333333|   1|
|I Can't Sleep (J'...|   1996| 4.538461538461538|   2|
|Wallace & Gromit:...|   1996| 4.519774011299435|   3|
|Schindler's List ...|   1996|4.5161642205474015|   4|
|Somebody is Waiti...|   1996|               4.5|   5|
|Thieves (Voleurs,...|   1996|               4.5|   6|
|Wallace & Gromit:...|   1996| 4.488428745432399|   7|
|Shawshank Redempt...|   1996|4.4781818181818185|   8|
|Marvin's Room (1996)|   1996| 4.388888888888889|   9|
|   Casablanca (1942)|   1996| 4.386454183266932|  10|
|Dangerous Beauty ...|   1997|               5.0|   1|
|Fallen An

In [None]:
# Stop the Spark session now that the query has been run and displayed.
spark.stop()