In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, count, row_number
from pyspark.sql.window import Window

In [22]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Databases II")
    .getOrCreate()
)

In [23]:
# Read movie.csv as a comma-separated file whose first row holds the column
# names; column types are inferred by Spark from the data.
movie = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("movie.csv")
)

In [24]:
# Read rating.csv as a comma-separated file whose first row holds the column
# names; column types are inferred by Spark from the data.
rating = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("rating.csv")
)

In [25]:
# A movie can list several genres in a single string, separated by '|'.
# Split that string into an array, then explode the array so that every
# (movie, genre) pair ends up on its own row.
genre_array = split(col("genres"), "[|]")
movie = movie.withColumn("genres", explode(genre_array))

In [26]:
# Inner-join movies with their ratings on the shared 'movieId' column.
joined = movie.join(rating, on="movieId", how="inner")

In [27]:
# Count how many ratings each movie received within each of its genres.
# 'movieId' is part of the grouping key so that two different movies that
# share the same title are counted separately instead of having their
# rating counts merged into one row.
top_movie_genre = (
    joined
    .groupBy("movieId", "title", "genres")
    .agg(count("rating").alias("total_ratings"))
)

In [28]:
# Window over each genre, most-rated movie first.
# 'title' is a secondary sort key so that row_number() assigns ranks
# deterministically when two movies in a genre have the same rating count.
window = (
    Window
    .partitionBy("genres")
    .orderBy(col("total_ratings").desc(), col("title").asc())
)

In [29]:
# Number the rows of each genre partition following the window ordering,
# then keep only row 1: the movie with the largest number of ratings
# in that genre.
ranked = top_movie_genre.withColumn("rank", row_number().over(window))
top = ranked.filter(col("rank") == 1)

In [30]:
# Build the final result sorted alphabetically by genre and keep it in
# 'query8'. DataFrame.show() only prints and returns None, so it is called
# separately rather than being part of the assignment.
query8 = (
    top
    .select("genres", "title", "total_ratings")
    .orderBy(col("genres").asc())
)
query8.show()

+------------------+--------------------+-------------+
|            genres|               title|total_ratings|
+------------------+--------------------+-------------+
|(no genres listed)|Doctor Who: The T...|           36|
|            Action|Jurassic Park (1993)|        59715|
|         Adventure|Jurassic Park (1993)|        59715|
|         Animation|    Toy Story (1995)|        49695|
|          Children|    Toy Story (1995)|        49695|
|            Comedy| Pulp Fiction (1994)|        67310|
|             Crime| Pulp Fiction (1994)|        67310|
|       Documentary|Bowling for Colum...|        12280|
|             Drama| Pulp Fiction (1994)|        67310|
|           Fantasy|    Toy Story (1995)|        49695|
|         Film-Noir|L.A. Confidential...|        26836|
|            Horror|Silence of the La...|        63299|
|              IMAX|    Apollo 13 (1995)|        47777|
|           Musical|      Aladdin (1992)|        41842|
|           Mystery|Usual Suspects, T...|       

In [31]:
# Stop the Spark session that was created at the top of the notebook.
spark.stop()