In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower

In [2]:
# Create the Spark session used by every cell below (an already-running
# session is reused by getOrCreate).
spark = (
    SparkSession.builder
    .appName("Databases II")
    .getOrCreate()
)

In [3]:
# Load movie.csv as a DataFrame: first line is the header, fields are
# comma-separated, and column types are inferred from the data.
movie = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("movie.csv")
)

In [4]:
# Load tag.csv as a DataFrame: first line is the header, fields are
# comma-separated, and column types are inferred from the data.
tag = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("tag.csv")
)

In [5]:
# Replace the 'tag' column with its lowercase form, exposed as 'lower_tag';
# 'userId', 'movieId' and 'timestamp' are passed through unchanged.
tag = tag.select(
    "userId",
    "movieId",
    lower(col("tag")).alias("lower_tag"),
    "timestamp",
)


In [6]:
# Inner-join movies with their tags on the shared 'movieId' column;
# movies without any tag (and tags without a movie) are dropped.
joined = movie.join(tag, on="movieId", how="inner")

In [7]:
# Build 'boring_movies': one ('title', 'lower_tag') row per movie whose tag
# contains 'boring' and is a single word (no space in the tag).
#
# The single-word condition is applied BEFORE de-duplicating on 'movieId'.
# drop_duplicates keeps an arbitrary row per movie, so de-duplicating first
# could keep a phrase such as 'boring plot' for a movie that is also tagged
# plain 'boring'; a later "NOT LIKE '% %'" filter on this dataframe would then
# drop that movie entirely, and do so non-deterministically between runs.
boring_movies = (
    joined
    .filter(
        joined['lower_tag'].contains('boring')
        & ~joined['lower_tag'].contains(' ')
    )
    .drop_duplicates(subset=['movieId'])
    .select(joined['title'], joined['lower_tag'])
)

In [8]:
# Expose the 'boring_movies' dataframe to Spark SQL under the same name
# (an existing temporary view with that name is replaced).
boring_movies.createOrReplaceTempView(name="boring_movies")

In [9]:
# Query the 'boring_movies' view and print the result sorted by title.
# Tags containing a space are multi-word phrases that merely include the
# word 'boring'; they are not counted, so the WHERE clause excludes them.
# Column values are printed in full (no truncation).
(
    spark
    .sql("SELECT * FROM boring_movies WHERE lower_tag NOT LIKE '% %' ORDER BY title ASC")
    .show(truncate=False)
)

+-------------------------------------+---------+
|title                                |lower_tag|
+-------------------------------------+---------+
|(500) Days of Summer (2009)          |boring   |
|101 Reykjavik (101 Reykjavík) (2000) |boring   |
|12 Years a Slave (2013)              |boring   |
|1408 (2007)                          |boring   |
|1492: Conquest of Paradise (1992)    |boring   |
|2001: A Space Odyssey (1968)         |boring   |
|2010: The Year We Make Contact (1984)|boring   |
|2046 (2004)                          |boring   |
|21 Grams (2003)                      |boring   |
|24 Hour Party People (2002)          |boring   |
|3-Iron (Bin-jip) (2004)              |boring   |
|6 Bullets (2012)                     |boring   |
|633 Squadron (1964)                  |boring   |
|7 Plus Seven (1970)                  |boring   |
|8 Women (2002)                       |boring   |
|A.I. Artificial Intelligence (2001)  |boring   |
|According to Greta (2009)            |boring   |


In [10]:
# stop the Spark session now that all queries in the notebook have run
spark.stop()