In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower

In [35]:
# Obtain the SparkSession used by every later cell, labelled "Databases II".
spark = (
    SparkSession.builder
    .appName("Databases II")
    .getOrCreate()
)

In [36]:
# Load rating.csv: the first line is treated as the header, fields are
# comma-separated, and Spark is asked to infer the column types.
rating = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("rating.csv")
)

In [37]:
# Load tag.csv with the same reader settings: header row, comma delimiter,
# inferred column types.
tag = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", inferSchema="true")
    .load("tag.csv")
)

In [38]:
# Lowercase the 'tag' strings and expose them as 'lower_tag'; the other three
# columns are passed through unchanged.
# NOTE: this rebinds `tag`, so the original 'tag' column no longer exists
# afterwards -- re-running this cell requires reloading tag.csv first.
tag = tag.select(
    'userId',
    'movieId',
    lower(col('tag')).alias('lower_tag'),
    'timestamp',
)

In [39]:
# Inner-join ratings with tags on the (userId, movieId) pair, so only
# combinations present in both dataframes remain.
# NOTE(review): `tag` carries a 'timestamp' column; if `rating` has one too,
# the result will hold two columns with that name -- confirm before using it.
joined = rating.join(tag, on=['userId', 'movieId'], how='inner')

In [40]:
# Build 'bollywood_movies' from 'joined':
#   1. keep rows whose 'lower_tag' contains the substring 'bollywood',
#   2. keep a single row per 'userId',
#   3. project down to 'userId', 'rating' and 'lower_tag'.
# NOTE(review): no ordering is applied before the de-duplication, so which
# row survives for a user with several matches is not pinned here -- confirm
# this is acceptable, since later filters on 'rating' depend on that row.
bollywood_movies = (
    joined
    .filter(col('lower_tag').contains('bollywood'))
    .drop_duplicates(subset=['userId'])
    .select('userId', 'rating', 'lower_tag')
)

In [41]:
# Make the 'bollywood_movies' dataframe queryable from Spark SQL under the
# temporary view name "bollywood_movies".
bollywood_movies.createOrReplaceTempView(name="bollywood_movies")

In [42]:
# Query the temp view: keep ratings above 3.0, drop tags containing the
# substring 'not', sort by userId ascending, and print the result table.
spark.sql(
    "SELECT * FROM bollywood_movies "
    "WHERE rating > 3.0 AND lower_tag NOT LIKE '%not%' "
    "ORDER BY userId ASC"
).show()

+------+------+-------------------+
|userId|rating|          lower_tag|
+------+------+-------------------+
| 10573|   4.0|          bollywood|
| 19837|   5.0|          bollywood|
| 23333|   4.0|          bollywood|
| 25004|   5.0|          bollywood|
| 31338|   4.5|          bollywood|
| 33323|   3.5|          bollywood|
| 35170|   4.0|          bollywood|
| 40514|   5.0|          bollywood|
| 41165|   4.5|          bollywood|
| 48816|   4.5|bollywood influence|
| 51539|   4.0|          bollywood|
| 54900|   3.5|          bollywood|
| 63618|   3.5|          bollywood|
| 65908|   4.5|          bollywood|
| 70279|   4.5|          bollywood|
| 77137|   5.0|          bollywood|
| 86883|   4.0|          bollywood|
|106755|   4.0|bollywood influence|
|130827|   3.5|bollywood influence|
|131829|   4.0|          bollywood|
+------+------+-------------------+
only showing top 20 rows



In [43]:
# Stop the SparkSession created earlier in this notebook now that the
# analysis is finished.
spark.stop()