# PySpark DataFrames
- Filter operation
- Building filter conditions with the operators `&`, `|`, `==`
- Negating a condition with `~`

In [9]:
from pyspark.sql import SparkSession

In [10]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName('filter')
    .getOrCreate()
)

In [11]:
# Load the CSV: the first row supplies the column names and the column
# types are inferred from the data rather than all read as strings
movies_csv = "archive/HighestGrossers_Cleaned.csv"
df_pyspark = spark.read.csv(movies_csv, header="true", inferSchema=True)

In [12]:
# Display the loaded DataFrame's rows as a text table
df_pyspark.show()

+----+--------------------+---------+-----------+------------------+--------------+---------------------+------------+
|YEAR|               MOVIE|    GENRE|MPAA RATING|       DISTRIBUTOR|TOTAL FOR YEAR|TOTAL IN 2019 DOLLARS|TICKETS SOLD|
+----+--------------------+---------+-----------+------------------+--------------+---------------------+------------+
|1995|      Batman Forever|    Drama|      PG-13|      Warner Bros.|     184031112|            387522978|    42306002|
|1996|    Independence Day|Adventure|      PG-13|  20th Century Fox|     306169255|            634504608|    69269062|
|1997|        Men in Black|Adventure|      PG-13|     Sony Pictures|     250650052|            500207943|    54607854|
|1998|             Titanic|Adventure|      PG-13|Paramount Pictures|     443319081|            865842808|    94524324|
|1999|Star Wars Ep. I: ...|Adventure|         PG|  20th Century Fox|     430443350|            776153749|    84732942|
|2000|How the Grinch St...|Adventure|         PG

In [13]:
# Print each column's name, data type, and nullability
df_pyspark.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MOVIE: string (nullable = true)
 |-- GENRE: string (nullable = true)
 |-- MPAA RATING: string (nullable = true)
 |-- DISTRIBUTOR: string (nullable = true)
 |-- TOTAL FOR YEAR: integer (nullable = true)
 |-- TOTAL IN 2019 DOLLARS: integer (nullable = true)
 |-- TICKETS SOLD: integer (nullable = true)



In [21]:
from pyspark.sql.functions import col

## Movies with tickets sold less than or equal to 50 million
at_most_50m = col("TICKETS SOLD") <= 50_000_000
df_pyspark.filter(at_most_50m).show()

+----+--------------------+---------+-----------+-------------+--------------+---------------------+------------+
|YEAR|               MOVIE|    GENRE|MPAA RATING|  DISTRIBUTOR|TOTAL FOR YEAR|TOTAL IN 2019 DOLLARS|TICKETS SOLD|
+----+--------------------+---------+-----------+-------------+--------------+---------------------+------------+
|1995|      Batman Forever|    Drama|      PG-13| Warner Bros.|     184031112|            387522978|    42306002|
|2000|How the Grinch St...|Adventure|         PG|    Universal|     253367455|            430583644|    47006948|
|2007|        Spider-Man 3|Adventure|      PG-13|Sony Pictures|     336530303|            448054878|    48914288|
|2011|Harry Potter and ...|   Action|      PG-13| Warner Bros.|     381011219|            440108798|    48046812|
|2014|Guardians of the ...|Adventure|      PG-13|  Walt Disney|     333055258|            373413235|    40765637|
|2020|   Bad Boys For Life|     null|          R|Sony Pictures|     204417855|          

In [31]:
## Movies with tickets sold greater than 50 million,
## obtained by negating the "<= 50 million" condition with ~
at_most_50m = col("TICKETS SOLD") <= 50_000_000
df_pyspark.filter(~at_most_50m).show()

+----+--------------------+---------+-----------+------------------+--------------+---------------------+------------+
|YEAR|               MOVIE|    GENRE|MPAA RATING|       DISTRIBUTOR|TOTAL FOR YEAR|TOTAL IN 2019 DOLLARS|TICKETS SOLD|
+----+--------------------+---------+-----------+------------------+--------------+---------------------+------------+
|1996|    Independence Day|Adventure|      PG-13|  20th Century Fox|     306169255|            634504608|    69269062|
|1997|        Men in Black|Adventure|      PG-13|     Sony Pictures|     250650052|            500207943|    54607854|
|1998|             Titanic|Adventure|      PG-13|Paramount Pictures|     443319081|            865842808|    94524324|
|1999|Star Wars Ep. I: ...|Adventure|         PG|  20th Century Fox|     430443350|            776153749|    84732942|
|2001|Harry Potter and ...|Adventure|         PG|      Warner Bros.|     300404434|            486166890|    53074988|
|2002|          Spider-Man|Adventure|      PG-13

In [22]:
# Top Grossing Movies after 2010 (YEAR and MOVIE columns only).
# The condition is given as a SQL expression string instead of a Column object.
df_pyspark.filter("YEAR>2010").select(['YEAR', 'MOVIE']).show()

+----+--------------------+
|YEAR|               MOVIE|
+----+--------------------+
|2011|Harry Potter and ...|
|2012|        The Avengers|
|2013|          Iron Man 3|
|2014|Guardians of the ...|
|2015|Star Wars Ep. VII...|
|2016|        Finding Dory|
|2017|Star Wars Ep. VII...|
|2018|       Black Panther|
|2019|   Avengers: Endgame|
|2020|   Bad Boys For Life|
|2021|Shang-Chi and the...|
+----+--------------------+



In [29]:
# Movies whose TOTAL IN 2019 DOLLARS exceeds 700,000,000 AND whose
# TICKETS SOLD exceeds 90,000,000; the two conditions are combined with &
gross_over_700m = df_pyspark["TOTAL IN 2019 DOLLARS"] > 700000000
tickets_over_90m = df_pyspark["TICKETS SOLD"] > 90000000
df_pyspark.filter(gross_over_700m & tickets_over_90m).show()

+----+-----------------+---------+-----------+------------------+--------------+---------------------+------------+
|YEAR|            MOVIE|    GENRE|MPAA RATING|       DISTRIBUTOR|TOTAL FOR YEAR|TOTAL IN 2019 DOLLARS|TICKETS SOLD|
+----+-----------------+---------+-----------+------------------+--------------+---------------------+------------+
|1998|          Titanic|Adventure|      PG-13|Paramount Pictures|     443319081|            865842808|    94524324|
|2019|Avengers: Endgame|     null|      PG-13|       Walt Disney|     858373000|            858373002|    93708843|
+----+-----------------+---------+-----------+------------------+--------------+---------------------+------------+

