In [1]:
import pyspark
# Reuse the running context/session when this cell is executed again:
# calling pyspark.SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
# The builder attaches to the context obtained above instead of starting a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Read DataFrame from data source

In [2]:
# Remote location of the CSV and the bare file name used to look it up
# once Spark has fetched it.
n = 'https://raw.githubusercontent.com/sehgalayush1/movie_revenue/master/ml_modules/data/BollywoodMovieDetail.csv'
f = 'BollywoodMovieDetail.csv'

# Ask Spark to download the file, then resolve where it was placed locally.
sc.addFile(n)
local_csv_path = pyspark.SparkFiles.get(f)

# First row supplies the column names; column types are inferred from the data.
movies = spark.read.csv(local_csv_path, header=True, inferSchema=True)

In [3]:
# Register `movies` under the name "tempView" so the spark.sql() queries
# below can refer to it; re-running replaces any existing view of that name.
movies.createOrReplaceTempView("tempView")

# Sample SQL queries

In [5]:
# Titles released after 2010, in reverse alphabetical order.
top_titles_query = (
    "SELECT title FROM tempView "
    "WHERE releaseYear > 2010 "
    "ORDER BY title desc"
)
# Print only the first three rows of the result.
spark.sql(top_titles_query).show(3)

+--------------------+
|               title|
+--------------------+
|            Zokkomon|
|   Zindagi Tere Naam|
|Zindagi Na Milegi...|
+--------------------+
only showing top 3 rows



In [6]:
# Label each post-2010 movie by comparing hitFlop with 5. The CASE has no
# ELSE branch, so rows matching none of the three comparisons get NULL.
rating_query = (
    "SELECT title, "
    "CASE WHEN hitFlop < 5 THEN 'below average' "
    "WHEN hitFlop = 5 THEN 'average' "
    "WHEN hitFlop > 5 THEN 'above average' "
    "END AS MovieRating "
    "FROM tempView "
    "WHERE releaseYear > 2010 "
    "ORDER BY title desc"
)
df = spark.sql(rating_query)

In [7]:
# Print the rating query result; show() without arguments stops after 20 rows.
df.show()

+--------------------+-------------+
|               title|  MovieRating|
+--------------------+-------------+
|            Zokkomon|below average|
|   Zindagi Tere Naam|below average|
|Zindagi Na Milegi...|above average|
|       Zindagi 50 50|below average|
|                 Zid|below average|
|            Zed Plus|below average|
|             Zanjeer|below average|
|         Youngistaan|below average|
|   Yeh Saali Zindagi|below average|
| Yeh Jo Mohabbat Hai|below average|
|Yeh Jawaani Hai D...|above average|
|    Yeh Hai Bakrapur|below average|
|         Yeh Faasley|below average|
|      Ye Stupid Pyar|below average|
|Yamla Pagla Deewa...|below average|
| Yamla Pagla Deewana|above average|
|            Yaariyan|below average|
|              Ya Rab|below average|
|       What the Fish|below average|
+--------------------+-------------+
only showing top 20 rows



# Create a table

In [8]:
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE fails
# once spark_course is already present in the catalog.
spark.sql("CREATE DATABASE IF NOT EXISTS spark_course")

DataFrame[]

In [9]:
# Make spark_course the current database so the unqualified table names
# used in the following cells are created and looked up there.
spark.sql("USE spark_course")

DataFrame[]

In [34]:
# List up to 5 entries visible from the current database, printing table
# names in full rather than truncating them.
# NOTE(review): the saved output of this cell (execution count 34) lists
# tables that are only created further down, so it appears to have been
# captured after the later cells ran - re-run top to bottom to refresh it.
spark.sql("SHOW TABLES").show(5, truncate=False)

+------------+---------------------------------------+-----------+
|namespace   |tableName                              |isTemporary|
+------------+---------------------------------------+-----------+
|spark_course|movieshortdetailunmanaged              |false      |
|spark_course|movieshortdetailusingdataframe         |false      |
|spark_course|movieshortdetailusingdataframeunmanaged|false      |
|            |tempview                               |true       |
+------------+---------------------------------------+-----------+



In [12]:
# Persist `movies` as a table in the current database. The default save mode
# raises when the table already exists, which breaks re-running the notebook;
# "overwrite" replaces the previous copy instead.
movies.write.mode("overwrite").saveAsTable("movieShortDetailUsingDataFrame")

In [13]:
# Confirm the table saved above is now listed; names are printed untruncated.
spark.sql("SHOW TABLES").show(5, truncate=False)

+------------+------------------------------+-----------+
|namespace   |tableName                     |isTemporary|
+------------+------------------------------+-----------+
|spark_course|movieshortdetailusingdataframe|false      |
|            |tempview                      |true       |
+------------+------------------------------+-----------+



In [20]:
# Start from a clean slate. IF EXISTS lets this succeed on a fresh catalog,
# where a plain DROP TABLE would fail because the table is not there yet.
spark.sql("DROP TABLE IF EXISTS movieShortDetailUnmanaged")
# Declare a two-column parquet table whose data lives at an explicit path.
# This only defines the table; nothing is written to data/moviesFile here.
spark.sql("CREATE TABLE movieShortDetailUnmanaged (imdbID STRING, title STRING) USING parquet OPTIONS (PATH 'data/moviesFile')")

DataFrame[]

In [26]:
# Read every column of the path-backed table declared above.
# Note: this rebinds `df`, which previously held the rating query result.
df = spark.sql("SELECT * FROM movieShortDetailUnmanaged")

In [27]:
# The recorded output is just the header row: no cell shown in this notebook
# inserts rows into movieShortDetailUnmanaged or writes to its path.
df.show()

+------+-----+
|imdbID|title|
+------+-----+
+------+-----+



In [28]:
# Preview the source DataFrame (all ten columns) for comparison with the
# two-column table above; show() truncates long cell values with "...".
movies.show()

+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|   imdbId|               title|releaseYear|releaseDate|               genre|             writers|              actors|           directors|sequel|hitFlop|
+---------+--------------------+-----------+-----------+--------------------+--------------------+--------------------+--------------------+------+-------+
|tt0118578|              Albela|       2001|20 Apr 2001|             Romance|Honey Irani (scre...|Govinda | Aishwar...|       Deepak Sareen|     0|      2|
|tt0169102|Lagaan: Once Upon...|       2001|08 May 2002|Adventure | Drama...|Ashutosh Gowarike...|Aamir Khan | Grac...|  Ashutosh Gowariker|     0|      6|
|tt0187279|Meri Biwi Ka Jawa...|       2004|02 Jul 2004|     Action | Comedy|                 N/A|Akshay Kumar | Sr...|Pankaj Parashar |...|     0|      1|
|tt0222024|Hum Tumhare Hain ...|       2002|24 May 2002|     Dra

In [30]:
# Disabled on purpose: the statement below saves `movies` as the table
# movieShortDetailUsingDataFrameUnmanaged with its data at an explicit path.
# The next cell queries that table, so on a fresh catalog this line has to be
# run once first (it will fail if the table already exists - presumably why
# it was commented out; TODO confirm).
# movies.write.option("path","data/shortMovieDetail.csv").saveAsTable("movieShortDetailUsingDataFrameUnmanaged")

In [32]:
# Read back the table saved from the DataFrame with an explicit path.
# Requires that table to already exist in spark_course: the statement that
# creates it is commented out in the preceding cell.
df = spark.sql("SELECT * FROM movieShortDetailUsingDataFrameUnmanaged")

In [33]:
# NOTE(review): the recorded output has the full movies schema but no rows -
# confirm that data actually exists at the table's path before relying on it.
df.show()

+------+-----+-----------+-----------+-----+-------+------+---------+------+-------+
|imdbId|title|releaseYear|releaseDate|genre|writers|actors|directors|sequel|hitFlop|
+------+-----+-----------+-----------+-----+-------+------+---------+------+-------+
+------+-----+-----------+-----------+-----+-------+------+---------+------+-------+

