In [1]:
import pyspark
# Build the session through the builder so that re-running this cell reuses
# the live session; constructing SparkContext() directly raises when a context
# is already running in the kernel.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# Keep `sc` available for cells that use the underlying SparkContext.
sc = spark.sparkContext

In [2]:
from pyspark.sql import Row

# Basic operations on rows

In [3]:
# A Row created from positional arguments only.
row = Row(
    "Upcoming New Movie",
    2021,
    "Comedy",
)

In [6]:
# Print each of the three values by position.
for position in range(3):
    print(row[position])

Upcoming New Movie
2021
Comedy


In [7]:
# Two sample records, each a plain tuple of three values.
rows = [
    ("Tom Cruise Movie", 2021, "Comedy"),
    ("Rajinikanth Movie", 2021, "Action"),
]

In [15]:
# Column names are supplied explicitly; no column types are specified here.
df = spark.createDataFrame(
    rows,
    ['Movie Name', 'Release Year', 'Genre'],
)

In [16]:
# Print the column names and types of the sample DataFrame.
df.printSchema()

root
 |-- Movie Name: string (nullable = true)
 |-- Release Year: long (nullable = true)
 |-- Genre: string (nullable = true)



In [17]:
# Render the sample DataFrame as a text table.
df.show()

+-----------------+------------+------+
|       Movie Name|Release Year| Genre|
+-----------------+------------+------+
| Tom Cruise Movie|        2021|Comedy|
|Rajinikanth Movie|        2021|Action|
+-----------------+------------+------+



# DataFrame from data source

In [18]:
# Remote CSV location, and the file name used to look it up after it is added.
n = 'https://raw.githubusercontent.com/sehgalayush1/movie_revenue/master/ml_modules/data/BollywoodMovieDetail.csv'
f = 'BollywoodMovieDetail.csv'

# Register the file with the SparkContext, then read the resolved path as CSV
# with a header row and inferred column types.
sc.addFile(n)
movies = spark.read.csv(
    pyspark.SparkFiles.get(f),
    header=True,
    inferSchema=True,
)

In [20]:
from pyspark.sql.functions import col

# Basic queries

In [21]:
# Filter while `hitFlop` is still part of the frame, then project down to the
# title. Filtering after select("title") references a column that has already
# been projected away.
movies.where(col("hitFlop") > 8).select("title").show()

+--------------------+
|               title|
+--------------------+
|Gadar: Ek Prem Katha|
|            3 Idiots|
|            Dhoom: 3|
|                  PK|
+--------------------+



In [22]:
# Count the rows with hitFlop above 8. No projection is needed for a count,
# and filtering first avoids referencing a column dropped by select().
movies.where(col("hitFlop") > 8).count()

4

In [25]:
# Count the rows whose genre text contains "Romance". The filter runs on the
# full frame so `genre` is still available; a projection adds nothing to a count.
movies.filter(col("genre").contains("Romance")).count()

372

In [27]:
# Count the rows whose genre contains "Romance" and whose releaseYear is after
# 2010. Both conditions are applied as one predicate on the full frame instead
# of being chained after a projection that drops the columns they read.
movies.filter(
    col("genre").contains("Romance") & (col("releaseYear") > 2010)
).count()

116

In [33]:
 # Distinct release years, newest first.
 movies.select("releaseYear").distinct().orderBy(col("releaseYear"), ascending=False).show()

+-----------+
|releaseYear|
+-----------+
|       2014|
|       2013|
|       2012|
|       2011|
|       2010|
|       2009|
|       2008|
|       2007|
|       2006|
|       2005|
|       2004|
|       2003|
|       2002|
|       2001|
+-----------+



# Change column name

In [34]:
# New DataFrame in which the `hitFlop` column is named `Rating`.
moviesNewColDF = movies.withColumnRenamed(existing="hitFlop", new="Rating")

In [35]:
# Print the schema of the renamed frame to confirm the column now appears as `Rating`.
moviesNewColDF.printSchema()

root
 |-- imdbId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseYear: integer (nullable = true)
 |-- releaseDate: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- sequel: string (nullable = true)
 |-- Rating: integer (nullable = true)



# Change column type

In [37]:
from pyspark.sql.functions import to_date
# Parse the text column `releaseDate` with the pattern "d MMM yyyy" into a new
# `launchDate` column, then drop the original text column.
# NOTE(review): some rows end up with a null launchDate (counted further down);
# presumably their releaseDate does not match the pattern — verify against the data.
newDF = (
    movies
    .withColumn("launchDate", to_date(col("releaseDate"), "d MMM yyyy"))
    .drop("releaseDate")
)

In [38]:
# Print the schema of the converted frame: `launchDate` is present and `releaseDate` is gone.
newDF.printSchema()

root
 |-- imdbId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- releaseYear: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- writers: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)
 |-- sequel: string (nullable = true)
 |-- hitFlop: integer (nullable = true)
 |-- launchDate: date (nullable = true)



In [47]:
 # Show up to five rows whose launchDate is null, without truncating cell values.
 newDF.select("releaseYear", "launchDate").filter(col("launchDate").isNull()).show(n=5, truncate=False)

+-----------+----------+
|releaseYear|launchDate|
+-----------+----------+
|2001       |null      |
|2001       |null      |
|2002       |null      |
|2002       |null      |
|2002       |null      |
+-----------+----------+
only showing top 5 rows



In [48]:
 # Number of rows whose launchDate is null.
 newDF.filter(col("launchDate").isNull()).count()

54

# Date functions

In [50]:
from pyspark.sql.functions import year
# Distinct years extracted from launchDate, in ascending order.
(
    newDF
    .select(year(col("launchDate")))
    .distinct()
    .sort(year(col("launchDate")))
    .show()
)

+----------------+
|year(launchDate)|
+----------------+
|            null|
|            2001|
|            2002|
|            2003|
|            2004|
|            2005|
|            2006|
|            2007|
|            2008|
|            2009|
|            2010|
|            2011|
|            2012|
|            2013|
|            2014|
+----------------+



# Aggregations

In [51]:
# Row count per release year, ordered by year.
movies.groupBy("releaseYear").count().sort("releaseYear").show()

+-----------+-----+
|releaseYear|count|
+-----------+-----+
|       2001|   62|
|       2002|   79|
|       2003|   95|
|       2004|   88|
|       2005|  106|
|       2006|   60|
|       2007|   66|
|       2008|   98|
|       2009|   91|
|       2010|  116|
|       2011|  112|
|       2012|   99|
|       2013|  102|
|       2014|  110|
+-----------+-----+



In [52]:
# Reach the Spark aggregate through the module alias so that Python's built-in
# `max` is not shadowed for the rest of the kernel session.
from pyspark.sql import functions as F
movies.select(F.max(col("hitFlop"))).show()

+------------+
|max(hitFlop)|
+------------+
|           9|
+------------+



In [53]:
# Reach the Spark aggregate through the module alias so that Python's built-in
# `min` is not shadowed for the rest of the kernel session.
from pyspark.sql import functions as F
movies.select(F.min(col("hitFlop"))).show()

+------------+
|min(hitFlop)|
+------------+
|           1|
+------------+



In [54]:
# Reach the Spark aggregate through the module alias so that Python's built-in
# `sum` is not shadowed for the rest of the kernel session.
from pyspark.sql import functions as F
movies.select(F.sum(col("hitFlop"))).show()

+------------+
|sum(hitFlop)|
+------------+
|        2753|
+------------+



In [55]:
from pyspark.sql.functions import avg
# Average of `hitFlop` across all rows.
movies.select(avg("hitFlop")).show()

+------------------+
|      avg(hitFlop)|
+------------------+
|2.1440809968847354|
+------------------+



In [56]:
# Average `hitFlop` per release year, ordered by year.
(
    movies
    .groupBy("releaseYear")
    .avg("hitFlop")
    .sort("releaseYear")
    .show()
)

+-----------+------------------+
|releaseYear|      avg(hitFlop)|
+-----------+------------------+
|       2001| 2.306451612903226|
|       2002|1.9620253164556962|
|       2003|2.0105263157894737|
|       2004|1.9545454545454546|
|       2005| 2.009433962264151|
|       2006|2.9833333333333334|
|       2007| 2.621212121212121|
|       2008|  2.13265306122449|
|       2009| 1.835164835164835|
|       2010|1.8620689655172413|
|       2011|2.0535714285714284|
|       2012| 2.393939393939394|
|       2013| 2.343137254901961|
|       2014| 2.081818181818182|
+-----------+------------------+

