# Introduction

In [1]:
# Execute the shared path definitions so their names are available in this notebook.
# NOTE(review): presumably this is where `raw_data_path` (used by later cells) is
# defined — confirm in Includes/paths.py.
%run ./Includes/paths.py

In [2]:
import pyspark
# Import only the helper that is actually used; a wildcard import hides where
# names come from and can silently overwrite existing ones.
from delta import configure_spark_with_delta_pip

# Describe the Spark session: the application name plus the two settings that
# register Delta Lake's SQL extension and catalog implementation.
builder = (pyspark.sql.SparkSession.builder.appName("Spark-Course")
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

# Let the Delta helper finish configuring the builder, then reuse a running
# session or start a new one.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# setting log-level to ERROR to decrease verbosity
# log4j log-levels are: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

# last expression of the cell: display the session summary
spark

In [3]:
%%bash -s "$raw_data_path"
# List the raw data directory passed in as the first script argument.
# Quoting keeps a path containing spaces or glob characters as one argument;
# `--` stops a path that starts with '-' from being parsed as an option.
ls -- "$1"

name_basics.tsv
readme.txt
title_akas.tsv
title_basics.tsv
title_crew.tsv
title_episode.tsv
title_principals.tsv
title_ratings.tsv


In [4]:
# Load title_basics.tsv: tab-separated, first line is the header, and Spark is
# asked to infer the column types from the data.
title_basics_sdf = (
    spark.read
    .options(inferSchema='true', header='true', delimiter='\t')
    .csv(raw_data_path + 'title_basics.tsv')
)

# preview the first five rows
title_basics_sdf.show(5)

                                                                                

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [5]:
# Print the schema Spark inferred while reading the file.
# NOTE(review): every column comes back as string even though inferSchema is
# enabled — presumably because of the `\N` placeholders seen in the data; confirm.
title_basics_sdf.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [6]:
# A Spark DataFrame *is* an execution plan: explain() prints the physical plan
# behind it (for this DataFrame, a single CSV FileScan).
title_basics_sdf.explain()

== Physical Plan ==
FileScan csv [tconst#16,titleType#17,primaryTitle#18,originalTitle#19,isAdult#20,startYear#21,endYear#22,runtimeMinutes#23,genres#24] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/siladitya/Documents/Spark/Spark-Course/Data/imdb/title_basi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,sta...




In [7]:
# Show the first five rows when ordered by startYear (ascending).
# startYear is a string column here, so the ordering is by string comparison.
title_basics_sdf.sort('startYear').show(5)



+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
| tt3155794|    short|    Passage de Venus|    Passage de Venus|      0|     1874|     \N|             1|Documentary,Short|
|tt16763740|    short|       Skipping Rope|     Le jeu de corde|      0|     1877|     \N|             1|  Animation,Short|
|tt16763674|    short|The Tight-rope Dance|La danse sur la c...|      0|     1877|     \N|             1|  Animation,Short|
|tt14495706|    short|   La Rosace Magique|   La Rosace Magique|      0|     1877|     \N|             1|  Animation,Short|
|tt16763774|    short|  Dzing. Boom. Boom!|   Zimm. Boum. Boum!|      0|     1877|     \N|             1|  Animation,Short|
+-------

                                                                                

In [8]:
# FileScan is a narrow transformation; sort is a wide transformation because
# rows from all partitions have to be compared with each other. The shuffle
# this requires appears as 'Exchange rangepartitioning' in the plan below.
title_basics_sdf.sort('startYear').explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [startYear#21 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(startYear#21 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#42]
      +- FileScan csv [tconst#16,titleType#17,primaryTitle#18,originalTitle#19,isAdult#20,startYear#21,endYear#22,runtimeMinutes#23,genres#24] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/siladitya/Documents/Spark/Spark-Course/Data/imdb/title_basi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,sta...




In [9]:
# count the titles per startYear (one output row per distinct value)
value_counts_sdf = title_basics_sdf.groupBy('startYear').count()

# peek at five of the groups
value_counts_sdf.show(5)



+---------+-----+
|startYear|count|
+---------+-----+
|     1903| 2675|
|     1953| 9826|
|     1897| 1356|
|     1957|14635|
|     1987|40368|
+---------+-----+
only showing top 5 rows



                                                                                

In [10]:
# Import only `col`: a wildcard import of pyspark.sql.functions would also
# rebind Python built-ins such as sum, min, max and round to Spark functions.
from pyspark.sql.functions import col  # lets us use col('count').desc()

# order the per-year counts from largest to smallest
value_counts_sorted_sdf = (value_counts_sdf
                                .orderBy(col('count').desc()))

value_counts_sorted_sdf.show(5)



+---------+-------+
|startYear|  count|
+---------+-------+
|       \N|1183450|
|     2018| 413159|
|     2017| 411107|
|     2019| 398502|
|     2021| 392377|
+---------+-------+
only showing top 5 rows



                                                                                

In [11]:
# Physical plan of the aggregation: a HashAggregate with partial_count, an
# Exchange hashpartitioning on startYear, then the final HashAggregate count.
value_counts_sdf.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[startYear#21], functions=[count(1)])
   +- Exchange hashpartitioning(startYear#21, 200), ENSURE_REQUIREMENTS, [id=#145]
      +- HashAggregate(keys=[startYear#21], functions=[partial_count(1)])
         +- FileScan csv [startYear#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/siladitya/Documents/Spark/Spark-Course/Data/imdb/title_basi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<startYear:string>




In [12]:
# the physical plan for `value_counts_sorted_sdf` is the physical plan for
# `value_counts_sdf` plus two more steps on top: an Exchange (rangepartitioning
# on the count column) followed by a Sort
value_counts_sorted_sdf.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#136L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#136L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#165]
      +- HashAggregate(keys=[startYear#21], functions=[count(1)])
         +- Exchange hashpartitioning(startYear#21, 200), ENSURE_REQUIREMENTS, [id=#162]
            +- HashAggregate(keys=[startYear#21], functions=[partial_count(1)])
               +- FileScan csv [startYear#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/siladitya/Documents/Spark/Spark-Course/Data/imdb/title_basi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<startYear:string>




In [13]:
# expose the DataFrame to Spark SQL under the view name 'titles'
title_basics_sdf.createOrReplaceTempView('titles')

# per-year counts written as SQL; `order by 2 desc` sorts on the second
# selected column (the count), largest first
value_counts_sql = spark.sql(
    'select startYear, count(*) from titles group by startYear order by 2 desc;'
)

value_counts_sql.show(5)



+---------+--------+
|startYear|count(1)|
+---------+--------+
|       \N| 1183450|
|     2018|  413159|
|     2017|  411107|
|     2019|  398502|
|     2021|  392377|
+---------+--------+
only showing top 5 rows



                                                                                

In [14]:
# The SQL query compiles to the same physical plan as value_counts_sorted_sdf
# (only the expression ids and the name of the count column differ).
value_counts_sql.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count(1)#162L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count(1)#162L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#233]
      +- HashAggregate(keys=[startYear#21], functions=[count(1)])
         +- Exchange hashpartitioning(startYear#21, 200), ENSURE_REQUIREMENTS, [id=#230]
            +- HashAggregate(keys=[startYear#21], functions=[partial_count(1)])
               +- FileScan csv [startYear#21] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/siladitya/Documents/Spark/Spark-Course/Data/imdb/title_basi..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<startYear:string>




In [15]:
# Stop the Spark application and cluster when finished.
# The call is left commented out in this notebook; uncomment it to actually stop Spark.
# spark.stop()