# Introduction

In [None]:
# execute the shared paths script; later cells use `raw_data_path`, which is
# presumably defined there -- TODO confirm in ./Includes/paths.py
%run ./Includes/paths.py

In [None]:
import pyspark
# explicit import instead of `from delta import *`, so it is clear where the
# helper comes from and nothing else is pulled into the notebook namespace
from delta import configure_spark_with_delta_pip

# start spark: the session is named "Spark-Course" and configured with the
# Delta Lake SQL extension and the Delta catalog as the session catalog
builder = (pyspark.sql.SparkSession.builder.appName("Spark-Course")
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

# the builder is passed through the Delta pip helper before the session is
# created (or an already running session is reused)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# setting log-level to ERROR to decrease verbosity
# log4j log-levels are: OFF, FATAL, ERROR, WARN, INFO, DEBUG, TRACE, ALL
spark.sparkContext.setLogLevel("ERROR")

# last expression of the cell: displays the session summary in the notebook
spark

In [None]:
%%bash -s "$raw_data_path"
# list the raw data directory; "$1" is quoted so that a path containing
# spaces or glob characters reaches ls as a single, unmodified argument
ls "$1"

In [None]:
# load the title_basics.tsv file: tab-delimited, first line is the header,
# column types are inferred from the data
tsv_reader = spark.read.options(inferSchema='true', header='true', delimiter='\t')
title_basics_sdf = tsv_reader.csv(raw_data_path + 'title_basics.tsv')

# preview the first 5 rows
title_basics_sdf.show(5)

In [None]:
# print the schema (column names and types) that was inferred while reading the file
title_basics_sdf.printSchema()

In [None]:
# a spark dataframe *is* an execution plan:
# explain() prints that plan instead of any rows
title_basics_sdf.explain()

In [None]:
# show the first 5 rows after sorting by startYear
title_basics_sdf.sort('startYear').show(5)

In [None]:
# FileScan is a narrow transformation;
# sort is a wide transformation (see 'Exchange rangepartitioning' in the plan
# printed below), as all rows need to be compared with each other
title_basics_sdf.sort('startYear').explain()

In [None]:
# number of rows per startYear (result column is named 'count')
titles_by_year = title_basics_sdf.groupBy('startYear')
value_counts_sdf = titles_by_year.count()

value_counts_sdf.show(5)

In [None]:
# import only `col` (needed for col('count').desc()); a wildcard import of
# pyspark.sql.functions would shadow Python builtins such as sum, min, max
# and round in the notebook namespace
from pyspark.sql.functions import col

# order the per-year counts from most to least frequent
value_counts_sorted_sdf = (value_counts_sdf
                                .orderBy(col('count').desc()))

value_counts_sorted_sdf.show(5)

In [None]:
# plan of the unsorted group-by/count, for comparison with the sorted variant
value_counts_sdf.explain()

In [None]:
# the physical plan for `value_counts_sorted_sdf` is the physical plan for
# `value_counts_sdf` plus two more steps (an exchange and a sort) that come
# from the orderBy
value_counts_sorted_sdf.explain()

In [None]:
# expose the dataframe to SQL under the view name 'titles'
title_basics_sdf.createOrReplaceTempView('titles')

# same aggregation as the dataframe version, written as SQL;
# `order by 2 desc` sorts on the second selected column, i.e. the count
counts_per_year_query = 'select startYear, count(*) from titles group by startYear order by 2 desc;'
value_counts_sql = spark.sql(counts_per_year_query)

value_counts_sql.show(5)

In [None]:
# the SQL query compiles to the same physical plan as the dataframe version
# (`value_counts_sorted_sdf`)
value_counts_sql.explain()

In [None]:
# stop the spark application and cluster (uncomment the line below to run it)
# spark.stop()