The Spark Session

In [None]:
from pyspark.sql import SparkSession

# Obtain the session through the public builder API. The previous
# `from pyspark.shell import spark` imported the interactive-shell bootstrap
# script, which is not meant to be used as a library entry point.
# getOrCreate() reuses an already-running session when one exists.
spark = SparkSession.builder.getOrCreate()

# Simple task: build a one-column DataFrame called "number" from spark.range(1000)
myRange = spark.range(1000).toDF("number")

Transformations

In [None]:
# Transformation: keep only the rows whose "number" is divisible by 2
divisBy2 = myRange.filter("number % 2 = 0")

# Display the filtered rows
divisBy2.show()

Actions

In [None]:
# Action: count the records in the filtered DataFrame (divisBy2)
divisBy2.count()

An End-to-End Example

Using Spark to analyze flight data from the United States Bureau of Transportation Statistics

In [None]:
# Load the 2015 flight summary CSV: the first line is read as the header
# and Spark is asked to infer the column types.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("/datasets/flight-data/csv/2015-summary.csv")
)

# Peek at the first three rows
flightData2015.take(3)

In [None]:
# Print the execution plan Spark builds for ordering the data by "count"
flightData2015.orderBy("count").explain()

In [None]:
# Set the number of shuffle partitions to 5 for this session
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Order by "count" and fetch the first two rows
flightData2015.orderBy("count").take(2)

DataFrames and Spark SQL

In [None]:
# Register the DataFrame as a temporary view so it can be queried with SQL
flightData2015.createOrReplaceTempView("flight_data_2015")

# Row count per destination country, expressed as a SQL query on the view
sqlWay = spark.sql("""
    SELECT dest_country_name, count(1) AS qtde
    FROM flight_data_2015
    GROUP BY dest_country_name
""")

# The same grouping expressed with the DataFrame API
DataFrameWay = (
    flightData2015
    .groupBy("dest_country_name")
    .count()
)

# Print both plans so they can be compared side by side
sqlWay.explain()
DataFrameWay.explain()

In [None]:
# Selecting data - Option 1: maximum of "count" via a SQL query on the temp view
spark.sql(
    "SELECT max(count) FROM flight_data_2015"
).take(1)

In [None]:
# Import the functions module under a namespace. A bare
# `from pyspark.sql.functions import max` would rebind Python's builtin max()
# in the notebook's global namespace for every cell that runs afterwards.
from pyspark.sql import functions as F

# Selecting data - Option 2: maximum of "count" via the DataFrame API
flightData2015.select(F.max("count")).take(1)

In [None]:
# Aggregation - Option 1 (SQL): sum "count" per destination country,
# order the totals from largest to smallest and keep the first five.
maxSql = spark.sql("""
    SELECT dest_country_name, SUM(count) AS destination_total
    FROM flight_data_2015
    GROUP BY dest_country_name
    ORDER BY destination_total DESC
    LIMIT 5
""")

# Display the resulting rows
maxSql.show()

In [None]:
from pyspark.sql.functions import desc

# Aggregation - Option 2 (DataFrame API): sum "count" per destination country,
# rename the generated "sum(count)" column, order descending and show the top five.
(
    flightData2015
    .groupBy("dest_country_name")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

In [None]:
# Explain plan for the top-five aggregation: same chain of transformations,
# ending in explain() instead of show().
(
    flightData2015
    .groupBy("dest_country_name")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

End