# Introduction to Spark

## The Spark Session

In [0]:
// Simple task: build a one-column DataFrame ("number") from spark.range(1000)
val myRange = spark
    .range(1000)
    .toDF("number")

## Transformations

In [0]:
// Transformation: keep only the rows of myRange whose "number" is divisible by 2,
// then print the filtered rows.
val divisBy2 = myRange.filter("number % 2 = 0")
divisBy2.show()

## Actions

In [0]:
// Action: count the total number of records in the divisBy2 DataFrame
divisBy2.count()

## An End-to-End Project Example

In [0]:
// Load the 2015 flight summary CSV, treating the first line as a header and
// letting Spark infer the column types.
val flightData2015 = {
    val csvReader = spark.read
        .option("header", "true")
        .option("inferSchema", "true")
    csvReader.csv("/datasets/flight-data/csv/2015-summary.csv")
}

// Fetch the first three rows.
flightData2015.take(3)

In [0]:
// Print the execution plan for sorting the flight data by "count"
flightData2015
    .sort("count")
    .explain()

In [0]:
// Set spark.sql.shuffle.partitions to 5 for this session
spark.conf.set("spark.sql.shuffle.partitions", "5")

// Sort by "count" and fetch the first two rows
flightData2015
    .sort("count")
    .take(2)

### DataFrames and SQL

In [0]:
// Register the DataFrame as a temporary view so it can be queried with SQL
flightData2015.createOrReplaceTempView("flight_data_2015")

// Row count per destination country, expressed as a SQL query...
val sqlWay = spark.sql("""
    SELECT dest_country_name, count(1) AS qtde
    FROM flight_data_2015
    GROUP BY dest_country_name
""")

// ...and the same grouping expressed through the DataFrame API
val DataFrameWay = flightData2015
    .groupBy("dest_country_name")
    .count()

// Print both execution plans so they can be compared
sqlWay.explain()
DataFrameWay.explain()

In [0]:
// Selecting data - Option 1: query max(count) through SQL and take the first result row
spark
    .sql("SELECT max(count) FROM flight_data_2015")
    .take(1)

In [0]:
import org.apache.spark.sql.functions.max

// Selecting data - Option 2: select max("count") through the DataFrame API
// and take the first result row
flightData2015
    .select(max("count"))
    .take(1)

In [0]:
// Aggregation - Option 1 (SQL)
// Sum "count" per dest_country_name in the flight_data_2015 view, order the
// totals in descending order, and keep the first five rows.
val maxSql = spark.sql("""
    SELECT dest_country_name, SUM(count) AS destination_total
    FROM flight_data_2015
    GROUP BY dest_country_name
    ORDER BY destination_total DESC
    LIMIT 5
""")

// Print the resulting rows
maxSql.show()

In [0]:
import org.apache.spark.sql.functions.desc

// Aggregation - Option 2 (DataFrame API)
// Sum "count" per destination country, sort the totals in descending order,
// and show the first five rows.
//
// The aggregate is aliased directly in agg() instead of renaming the
// auto-generated "sum(count)" column afterwards: withColumnRenamed is a silent
// no-op when the given column name does not exist, which would only surface
// later as an unresolved "destination_total" column.
import org.apache.spark.sql.functions.sum

flightData2015
    .groupBy("dest_country_name")
    .agg(sum("count").as("destination_total"))
    .sort(desc("destination_total"))
    .limit(5)
    .show()

In [0]:
// Explain Plan
// Print the execution plan of the "top five destinations by total count"
// aggregation written with the DataFrame API.
//
// The imports are repeated here so this cell does not depend on an earlier
// cell having been run. The aggregate is aliased directly in agg() rather than
// renamed with withColumnRenamed("sum(count)", ...), which is a silent no-op
// when the auto-generated column name does not match.
import org.apache.spark.sql.functions.{desc, sum}

flightData2015
    .groupBy("dest_country_name")
    .agg(sum("count").as("destination_total"))
    .sort(desc("destination_total"))
    .limit(5)
    .explain()

## End