## Introduction to Spark

In [4]:
// in Scala
// Build a one-column DataFrame from spark.range(1000); toDF names that column "number".
val myRange = spark.range(1000).toDF("number")

myRange: org.apache.spark.sql.DataFrame = [number: bigint]


In [5]:
// Print the DataFrame as a text table; with no arguments only the first 20 rows are shown.
myRange.show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



In [6]:
// in Scala
// Keep only the rows whose "number" is even; the predicate is given as a SQL expression string.
val divisBy2 = myRange.filter("number % 2 = 0")

divisBy2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [number: bigint]


In [3]:
// Count the rows that passed the even-number predicate; the result is a Long.
divisBy2.count()

res0: Long = 500


In [7]:
// in Scala
// Load the 2015 flight summary CSV. The first line of the file supplies the
// column names, and column types are inferred from the data.
val flightData2015 = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("../data/flight-data/csv/2015-summary.csv")

flightData2015: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [8]:
// Bring the first three rows back to the driver as an Array[Row].
flightData2015.take(3)


res2: Array[org.apache.spark.sql.Row] = Array([United States,Romania,15], [United States,Croatia,1], [United States,Ireland,344])


In [9]:
// Print the physical plan for sorting the flight data by its "count" column.
flightData2015.sort("count").explain()

== Physical Plan ==
*(1) Sort [count#36 ASC NULLS FIRST], true, 0
+- *(1) Project [DEST_COUNTRY_NAME#34, ORIGIN_COUNTRY_NAME#35, count#36]
   +- BatchScan[DEST_COUNTRY_NAME#34, ORIGIN_COUNTRY_NAME#35, count#36] CSVScan Location: InMemoryFileIndex[file:/home/u1/Python-Scala-Spark-Training/Exercises/Spark/data/flight-data/csv/..., ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [10]:
// Set the session option spark.sql.shuffle.partitions to 5 before running the sort in the next cell.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [11]:
// Sort by "count" and bring the first two rows of the result back to the driver.
flightData2015.sort("count").take(2)

res5: Array[org.apache.spark.sql.Row] = Array([United States,Singapore,1], [Moldova,United States,1])


In [13]:
// Register the DataFrame as the temporary view "flight_data_2015" so spark.sql queries can refer to it by name.
flightData2015.createOrReplaceTempView("flight_data_2015")

In [14]:
// in Scala
// SQL form of the aggregation: number of rows per destination country,
// read from the "flight_data_2015" temporary view.
val sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")


sqlWay: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, count(1): bigint]


In [15]:
// DataFrame-API form of the per-destination row count computed by sqlWay.
// The grouping column is named with a plain string rather than a Symbol
// literal ('DEST_COUNTRY_NAME): Symbol literals are deprecated in Scala 2.13
// and only convert to a Column when spark.implicits._ is in scope.
val dataFrameWay = flightData2015
  .groupBy("DEST_COUNTRY_NAME")
  .count()

// explain() prints a physical plan (a side effect), so it is called with
// parentheses. Printing both plans lets the two formulations be compared.
sqlWay.explain()
dataFrameWay.explain()

== Physical Plan ==
*(1) HashAggregate(keys=[DEST_COUNTRY_NAME#34], functions=[count(1)])
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#34], functions=[partial_count(1)])
   +- *(1) Project [DEST_COUNTRY_NAME#34]
      +- BatchScan[DEST_COUNTRY_NAME#34] CSVScan Location: InMemoryFileIndex[file:/home/u1/Python-Scala-Spark-Training/Exercises/Spark/data/flight-data/csv/..., ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
*(1) HashAggregate(keys=[DEST_COUNTRY_NAME#34], functions=[count(1)])
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#34], functions=[partial_count(1)])
   +- *(1) Project [DEST_COUNTRY_NAME#34]
      +- BatchScan[DEST_COUNTRY_NAME#34] CSVScan Location: InMemoryFileIndex[file:/home/u1/Python-Scala-Spark-Training/Exercises/Spark/data/flight-data/csv/..., ReadSchema: struct<DEST_COUNTRY_NAME:string>




dataFrameWay: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, count: bigint]


In [16]:
// SQL form: largest value of the "count" column in the flight_data_2015 view, returned as a one-element Array[Row].
spark.sql("SELECT max(count) from flight_data_2015").take(1)

res8: Array[org.apache.spark.sql.Row] = Array([370002])


In [17]:
// in Scala
import org.apache.spark.sql.functions.max


import org.apache.spark.sql.functions.max


In [18]:
// DataFrame form of the same maximum, using the max function imported from org.apache.spark.sql.functions.
flightData2015.select(max("count")).take(1)

res9: Array[org.apache.spark.sql.Row] = Array([370002])


In [19]:
// in Scala
// SQL form: sum "count" per destination country, order the totals from
// largest to smallest, and keep the first five.
val maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

// Print the five resulting rows as a table.
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



maxSql: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, destination_total: bigint]


In [20]:
// in Scala
import org.apache.spark.sql.functions.desc

// DataFrame form of the top-five query: sum "count" per destination, rename
// the generated "sum(count)" column to "destination_total", sort descending
// on it, keep five rows, and print them.
flightData2015
  .groupBy("DEST_COUNTRY_NAME")
  .sum("count")
  .withColumnRenamed("sum(count)", "destination_total")
  .sort(desc("destination_total"))
  .limit(5)
  .show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



import org.apache.spark.sql.functions.desc


In [21]:
// in Scala
// Same chain as the previous cell, but ending in explain() so that the
// physical plan is printed instead of the rows.
flightData2015
  .groupBy("DEST_COUNTRY_NAME")
  .sum("count")
  .withColumnRenamed("sum(count)", "destination_total")
  .sort(desc("destination_total"))
  .limit(5)
  .explain()

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#164L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#34,destination_total#164L])
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#34], functions=[sum(cast(count#36 as bigint))])
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#34], functions=[partial_sum(cast(count#36 as bigint))])
      +- *(1) Project [DEST_COUNTRY_NAME#34, count#36]
         +- BatchScan[DEST_COUNTRY_NAME#34, count#36] CSVScan Location: InMemoryFileIndex[file:/home/u1/Python-Scala-Spark-Training/Exercises/Spark/data/flight-data/csv/..., ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




In [41]:
// in Scala
import spark.implicits._

// Typed record for one row of the flight summary data; the field names are
// the column names used by the flight DataFrames in this notebook.
case class Flight(
  DEST_COUNTRY_NAME: String,
  ORIGIN_COUNTRY_NAME: String,
  count: BigInt
)

// Untyped DataFrame read from the 2010 Parquet summary.
val flightsDF = spark.read.parquet("../data/flight-data/parquet/2010-summary.parquet/")

// Typed view over the same data: a Dataset[Flight].
val flights = flightsDF.as[Flight]

import spark.implicits._
defined class Flight
flightsDF: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]
flights: org.apache.spark.sql.Dataset[Flight] = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [39]:
// Evaluate the untyped value so the REPL echoes its type (DataFrame).
flightsDF

res22: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [40]:
// Evaluate the typed value so the REPL echoes its type (Dataset[Flight]).
flights

res23: org.apache.spark.sql.Dataset[Flight] = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


In [26]:
// Pull five rows to the driver, drop any whose origin is "Canada", then add 5
// to each remaining count. After take(5) the value is a local Array[Flight],
// so filter and map here are ordinary Scala collection operations.
flights
  .take(5)
  .filter(_.ORIGIN_COUNTRY_NAME != "Canada")
  .map(flight => flight.copy(count = flight.count + 5))

res15: Array[Flight] = Array(Flight(United States,Romania,6), Flight(United States,Ireland,269), Flight(United States,India,74), Flight(Egypt,United States,29), Flight(Equatorial Guinea,United States,6))


In [27]:
// in Scala
// Read every per-day retail CSV in one go: column names come from the header
// row and column types are inferred.
val staticDataFrame = spark.read
  .format("csv")
  .option("inferSchema", "true")
  .option("header", "true")
  .load("../data/retail-data/by-day/*.csv")

// Keep the inferred schema for later reuse, then expose the data to SQL as "retail_data".
val staticSchema = staticDataFrame.schema
staticDataFrame.createOrReplaceTempView("retail_data")

staticDataFrame: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]
staticSchema: org.apache.spark.sql.types.StructType = StructType(StructField(InvoiceNo,StringType,true), StructField(StockCode,StringType,true), StructField(Description,StringType,true), StructField(Quantity,IntegerType,true), StructField(InvoiceDate,StringType,true), StructField(UnitPrice,DoubleType,true), StructField(CustomerID,DoubleType,true), StructField(Country,StringType,true))


In [28]:
// in Scala
import org.apache.spark.sql.functions.{window, column, desc, col}
// Compute total_cost = UnitPrice * Quantity per row, then sum it per customer
// and per "1 day" window over InvoiceDate, and print the first five groups.
staticDataFrame
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")
  .groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")
  .show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   14075.0|[2011-12-05 00:00...|316.78000000000003|
|   18180.0|[2011-12-05 00:00...|            310.73|
|   15358.0|[2011-12-05 00:00...| 830.0600000000003|
|   15392.0|[2011-12-05 00:00...|304.40999999999997|
|   15290.0|[2011-12-05 00:00...|263.02000000000004|
+----------+--------------------+------------------+
only showing top 5 rows



import org.apache.spark.sql.functions.{window, column, desc, col}


In [29]:
// Set the session option spark.sql.shuffle.partitions to 5 (the same setting was applied in an earlier cell).
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [30]:
// Streaming read over the same per-day CSV files as staticDataFrame. The
// schema captured from the static read is supplied explicitly, and
// maxFilesPerTrigger is set to 1.
val streamingDataFrame = spark.readStream
  .format("csv")
  .schema(staticSchema)
  .option("header", "true")
  .option("maxFilesPerTrigger", 1)
  .load("../data/retail-data/by-day/*.csv")



streamingDataFrame: org.apache.spark.sql.DataFrame = [InvoiceNo: string, StockCode: string ... 6 more fields]


In [31]:
// Check that this DataFrame comes from a streaming source rather than a static one.
streamingDataFrame.isStreaming // returns true

res18: Boolean = true


In [32]:
// in Scala
// Streaming counterpart of the static aggregation: total_cost summed per
// customer and per window over InvoiceDate. Relies on `window` and `col`
// imported from org.apache.spark.sql.functions in an earlier cell.
// NOTE(review): the name says "PerHour" but the window duration is "1 day";
// the name is kept because the writeStream cell refers to it.
val purchaseByCustomerPerHour = streamingDataFrame
  .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
  .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")


purchaseByCustomerPerHour: org.apache.spark.sql.DataFrame = [CustomerId: double, window: struct<start: timestamp, end: timestamp> ... 1 more field]


In [33]:
// in Scala
// Start the streaming aggregation and publish its results under the name
// "customer_purchases". start() returns a StreamingQuery handle; it is not
// bound to a name here, so the REPL keeps it only as a resN value.
purchaseByCustomerPerHour.writeStream
    .format("memory") // memory = store in-memory table
    .queryName("customer_purchases") // the name of the in-memory table
    .outputMode("complete") // complete = all the counts should be in the table
    .start()


res19: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@6961756c


In [34]:
// in Scala
// Query the in-memory "customer_purchases" table written by the streaming
// query, largest sum(total_cost) first, and print the first five rows.
// The column name contains parentheses, hence the backticks in the SQL.
spark.sql("""
  SELECT *
  FROM customer_purchases
  ORDER BY `sum(total_cost)` DESC
  """)
  .show(5)

+----------+--------------------+------------------+
|CustomerId|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|[2011-12-08 00:00...|31975.590000000007|
|      null|[2011-11-22 00:00...|13216.889999999894|
|   18102.0|[2011-12-08 00:00...|           11016.1|
|   16210.0|[2011-12-08 00:00...|            3599.4|
|   16532.0|[2011-11-22 00:00...|            2232.0|
+----------+--------------------+------------------+
only showing top 5 rows



In [35]:
// Print the inferred schema of the retail data as a tree (column name, type, nullability).
staticDataFrame.printSchema()


root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [81]:
// Print the schema of the retail data again; this repeats the printSchema() call from an earlier cell.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)

