In [1]:
dbutils.fs.ls("/databricks-datasets/definitive-guide/data/flight-data/parquet")

In [2]:
%scala

// Scala
val spark2 = spark // 책과는 다르게 
import spark2.implicits._

case class Flight(DEST_COUNTRY_NAME: String,
                  ORIGIN_COUNTRY_NAME: String,
                  count: BigInt)
val flightsDF = spark2.read.parquet("/databricks-datasets/definitive-guide/data/flight-data/parquet/2010-summary.parquet")
val flights = flightsDF.as[Flight]

In [3]:
%scala

display(flights.filter(flight_row => flight_row.ORIGIN_COUNTRY_NAME != "United States").map(flight_row => flight_row).limit(5))

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,1
United States,Ireland,264
United States,India,69
United States,Singapore,25
United States,Grenada,54


In [4]:
%scala

flights.take(5).filter(flight_row => flight_row.ORIGIN_COUNTRY_NAME != "United States").map(fr => Flight(fr.DEST_COUNTRY_NAME, fr.ORIGIN_COUNTRY_NAME, fr.count + 5))

## 3.3 구조적 스트리밍

In [6]:
%scala

val staticDataFrame = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/databricks-datasets/definitive-guide/data/retail-data/by-day/*.csv")

staticDataFrame.createOrReplaceTempView("retail_data")

In [7]:
%sql
show tables

database,tableName,isTemporary
default,flights2,False
default,flights_from_select,False
default,hive_flight2,False
default,hive_flights,False
,retail_data,True


In [8]:
%scala

val staticSchema = staticDataFrame.schema

import org.apache.spark.sql.functions.{window, col}

display(staticDataFrame.selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")
  .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")
  .limit(10))

CustomerId,window,sum(total_cost)
16057.0,"List(2011-12-05T00:00:00.000+0000, 2011-12-06T00:00:00.000+0000)",-37.6
14126.0,"List(2011-11-29T00:00:00.000+0000, 2011-11-30T00:00:00.000+0000)",643.6300000000001
13500.0,"List(2011-11-16T00:00:00.000+0000, 2011-11-17T00:00:00.000+0000)",497.9700000000001
17160.0,"List(2011-11-08T00:00:00.000+0000, 2011-11-09T00:00:00.000+0000)",516.8499999999999
15608.0,"List(2011-11-11T00:00:00.000+0000, 2011-11-12T00:00:00.000+0000)",122.4
15253.0,"List(2011-11-23T00:00:00.000+0000, 2011-11-24T00:00:00.000+0000)",277.6
15124.0,"List(2011-11-17T00:00:00.000+0000, 2011-11-18T00:00:00.000+0000)",93.44
12539.0,"List(2011-11-17T00:00:00.000+0000, 2011-11-18T00:00:00.000+0000)",1050.66
13658.0,"List(2011-11-30T00:00:00.000+0000, 2011-12-01T00:00:00.000+0000)",542.4000000000001
17396.0,"List(2011-10-31T00:00:00.000+0000, 2011-11-01T00:00:00.000+0000)",495.0


In [9]:
spark.conf.set("spark.sql.shuffle.partition", "5")

In [10]:
%scala

val streamingDataFrame = spark.readStream.schema(staticSchema)
  .option("maxFilesPerTrigger", 20)
  .format("csv")
  .option("head", "true")
  .load("/databricks-datasets/definitive-guide/data/retail-data/by-day/*.csv")

In [11]:
%scala

streamingDataFrame.isStreaming

In [12]:
%scala

val purchaseByCustomerPerHour = streamingDataFrame
  .selectExpr(
    "CustomerId",
    "(UnitPrice * Quantity) as total_cost",
    "InvoiceDate")
  .groupBy(
    col("CustomerId"), window(col("InvoiceDate"), "1 day"))
  .sum("total_cost")

In [13]:
%scala

purchaseByCustomerPerHour.writeStream
  .format("memory")
  .queryName("customer_purchases")
  .outputMode("complete")
  .start()

In [14]:
%scala

display(spark2.sql(
"""
SELECT *
FROM customer_purchases
ORDER BY 'sum(total_cost)' DESC
"""))

CustomerId,window,sum(total_cost)
14560.0,"List(2011-02-04T00:00:00.000+0000, 2011-02-05T00:00:00.000+0000)",46.7
17790.0,"List(2010-12-13T00:00:00.000+0000, 2010-12-14T00:00:00.000+0000)",154.8
12647.0,"List(2010-12-05T00:00:00.000+0000, 2010-12-06T00:00:00.000+0000)",372.0
17537.0,"List(2011-02-09T00:00:00.000+0000, 2011-02-10T00:00:00.000+0000)",630.7000000000004
17460.0,"List(2010-12-01T00:00:00.000+0000, 2010-12-02T00:00:00.000+0000)",19.9
13408.0,"List(2010-12-01T00:00:00.000+0000, 2010-12-02T00:00:00.000+0000)",1024.6800000000003
16255.0,"List(2011-01-04T00:00:00.000+0000, 2011-01-05T00:00:00.000+0000)",299.6
13269.0,"List(2010-12-05T00:00:00.000+0000, 2010-12-06T00:00:00.000+0000)",351.43
14548.0,"List(2011-01-26T00:00:00.000+0000, 2011-01-27T00:00:00.000+0000)",326.4
16950.0,"List(2010-12-07T00:00:00.000+0000, 2010-12-08T00:00:00.000+0000)",172.0


In [15]:
%scala

purchaseByCustomerPerHour.writeStream
  .format("console")
  .queryName("customer_purchases_2")
  .outputMode("complete")
  .start()