In [ ]:
import java.sql.Timestamp
/** Typed view of one record read from the "rate" streaming source.
  *
  * @param timestamp timestamp attached to the record
  * @param value     numeric payload of the record
  */
case class Rate(
  timestamp: Timestamp,
  value: Long
)

import java.sql.Timestamp
defined class Rate


In [ ]:
// Streaming Dataset read from the "rate" source and typed as Rate.
// NOTE(review): `.as[Rate]` needs an implicit Encoder[Rate] in scope (usually via `sparkSession.implicits._`);
// no such import is visible before this cell — confirm the notebook environment provides it.
val rate = sparkSession.readStream.format("rate").load().as[Rate]

rate: org.apache.spark.sql.Dataset[Rate] = [timestamp: timestamp, value: bigint]


In [ ]:
// Fixed pool of ten station identifiers.
val uids = List(
  "d1e46a42",
  "d8e16e2a",
  "d1b06f88",
  "d2e710aa",
  "d2f731cc",
  "d4c162ee",
  "d4a11632",
  "d7e277b2",
  "d59018de",
  "d60779f6"
)

uids: List[String] = List(d1e46a42, d8e16e2a, d1b06f88, d2e710aa, d2f731cc, d4c162ee, d4a11632, d7e277b2, d59018de, d60779f6)


In [ ]:
/** Generates a random (longitude, latitude) pair inside a rough bounding box of Europe.
  *
  * Each call draws both coordinates independently and uniformly from their
  * respective [min, max) interval.
  */
val locationGenerator: () => (Double, Double) = {
  // Europe bounds, expressed as (min, max)
  val longBounds = (-10.89, 39.82)
  val latBounds = (35.52, 56.7)
  // Uniform sample in [min, max). The random factor must be scaled by the interval
  // width (max - min), not by max itself; scaling by max lets latitudes overshoot
  // the upper bound (up to ~92) and keeps longitudes from ever reaching theirs.
  def pointInRange(bounds: (Double, Double)): Double = {
    val (min, max) = bounds
    min + scala.util.Random.nextDouble() * (max - min)
  }
  () => (pointInRange(longBounds), pointInRange(latBounds))
}

locationGenerator: () => (Double, Double) = <function0>


In [ ]:
/** Returns a randomly chosen element of `list`.
  *
  * @param list candidates to choose from; must not be empty
  * @tparam T element type
  * @return one element of `list`, selected via `scala.util.Random.nextInt`
  * @throws IllegalArgumentException if `list` is empty
  */
def pickOne[T](list: List[T]): T = {
  // Fail with a descriptive message instead of Random's opaque "bound must be positive".
  require(list.nonEmpty, "pickOne requires a non-empty list")
  list(scala.util.Random.nextInt(list.size))
}

pickOne: [T](list: List[T])T


In [ ]:
// Random pressure value: 101.0 plus a random fraction drawn from nextDouble.
val pressureGen: () => Double = () => {
  val fraction = scala.util.Random.nextDouble()
  fraction + 101.0
}
// Random temperature value: a random fraction stretched over a span of 60, shifted down by 20.
val tempGen: () => Double = () => {
  val fraction = scala.util.Random.nextDouble()
  fraction * 60 - 20
}

pressureGen: () => Double = <function0>
tempGen: () => Double = <function0>


In [ ]:
import java.sql.Timestamp
/** A single weather reading attributed to a station.
  *
  * @param stationId identifier of the reporting station
  * @param timestamp time attached to the reading
  * @param location  pair of coordinates for the reading
  * @param pressure  pressure value of the reading
  * @param temp      temperature value of the reading
  */
case class WeatherEvent(
  stationId: String,
  timestamp: Timestamp,
  location: (Double, Double),
  pressure: Double,
  temp: Double
)

import java.sql.Timestamp
defined class WeatherEvent


In [ ]:
// Turn every Rate record into a WeatherEvent: the record's timestamp is kept, while the
// station id, location, pressure and temperature come from the random generators.
val weatherEvents = rate.map { tick =>
  WeatherEvent(
    stationId = pickOne(uids),
    timestamp = tick.timestamp,
    location = locationGenerator(),
    pressure = pressureGen(),
    temp = tempGen()
  )
}

weatherEvents: org.apache.spark.sql.Dataset[WeatherEvent] = [stationId: string, timestamp: timestamp ... 3 more fields]


In [ ]:
import scala.collection.immutable.Queue
/** Immutable, bounded first-in-first-out buffer.
  *
  * Once `capacity` elements are held, adding another element evicts the oldest one,
  * so the buffer always contains the most recently added `capacity` elements.
  *
  * @param capacity maximum number of elements retained
  * @param data     current contents, oldest element first
  * @tparam T element type
  */
case class FIFOBuffer[T](capacity: Int, data: Queue[T] = Queue.empty) extends Serializable {
  /** Returns a new buffer with `element` appended, dropping the oldest entries beyond `capacity`. */
  def add(element: T): FIFOBuffer[T] =
    // takeRight keeps the newest entries; `take` would keep the oldest ones and
    // silently discard every element added after the buffer first filled up.
    this.copy(data = data.enqueue(element).takeRight(capacity))

  /** Buffer contents as a list, oldest element first. */
  def get: List[T] = data.toList

  /** Number of elements currently held. */
  def size: Int = data.size
}

import scala.collection.immutable.Queue
defined class FIFOBuffer


In [ ]:
import java.sql.Timestamp
/** Averaged pressure and temperature for one station over a span of events.
  *
  * @param stationId   identifier of the station the averages belong to
  * @param startTime   timestamp marking the start of the averaged span
  * @param endTime     timestamp marking the end of the averaged span
  * @param pressureAvg average pressure over the span
  * @param tempAvg     average temperature over the span
  */
case class WeatherEventAverage(
  stationId: String,
  startTime: Timestamp,
  endTime: Timestamp,
  pressureAvg: Double,
  tempAvg: Double
)

import java.sql.Timestamp
defined class WeatherEventAverage


In [ ]:
import org.apache.spark.sql.streaming.GroupState
/** Stateful per-station aggregation for use with `mapGroupsWithState`.
  *
  * Adds the incoming events to the station's buffered state, stores the new state,
  * and returns the average pressure and temperature of the buffered events.
  *
  * @param key    station id the events are grouped by
  * @param values events that arrived for this station
  * @param state  buffer of events persisted for this station
  * @return averages over the buffered events when more than two are buffered;
  *         otherwise a placeholder record with epoch-zero timestamps and 0.0 averages
  */
def mappingFunction(key: String, values: Iterator[WeatherEvent], state: GroupState[FIFOBuffer[WeatherEvent]]): WeatherEventAverage = {
  val ElementCountWindowSize = 10
  // Start from the stored buffer, or from an empty one on the first call for this key.
  val previous = state.getOption.getOrElse(new FIFOBuffer[WeatherEvent](ElementCountWindowSize))
  // Fold the new events into the buffer and persist the result.
  val buffer = values.foldLeft(previous)((acc, event) => acc.add(event))
  state.update(buffer)

  val events = buffer.get
  val count = events.size
  if (count <= 2) {
    // Not enough data yet: emit a zeroed record.
    WeatherEventAverage(key, new Timestamp(0), new Timestamp(0), 0.0, 0.0)
  } else {
    val pressureAvg = events.map(_.pressure).sum / count
    val tempAvg = events.map(_.temp).sum / count
    WeatherEventAverage(key, events.head.timestamp, events.last.timestamp, pressureAvg, tempAvg)
  }
}

import org.apache.spark.sql.streaming.GroupState
mappingFunction: (key: String, values: Iterator[WeatherEvent], state: org.apache.spark.sql.streaming.GroupState[FIFOBuffer[WeatherEvent]])WeatherEventAverage


In [ ]:
import org.apache.spark.sql.streaming.GroupStateTimeout
// Group the event stream by station id and apply mappingFunction to each group with per-key state.
// NOTE(review): ProcessingTimeTimeout is selected here, but mappingFunction never sets a timeout
// duration nor checks whether the state has timed out — confirm whether GroupStateTimeout.NoTimeout
// was intended.
val weatherEventsMovingAverage = weatherEvents.groupByKey(record => record.stationId)
.mapGroupsWithState(GroupStateTimeout.ProcessingTimeTimeout)(mappingFunction)

import org.apache.spark.sql.streaming.GroupStateTimeout
weatherEventsMovingAverage: org.apache.spark.sql.Dataset[WeatherEventAverage] = [stationId: string, startTime: timestamp ... 3 more fields]


In [ ]:
// Start a streaming query that writes the per-station averages to the "memory" sink
// under the query name "weatherAverage", using the "update" output mode.
// The returned StreamingQuery handle is what later cells call stop on.
val outQuery = weatherEventsMovingAverage.writeStream
  .format("memory")
  .queryName("weatherAverage")
  .outputMode("update")
  .start()

outQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@533df10c


In [ ]:
outQuery.stop()

In [ ]:
val table  = sparkSession.sql("select * from weatherAverage where pressureAvg == 0.0")

table: org.apache.spark.sql.DataFrame = [stationId: string, startTime: timestamp ... 3 more fields]


In [ ]:
table.show(truncate= false)

+---------+-------------------+-------------------+-----------+-------+
|stationId|startTime          |endTime            |pressureAvg|tempAvg|
+---------+-------------------+-------------------+-----------+-------+
|d2e710aa |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d1e46a42 |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d4a11632 |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d4c162ee |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d1e46a42 |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d60779f6 |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d2f731cc |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d1b06f88 |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d8e16e2a |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d8e16e2a |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0.0    |
|d4c162ee |1970-01-01 01:00:00|1970-01-01 01:00:00|0.0        |0

In [ ]:
outQuery.stop

In [ ]:
// Five minutes expressed in milliseconds.
val m5 = 5 * 60 * 1000
// First multiple of five minutes (in epoch millis) strictly after the current time.
val next5MinSlot = (System.currentTimeMillis / m5 + 1) * m5
// The same instant as a SQL timestamp.
val m5Date = new java.sql.Timestamp(next5MinSlot)

m5: Int = 300000
next5MinSlot: Long = 1530482100000
m5Date: java.sql.Timestamp = 2018-07-01 23:55:00.0


In [ ]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._


// Count stationId values per time window on the "timestamp" column. The window duration and
// slide duration are both "15 minute" and the start offset is "0 minute"; the watermark delay
// is "0 minutes".
// NOTE(review): `weatherDF` is not defined in any cell visible here — presumably it came from an
// earlier notebook session (possibly the same data as `weatherEvents`); confirm before re-running.
// NOTE(review): the name says "perMinuteAvg" but this computes a count over 15-minute windows;
// the averaging aggregation is the commented-out line below.
val perMinuteAvg = weatherDF
    .withWatermark("timestamp","0 minutes")
    .groupBy(window($"timestamp","15 minute", "15 minute", "0 minute"))
    //.agg(avg($"pressure") as "pressureAvg", avg($"temp") as "tempAvg")
    .agg(count($"stationId"))

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
perMinuteAvg: org.apache.spark.sql.DataFrame = [window: struct<start: timestamp, end: timestamp>, count(stationId): bigint]


In [ ]:
perMinuteAvg.printSchema

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- count(stationId): long (nullable = false)



In [ ]:
// Start a streaming query writing perMinuteAvg to the "memory" sink in "append" output mode.
// NOTE(review): the query is named "tenMinAvg" although perMinuteAvg groups by 15-minute windows
// and produces a count — the name looks stale.
val query = perMinuteAvg.writeStream.format("memory").outputMode("append").queryName("tenMinAvg").start()

query: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@618e8f71


In [ ]:
val df = sparkSession.sql("select * from tenMinAvg")

df: org.apache.spark.sql.DataFrame = [window: struct<start: timestamp, end: timestamp>, count(stationId): bigint]


In [ ]:
df.show(truncate=false)

+---------------------------------------------+----------------+
|window                                       |count(stationId)|
+---------------------------------------------+----------------+
|[2018-06-24 23:45:00.0,2018-06-25 00:00:00.0]|587             |
+---------------------------------------------+----------------+



In [ ]:
query.stop


In [ ]:
df.show

+--------------------+---------+------------------+-------------------+
|              window|stationId|       pressureAvg|            tempAvg|
+--------------------+---------+------------------+-------------------+
|[1970-01-18 00:00...| d7e771cc|101.01088217991148|-5.8995974443145744|
|[1970-01-18 00:00...| d7e76a42|101.84763550125572| 16.365236842957728|
|[1970-01-18 00:00...| d7e76f88| 101.9718887595217| 34.596594920941314|
|[1970-01-18 00:00...| d7e77672|101.15962690464693| 18.331916524151467|
|[1970-01-18 00:00...| d7e77672|101.33512527465564| 14.472673136283124|
|[1970-01-18 00:00...| d7e770aa|101.40406589851482| 12.631897019107264|
|[1970-01-18 00:00...| d7e778de|101.65262104046391| 30.251680016296675|
|[1970-01-18 00:00...| d7e772ee|101.49646793425526| 10.075008485762066|
|[1970-01-18 00:00...| d7e76a42|101.58496517622028| 1.5693482754141428|
|[1970-01-18 00:00...| d7e779f6|  101.603830921723| 20.764354974202504|
|[1970-01-18 00:00...| d7e76e2a|101.47581528098183| 28.273156326

In [ ]:
df.printSchema

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- pressureAvg: double (nullable = true)
 |-- tempAvg: double (nullable = true)



In [ ]:
val q2 = weatherEvents.writeStream.format("memory").queryName("raw2").outputMode("update").start()

q2: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@7f025feb


In [ ]:
sparkSession.sql("select * from raw2")

res84: org.apache.spark.sql.DataFrame = [stationId: string, ts: date ... 3 more fields]


In [ ]:
rate.writeStream.format("memory").queryName("rate").start

res77: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@7e4eeefb


In [ ]:
sparkSession.sql("select * from rate")

res79: org.apache.spark.sql.DataFrame = [timestamp: timestamp, value: bigint]


In [ ]:
val ts = System.currentTimeMillis

ts: Long = 1529240368112


In [ ]:
val sqlTs = new java.sql.Timestamp(ts)

sqlTs: java.sql.Timestamp = 2018-06-17 14:59:28.112


In [ ]:
// Build ten Rate records sharing the same `ts` and carrying values 1 to 10.
// NOTE(review): `ts` is a Long (System.currentTimeMillis), whereas the Rate case class shown at the
// top of this notebook declares `timestamp: Timestamp`. The recorded output suggests this cell was
// run against a Rate whose timestamp was a Long — confirm which definition is intended.
val rates = (1 to 10).map(i => Rate(ts, i))

rates: scala.collection.immutable.IndexedSeq[Rate] = Vector(Rate(1529247155834,1), Rate(1529247155834,2), Rate(1529247155834,3), Rate(1529247155834,4), Rate(1529247155834,5), Rate(1529247155834,6), Rate(1529247155834,7), Rate(1529247155834,8), Rate(1529247155834,9), Rate(1529247155834,10))


In [ ]:
// Alias the session as a stable identifier so its implicits (toDS / toDF / $"...") can be imported.
val spark = sparkSession
import spark.implicits._
// Local collection of Rate records as a Dataset.
val ratesDS= rates.toDS
// Pair each record's raw `ts` with the java.sql.Timestamp built from it, as columns "ts" and "sqlts".
// NOTE(review): `new java.sql.Timestamp(ts)` requires `ts` to be a Long, i.e. this assumes a Rate
// whose first field is a Long rather than a Timestamp — confirm against the Rate definition in use.
val tsDS = rates.map{case Rate(ts, value) => (ts, new java.sql.Timestamp(ts))}.toDF("ts", "sqlts")

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1a0895e
import spark.implicits._
ratesDS: org.apache.spark.sql.Dataset[Rate] = [timestamp: bigint, value: bigint]
tsDS: org.apache.spark.sql.DataFrame = [ts: bigint, sqlts: timestamp]


In [ ]:
// Compare the raw millisecond value with the timestamp column cast to Long.
// The recorded output shows the cast yields whole seconds (1529247155 for 1529247155834 ms),
// i.e. the millisecond part is dropped. The cast column keeps the name "sqlts", so the result
// has two columns with that name.
// NOTE(review): this reuses the name `query`, which an earlier cell binds to a StreamingQuery.
val query = tsDS.select($"ts", $"sqlts", $"sqlts".cast("Long"))

query: org.apache.spark.sql.DataFrame = [ts: bigint, sqlts: timestamp ... 1 more field]


In [ ]:
query.show

+-------------+--------------------+----------+
|           ts|               sqlts|     sqlts|
+-------------+--------------------+----------+
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
|1529247155834|2018-06-17 16:52:...|1529247155|
+-------------+--------------------+----------+

