Fetching contributors…
Cannot retrieve contributors at this time
114 lines (95 sloc) 3.99 KB
package com.vishnuviswanath.spark.streaming
import java.sql.Timestamp
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._
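/**
 * Created by vviswanath on 1/10/18.
 *
 * Spark streaming application that reads from a Kafka source.
 *
 * To set up Kafka:
 * Download from https://kafka.apache.org/downloads
 * start zookeeper:
 * bin/zookeeper-server-start.sh config/zookeeper.properties
 * start kafka broker(s):
 * bin/kafka-server-start.sh config/server.properties
 * create kafka topics:
 * bin/kafka-topics.sh --create --topic "cars" --replication-factor 1 --partitions 1 --zookeeper localhost:2181
 * bin/kafka-topics.sh --create --topic "fastcars" --replication-factor 1 --partitions 1 --zookeeper localhost:2181
 * produce to a topic from the console:
 * bin/kafka-console-producer.sh --topic fastcars --broker-list localhost:9092
 * consume a topic from the console (the bootstrap server is the broker, not zookeeper):
 * bin/kafka-console-consumer.sh --topic fastcars --bootstrap-server localhost:9092
 * describe:
 * bin/kafka-topics.sh --zookeeper localhost:2181 --describe --topic cars
 * bin/kafka-topics.sh --zookeeper localhost:2181 --describe --topic fastcars
 */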
object KafkaSourceStreaming {
/**
  * Typed form of one car record.
  *
  * @param carId        identifier of the car, taken verbatim from the raw record
  * @param speed        speed reading; always `Some` when built through `CarEvent(rawStr)`
  * @param acceleration acceleration reading; always `Some` when built through `CarEvent(rawStr)`
  * @param timestamp    event time, built from an epoch-milliseconds value
  */
case class CarEvent(carId: String, speed: Option[Int], acceleration: Option[Double], timestamp: Timestamp)

object CarEvent {

  /**
    * Parses a comma-separated record of the form `carId,speed,acceleration,timestampMillis`.
    *
    * The numeric fields are trimmed before parsing so that records written with spaces after the
    * commas (e.g. `"car1, 80, 1.5, 1000"`) are accepted; `carId` is kept exactly as received.
    * Fields beyond the fourth are ignored.
    *
    * @param rawStr the raw record text
    * @return the parsed event
    * @throws IllegalArgumentException if `rawStr` is null or has fewer than four fields
    * @throws NumberFormatException    if speed, acceleration or timestamp is not a valid number
    */
  def apply(rawStr: String): CarEvent = {
    // A null value would otherwise surface as a bare NullPointerException from split.
    require(rawStr != null, "car event record must not be null")

    val parts = rawStr.split(",")
    // Fail with a message that names the offending record instead of an index-out-of-bounds error.
    require(
      parts.length >= 4,
      s"expected 4 comma-separated fields (carId,speed,acceleration,timestamp) but got ${parts.length}: '$rawStr'"
    )

    CarEvent(
      parts(0),
      Some(parts(1).trim.toInt),
      Some(parts(2).trim.toDouble),
      new Timestamp(parts(3).trim.toLong)
    )
  }
}
/**
  * Application entry point: describes a Kafka-backed streaming pipeline and then waits on the
  * session's streaming queries.
  *
  * As far as the visible statements show, the flow is:
  *  - start from `SparkSession.builder()` and import the session implicits;
  *  - configure a source with broker `localhost:9092`, subscribed to topic `cars`;
  *  - cast each record's `value` to a string and turn it into a `CarEvent`;
  *  - apply a 3 second watermark on `timestamp`, then group by a 4 second tumbling window and `carId`;
  *  - describe two outputs: one with `truncate=false` named "kafka spark streaming console", and one
  *    targeting topic `fastcars` with `carId` as key and `speed` as value, checkpointed under
  *    `/tmp/sparkcheckpoint/` in "update" output mode;
  *  - call `spark.streams.awaitAnyTermination()`.
  *
  * NOTE(review): several call chains in this method look truncated (see the inline notes);
  * confirm each of them against the complete file before relying on this description.
  *
  * @param args command-line arguments; not referenced by any visible statement
  */
def main(args: Array[String]): Unit = {
//create a spark session, and run it on local mode
// NOTE(review): nothing after builder() is visible here (no app name, master or session creation call),
// although the comment above mentions local mode — confirm the rest of this chain.
val spark = SparkSession.builder()
import spark.implicits._
//read the source
// NOTE(review): only the two options are visible in this chain; no step that opens a streaming reader,
// selects the kafka format or loads the DataFrame appears — confirm.
val df: DataFrame = spark
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "cars")
//.schema(schema) : we cannot set a schema for kafka source. Kafka source has a fixed schema of (key, value)
// Keep only the record value as text and parse it through the CarEvent companion.
// NOTE(review): the lambda in the map call below appears to be missing its arrow between `r` and
// `CarEvent(...)` — confirm.
val cars: Dataset[CarEvent] = df
.selectExpr("CAST(value AS STRING)")
.map(r CarEvent(r.getString(0)))
//aggregation without window
// NOTE(review): the block comment opened on the next line has no visible terminator, and the
// un-windowed aggregation it presumably disabled is not shown — confirm.
/*val aggregates = cars
// Windowed aggregation keyed by event-time window and car id, tolerating 3 seconds of lateness.
// NOTE(review): no aggregate function is visible after groupBy, yet the Kafka output below selects a
// `speed` column — confirm which aggregation is applied.
val aggregates = cars
.withWatermark("timestamp", "3 seconds")
//.groupBy(window($"timestamp","4 seconds","1 seconds"), $"carId") //sliding window of size 4 seconds, that slides every 1 second
.groupBy(window($"timestamp","4 seconds"), $"carId") //tumbling window of size 4 seconds (event time)
//.groupBy(window(current_timestamp(),"4 seconds"), $"carId") //Use processing time.
//.where("speed > 70")
// First output: console-style query that prints untruncated fields.
// NOTE(review): no writer, format or start call is visible in this chain — confirm.
val writeToConsole = aggregates
.option("truncate", "false") //prevent trimming output fields
.queryName("kafka spark streaming console")
// Second output: key/value pairs (carId -> speed, both cast to strings) for the "fastcars" topic.
// NOTE(review): no writer, format, broker option or start call is visible in this chain — confirm.
val writeToKafka = aggregates
.selectExpr("CAST(carId AS STRING) AS key", "CAST(speed AS STRING) AS value")
.option("topic", "fastcars")
//.option("startingOffsets", "earliest") //earliest, latest or offset location. default latest for streaming
//.option("endingOffsets", "latest") // used only for batch queries
.option("checkpointLocation", "/tmp/sparkcheckpoint/") //required for sinks other than memory or console
.queryName("kafka spark streaming kafka")
//.outputMode("complete") // output everything
//.outputMode("append") // only supported when we set watermark. output only new
.outputMode("update") //output new and updated
spark.streams.awaitAnyTermination() //running multiple streams at a time