In [1]:
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, TimestampType,  StringType, StructField, StructType}
import  org.apache.spark.sql._
import  org.apache.spark.sql.functions._

Initializing Scala interpreter ...

Spark Web UI available at http://192.168.80.128:4043
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1647893264014)
SparkSession available as 'spark'


import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, TimestampType, StringType, StructField, StructType}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._


In [2]:
// Streaming source: subscribes to the `stock-ticks` topic via the broker at hadoop-vm:9092.
// Kafka consumer settings are only forwarded to the consumer when the option key carries the
// "kafka." prefix; the bare "group.id" key was ignored, so the intended group id never applied.
// NOTE(review): with a fixed group id, other queries or consumers sharing the same id can
// interfere with each other — keep this id unique to this query.
val kafkaDf = spark.readStream.format("kafka")
  .option("kafka.bootstrap.servers", "hadoop-vm:9092")
  .option("subscribe", "stock-ticks")
  .option("kafka.group.id", "stock-ticks-group-sg5")
  .load()

kafkaDf: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [3]:
// Print the column layout exposed by the Kafka source as a tree.
println(kafkaDf.schema.treeString)

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [4]:
// Keep only the message payload (cast from binary to string) and the Kafka record timestamp.
val ticksDf = kafkaDf.select(
  col("value").cast(StringType),
  col("timestamp")
)

ticksDf.printSchema()

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



ticksDf: org.apache.spark.sql.DataFrame = [value: string, timestamp: timestamp]


In [5]:
// Layout used to parse each JSON tick message; every field is declared nullable.
val schema = new StructType()
  .add("symbol", StringType, nullable = true)
  .add("price", DoubleType, nullable = true)
  .add("volume", LongType, nullable = true)
  .add("timestamp", LongType, nullable = true)

schema: org.apache.spark.sql.types.StructType = StructType(StructField(symbol,StringType,true), StructField(price,DoubleType,true), StructField(volume,LongType,true), StructField(timestamp,LongType,true))


In [6]:
// Parse the JSON text into a struct column (still named `value`) using `schema`;
// the Kafka record timestamp is carried along unchanged.
val jsonDf = ticksDf.select(
  from_json(col("value"), schema).as("value"),
  col("timestamp")
)

jsonDf.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- symbol: string (nullable = true)
 |    |-- price: double (nullable = true)
 |    |-- volume: long (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)



jsonDf: org.apache.spark.sql.DataFrame = [value: struct<symbol: string, price: double ... 2 more fields>, timestamp: timestamp]


In [8]:
// Flatten the parsed struct so its fields become top-level columns.
// Only the struct's fields are selected, so the Kafka record timestamp is not kept.
val stockTickDf = jsonDf.select("value.*")
stockTickDf.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- price: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- timestamp: long (nullable = true)



stockTickDf: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 2 more fields]


In [9]:
// Replace the numeric `timestamp` with a timestamp-typed value truncated to the minute.
// The value is divided by 1000 before the cast (presumably epoch milliseconds -> seconds;
// confirm against the producer).
val stockTickDf1 = stockTickDf.withColumn(
  "timestamp",
  date_trunc("minute", (col("timestamp") / 1000).cast("timestamp"))
)

stockTickDf1: org.apache.spark.sql.DataFrame = [symbol: string, price: double ... 2 more fields]


In [11]:
import org.apache.spark.sql.streaming.Trigger

// Start a streaming query that writes the ticks to HDFS as CSV files (with header rows),
// laid out in year/month/day/hour/SYMBOL partition directories derived from `timestamp`.
// - date_format produces strings ("yyyy", "MM", "dd", "HH"), which become the directory values.
// - With Spark's default case-insensitive column resolution, withColumn("SYMBOL", ...)
//   replaces the existing `symbol` column instead of adding a second one.
// - The former "truncate" option was removed: it is a console-sink setting and the CSV
//   file sink does not use it.
// The parenthesised expression evaluates to the StreamingQuery handle returned by start().
(stockTickDf1
    .withColumn("year", date_format(col("timestamp"), "yyyy"))
    .withColumn("month", date_format(col("timestamp"), "MM"))
    .withColumn("day", date_format(col("timestamp"), "dd"))
    .withColumn("hour", date_format(col("timestamp"), "HH"))
    .withColumn("SYMBOL", col("symbol"))
    .writeStream
    .trigger(Trigger.ProcessingTime("65 seconds"))
    .queryName("write to csv")
    .format("csv")
    .option("path", "hdfs://localhost:9000/dump-1min-scala")
    .option("header", true)
    .option("checkpointLocation", "hdfs://localhost:9000/checkpoint/tickscsvtohdfs6")
    .partitionBy("year", "month", "day", "hour", "SYMBOL")
    .start()
)

import org.apache.spark.sql.streaming.Trigger
res6: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@1315bad8


In [None]:
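// Disabled draft: foreachBatch-based alternative to the CSV file sink.
// NOTE: `stockTickDf1Min` is not defined in the cells shown here; it must exist (with the
// year/month/day/hour/SYMBOL columns) before this is enabled.
// def processBatchData(candleBatchDf: DataFrame, batch_id: Long): Unit = {
//     print ("process batch called", batch_id, "writing ", candleBatchDf.count())
//     (candleBatchDf
//          .coalesce(1)
//          .write
//          .format("csv")   // assumed from the "header" option; without it save() uses the default source
//          .mode("append")
//          .partitionBy("year", "month", "day", "hour", "SYMBOL")
//          .option("header", true)
//          .save("hdfs://localhost:9000/dump-1min-scala")
//                           )
//  } 
// stockTickDf1Min.writeStream.foreachBatch(processBatchData _).outputMode("append").start()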