In [None]:
# setup-- 
import os
import pyspark
from splicemachine.spark.context import PySpliceContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
# Host of the Splice Machine JDBC endpoint; a missing JDBC_HOST variable
# raises KeyError here rather than producing a malformed URL further down.
jdbc_host = os.environ['JDBC_HOST']

conf = pyspark.SparkConf()
# getOrCreate reuses an already-active SparkContext, so re-running this cell
# (or running it in a kernel that pre-created a context) does not fail the way
# calling the SparkContext constructor a second time does.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# JDBC connection string handed to the Splice Machine adapter.
splicejdbc = f"jdbc:splice://{jdbc_host}:1527/splicedb;user=splice;password=admin"

# Splice Machine context bound to the Spark session created above.
splice = PySpliceContext(spark, splicejdbc)


In [None]:
%%sql
%defaultDatasource jdbc:splice://jrtest01-splice-hregion:1527/splicedb;user=splice;password=admin

<link rel="stylesheet" href="https://doc.splicemachine.com/zeppelin/css/zepstyles.css" />

# Spark Stream
Spark Streaming is used to read data from a Kafka queue; the data is then ingested into Splice Machine using the Splice Machine Adapter.

The steps involved are:

* Create a Spark direct stream to consume the data from the Kafka queue
* Initialize the Splice Machine Adapter
* As data is streamed in, blocks of the data are parsed and ingested into Splice Machine using the Adapter

The streaming of data can be viewed in the Spark UI.
In the next notebook, we can query the streamed data as it is ingested in real time.

Before running this notebook, ensure that the broker list and JDBC URL are set appropriately.

In [None]:
%%scala 
import _root_.kafka.serializer.DefaultDecoder
import _root_.kafka.serializer.StringDecoder
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.Row
import scala.collection.mutable.ListBuffer

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer

import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds
import org.apache.spark.sql.SQLContext

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import com.splicemachine.spark.splicemachine._
import com.splicemachine.derby.utils._


// Job settings: Kafka source and Splice Machine target.
// NOTE(review): {FRAMEWORKNAME} in the values below looks like a placeholder
// to be replaced with the deployment's framework name before running - confirm.

// Kafka topic(s) the stream subscribes to
val topics: Array[String] = Array("iotdemo")

// Kafka bootstrap broker(s) and the consumer group id
val brokerList: String = "kafka-0-node.{FRAMEWORKNAME}.mesos:9092"
val consumerGroup: String = "kstest"

// Intended micro-batch interval (presumably seconds - confirm where the
// StreamingContext is created)
val batchInterval: Int = 5

// Splice Machine JDBC connection string and the table receiving the rows
val JDBC_URL: String = "jdbc:splice://{FRAMEWORKNAME}-proxy.marathon.mesos:1527/splicedb;user=splice;password=admin;useSpark=true"
val SPLICE_TABLE_ITEM: String = "IOTDEMO.ITEMFLOW"


//Create Streaming Context
// The batch duration comes from the batchInterval setting instead of a second,
// hard-coded copy of the value, so changing the setting actually takes effect.
val ssc = new StreamingContext(sc, Seconds(batchInterval))

//Set Kafka Queue parameters
// Offsets start from the earliest available record for this consumer group,
// and the consumer's automatic offset commit is disabled.
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> brokerList,
  "group.id" -> consumerGroup,
  "auto.offset.reset" -> "earliest",
  "enable.auto.commit" -> (false: java.lang.Boolean),
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer]
)

//Create Direct Stream
// Subscribes to `topics` with String keys and String values.
val stream = KafkaUtils.createDirectStream[String, String](
  ssc,
  PreferConsistent,
  Subscribe[String, String](topics, kafkaParams)
)


//Parse the queue messages
// Only the record value (a comma-separated line) is used downstream.
val toPair = stream.map(record => (record.key, record.value))
val msgs = toPair.map(_._2)

// Splice Machine adapter and the target table's schema, looked up once.
val splicemachineContext = new SplicemachineContext(JDBC_URL)
val schema_item = splicemachineContext.getSchema(SPLICE_TABLE_ITEM)

// For every micro-batch: parse each CSV line into a Row matching schema_item
// and insert the resulting DataFrame into SPLICE_TABLE_ITEM.
msgs.foreachRDD { rdd =>
  // Check emptiness on the raw RDD up front: this skips empty batches without
  // parsing every record just to count the resulting rows.
  if (!rdd.isEmpty()) {
    val timestampFormat = "yyyy-MM-dd HH:mm:ss"

    //Create dataframes
    val rowRdd_item = rdd.map { line =>
      // limit = -1 keeps trailing empty fields. The default split drops them,
      // which made p(14) (and lower indexes) throw
      // ArrayIndexOutOfBoundsException whenever the last columns were empty.
      val p = line.split(",", -1)

      // Field i, or null when it is empty.
      def field(i: Int): String = if (p(i).isEmpty) null else p(i)
      // Same as field, but a column missing from the end of the record is null.
      def optionalField(i: Int): String = if (i < p.length) field(i) else null
      // Null-preserving conversions to the column types.
      def asLong(s: String): Any = if (s == null) null else s.toLong
      def asDecimal(s: String): Any = if (s == null) null else BigDecimal(s)
      def asTimestamp(s: String): Any =
        if (s == null) null else SpliceDateFunctions.TO_TIMESTAMP(s, timestampFormat)

      Row(
        p(0).toLong,
        asTimestamp(field(1)),
        asTimestamp(field(2)),
        p(3), // passed through as-is: an empty value stays an empty string
        asLong(field(4)),
        asDecimal(field(5)),
        asTimestamp(field(6)),
        asLong(field(7)),
        asTimestamp(field(8)),
        asLong(field(9)),
        asTimestamp(field(10)),
        asDecimal(field(11)),
        asDecimal(field(12)),
        asDecimal(field(13)),
        asTimestamp(field(14)),
        // Columns 15-17 may be absent from the record altogether.
        asTimestamp(optionalField(15)),
        asTimestamp(optionalField(16)),
        asTimestamp(optionalField(17))
      )
    }

    val df_item = sqlContext.createDataFrame(rowRdd_item, schema_item)

    //Use Splice Adapter to insert the data to table
    splicemachineContext.insert(df_item, SPLICE_TABLE_ITEM)
  }
}
    
  
// On JVM shutdown, stop the streaming context gracefully together with the
// underlying SparkContext.
sys.ShutdownHookThread {
  ssc.stop(stopSparkContext = true, stopGracefully = true)
}

// Upper bound, in milliseconds, on how long this cell waits on the stream.
val maxRunMillis = 5000000L

ssc.start()
//ssc.awaitTermination()
// Wait until the stream terminates or the timeout elapses, then stop the
// streaming context gracefully while leaving the SparkContext running.
ssc.awaitTerminationOrTimeout(maxRunMillis)
ssc.stop(stopSparkContext = false, stopGracefully = true)

