In [None]:
import os

# Scala build and Spark release used to pick the Kafka connector artifact below.
SCALA_VERSION = '2.12'
SPARK_VERSION = '3.1.3'

# PYSPARK_SUBMIT_ARGS = "--packages <kafka connector coordinate> pyspark-shell"
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--packages',
    f'org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{SPARK_VERSION}',
    'pyspark-shell',
])

In [None]:
import findspark
# Runs before the `import pyspark` in the next cell; presumably this is what makes the
# local Spark installation importable from this kernel — confirm against the findspark docs.
findspark.init()

In [None]:
import pyspark


from pyspark.sql import SparkSession
# Local session with master "local[1]"; getOrCreate() reuses a session if one already exists.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkStreamingKafkaBasic")
    .getOrCreate()
)

In [None]:
# Streaming source: subscribe to the "orders" topic on the Kafka broker at localhost:9092.
kafkaDf = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "orders")
    .load()
)

In [None]:
# Show the columns of the raw Kafka source DataFrame before any parsing.
kafkaDf.printSchema()

In [None]:
# Keep only the record payload (cast to text) and the source "timestamp" column.
orderRawDf = kafkaDf.selectExpr(
    "CAST(value AS STRING)",
    "timestamp",
)
orderRawDf.printSchema()

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, DoubleType, DateType, LongType

# Layout used to parse the JSON payload of each order; every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", IntegerType()),
        ("item_id", StringType()),
        ("price", IntegerType()),
        ("qty", IntegerType()),
        ("order_date", LongType()),
        ("state", StringType()),
    )
])

In [None]:
# Replace the JSON text in "value" with a struct parsed according to `schema`.
jsonDf = orderRawDf.withColumn(
    "value",
    F.from_json(F.col("value"), schema),
)
jsonDf.printSchema()

In [None]:
# Promote the fields of the parsed "value" struct to top-level columns.
orderDf = jsonDf.select("value.*")
orderDf.printSchema()

In [None]:
# Derive the event-time column "timestamp" from "order_date": the value is divided
# by 1000 before the cast (i.e. treated as epoch milliseconds) and then truncated
# to the minute.
#
# Built from jsonDf instead of from orderDf itself so the cell is idempotent: the
# previous self-referential form failed on a second run because "order_date" had
# already been dropped from orderDf.
orderDf = (
    jsonDf
    .select(F.col("value.*"))
    .withColumn("timestampTemp", (F.col("order_date") / 1000).cast("timestamp"))
    .withColumn("order_time", F.date_trunc("minute", F.col("timestampTemp")))
    .drop("order_date", "timestampTemp")
    .withColumnRenamed("order_time", "timestamp")
)

orderDf.printSchema()

In [None]:
# Total order amount (price * qty) per state in 5-minute event-time windows,
# with a 1-minute watermark on "timestamp".
orderDf5min = (
    orderDf
    .withColumn("amount", F.col("price") * F.col("qty"))
    .withWatermark("timestamp", "1 minutes")
    .groupBy("state", F.window("timestamp", "5 minutes"))
    .agg(F.sum("amount").alias("amount"))
)

orderDf5min.printSchema()

In [None]:
# Optional debug sink (disabled): uncomment to echo the 5-minute aggregates to the console.
# echoOnconsole = orderDf5min\
#                  .writeStream\
#                 .outputMode("update")\
#                 .format("console")\
#                 .option("truncate", False)\
#                 .start() # start the query. spark will subscribe for data

In [None]:
# Pack every column of each aggregate row into one JSON string named "value".
orderDf5minKafka = orderDf5min.selectExpr("to_json(struct(*)) AS value")

# Publish the aggregates to the "statewise-earning" topic in update mode.
(
    orderDf5minKafka.writeStream
    .format("kafka")
    .outputMode("update")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "statewise-earning")
    .option("checkpointLocation", "file:///tmp/spark31")
    .start()
)

In [None]:
def processBatchData(candleBatchDf, batch_id):
    """Append one micro-batch of windowed totals to the MySQL table ``StatewiseEarning``.

    Used as a ``foreachBatch`` callback.

    Args:
        candleBatchDf: DataFrame holding the micro-batch. It must contain a
            ``window`` struct column, whose ``start`` / ``end`` fields are
            written as ``start_time`` / ``end_time``.
        batch_id: Identifier of the micro-batch; only used in the log line.

    The JDBC user and password can be overridden through the ``STOCKDB_USER``
    and ``STOCKDB_PASSWORD`` environment variables.
    """
    # Two actions run on this batch (count + JDBC write); cache it so the
    # second action does not recompute the whole micro-batch.
    candleBatchDf.persist()
    try:
        row_count = candleBatchDf.count()
        print ("process batch called", batch_id, "writing ", row_count)

        if row_count == 0:
            # Nothing to insert: skip opening a JDBC connection for an empty batch.
            return

        (
            candleBatchDf
            .select('*', F.col("window.*"))
            .withColumnRenamed("start", "start_time")
            .withColumnRenamed("end", "end_time")
            .drop("window")
            .write
            .format("jdbc")
            .mode("append")
            .option("url", "jdbc:mysql://localhost:3306/stockdb?allowPublicKeyRetrieval=true&useSSL=false")
            .option("driver", "com.mysql.jdbc.Driver")
            # NOTE(review): the literal fallbacks keep the notebook runnable as before,
            # but credentials should not live in the notebook — remove the defaults
            # once the environment variables are set.
            .option("user", os.environ.get("STOCKDB_USER", "team"))
            .option("password", os.environ.get("STOCKDB_PASSWORD", "Team1234!"))
            .option("dbtable", "StatewiseEarning")
            .save()
        )
    finally:
        # Always release the cached batch, even if the write fails.
        candleBatchDf.unpersist()
    
# Second streaming query: in append mode, pass each micro-batch to processBatchData.
(
    orderDf5min.writeStream
    .outputMode("append")
    .foreachBatch(processBatchData)
    .start()
)

                                                                                

process batch called 48 writing  0


                                                                                

process batch called 49 writing  18


                                                                                

process batch called 50 writing  0


                                                                                

process batch called 51 writing  0


                                                                                

process batch called 52 writing  1


                                                                                

process batch called 53 writing  0


                                                                                

process batch called 54 writing  0


                                                                                

process batch called 55 writing  0


                                                                                

process batch called 56 writing  0


                                                                                

process batch called 57 writing  0


                                                                                

process batch called 58 writing  0


                                                                                

process batch called 59 writing  0


                                                                                

process batch called 60 writing  0


                                                                                

process batch called 61 writing  0


                                                                                

process batch called 62 writing  0


                                                                                

process batch called 63 writing  0




process batch called 64 writing  0


                                                                                

process batch called 65 writing  23


                                                                                

process batch called 66 writing  0


                                                                                

process batch called 67 writing  0


                                                                                

process batch called 68 writing  0


                                                                                

process batch called 69 writing  0


                                                                                

process batch called 70 writing  0


                                                                                

process batch called 71 writing  0


                                                                                

process batch called 72 writing  0


                                                                                

process batch called 73 writing  0


                                                                                

process batch called 74 writing  0


                                                                                

process batch called 75 writing  0


                                                                                

process batch called 76 writing  0


                                                                                

process batch called 77 writing  0


                                                                                

process batch called 78 writing  0


                                                                                

process batch called 79 writing  19


                                                                                

process batch called 80 writing  0


                                                                                

process batch called 81 writing  0


                                                                                

process batch called 82 writing  0


                                                                                

process batch called 83 writing  0


                                                                                

process batch called 84 writing  0




process batch called 85 writing  0


                                                                                

process batch called 86 writing  0


                                                                                

process batch called 87 writing  0




process batch called 88 writing  0


[Stage 804:>                                                        (0 + 1) / 1]

process batch called 89 writing  0


                                                                                

process batch called 90 writing  0


                                                                                

process batch called 91 writing  0


                                                                                

process batch called 92 writing  23


                                                                                