
# Spark Producer


## Install Spark




In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

## Set Java and Spark home

In [None]:
import os

# Export the JDK location and the directory of the unpacked Spark distribution
# so that later cells can find them through the process environment.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-2.4.7-bin-hadoop2.7",
)

## Install Kafka Dependencies

In [None]:
!wget https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.11/2.4.7/spark-sql-kafka-0-10_2.11-2.4.7.jar
!wget https://repo1.maven.org/maven2/org/apache/spark/spark-streaming-kafka-0-10-assembly_2.11/2.4.7/spark-streaming-kafka-0-10-assembly_2.11-2.4.7.jar

## Add Kafka Dependencies to the Spark shell

In [None]:
import os

# Hand both downloaded Kafka jars to the PySpark launcher as a single
# comma-separated --jars argument.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--jars %s pyspark-shell" % ",".join((
    "/content/spark-streaming-kafka-0-10-assembly_2.11-2.4.7.jar",
    "/content/spark-sql-kafka-0-10_2.11-2.4.7.jar",
))

## Initialize Spark

In [None]:
# findspark.init() is called before pyspark is imported; keep this order.
# NOTE(review): presumably it makes the Spark distribution referenced by
# SPARK_HOME (set in an earlier cell) importable — confirm against findspark docs.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Get (or create) a session whose master is "local[*]".
spark = SparkSession.builder.master("local[*]").getOrCreate()

## Create Data Producer

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import struct, to_json, expr
import time


class RateToConsoleApp:
    """
    The RateToConsoleApp reads records from an Apache Spark rate (fake) stream and shows them in the console.
    Useful to emulate events.

    Subclasses can send the events elsewhere by overriding ``write_micro_batch``.
    """

    def __init__(self, processing_time):
        """
        Get (or create) the local Spark session and remember the trigger interval.

        :param processing_time: interval between micro-batches, passed as the
            ``processingTime`` trigger (e.g. ``"5 seconds"``).
        """
        self.spark = SparkSession.builder.master("local[*]").getOrCreate()
        print("Spark version is: %s" % self.spark.version)
        print(self.spark.sparkContext.getConf().getAll())
        self.processingTime = processing_time

    @staticmethod
    def write_micro_batch(micro_batch_df, batch_id):
        """
        Print a timestamped header and then the full content of one micro-batch.

        :param micro_batch_df: DataFrame holding the rows of the micro-batch.
        :param batch_id: identifier of the micro-batch, shown in the header.
        """
        ts = time.localtime()
        print("Showing batch %s at %s" % (batch_id, time.strftime("%Y-%m-%d %H:%M:%S", ts)))
        micro_batch_df.show(truncate=False)

    def load(self, output_mode):
        """
        Start the streaming query and block until it terminates.

        The query is stopped when this method exits for any reason (including a
        KeyboardInterrupt from interrupting the notebook cell), so no streaming
        query is left running in the background.

        :param output_mode: output mode handed to the stream writer (e.g. ``"append"``).
        """
        events_df = self.get_events_df()

        query = events_df.writeStream \
            .outputMode(output_mode) \
            .trigger(processingTime=self.processingTime) \
            .foreachBatch(self.write_micro_batch) \
            .start()

        try:
            # Wait on this query only: the session-wide awaitAnyTermination()
            # also reacts to unrelated queries started on the same session.
            query.awaitTermination()
        finally:
            query.stop()

    def get_events_df(self):
        """
        Build the streaming DataFrame of synthetic events.

        :return: streaming DataFrame with two columns: ``key`` (``uuid()``) and
            ``value``, a JSON string with the fields ``ordinal`` (the rate
            counter), ``locationId`` (``value % 3 + 1``), ``timestamp`` and
            ``amount`` (``floor(rand() * 100000000 / 100)``).
        """
        rate_df = self.spark.readStream.format("rate").load()
        events_df = rate_df \
            .withColumn("key", expr("uuid()")) \
            .withColumn("value",
                        to_json(struct(rate_df["value"].alias("ordinal"),
                                       expr("value % 3 +1").alias("locationId"),
                                       rate_df["timestamp"],
                                       expr("floor(rand() * 100000000 / 100)").alias("amount")))) \
            .select("key", "value")

        return events_df


In [None]:
# Show the synthetic events in the console, one micro-batch every 5 seconds.
# load() blocks while the stream is running, so this cell does not return on its own.
x = RateToConsoleApp(processing_time="5 seconds")
x.load(output_mode="append")

## Set Kafka variables

In [None]:
# Connection settings shared by the Kafka cells below.
username = "pujo"
server_ip = "34.87.150.250"
# All three broker listeners live on the same host, on consecutive ports.
bootstrap_servers = ",".join(f"{server_ip}:{port}" for port in (9092, 9093, 9094))
schema_registry_url = "http://%s:8081" % server_ip

## Create Kafka Producer

In [None]:
import time


class RateToKafkaApp(RateToConsoleApp):
    """
    The RateToKafkaApp reads records from an Apache Spark rate (fake) stream and writes them to an Apache Kafka topic.
    Useful to emulate events and send them to Kafka.
    """

    @staticmethod
    def write_micro_batch(micro_batch_df, batch_id):
        """
        Show one micro-batch in the console and write the same rows to Kafka.

        Reads the notebook-level variables ``bootstrap_servers`` and ``username``;
        the target topic is ``<username>-spark-events``.

        :param micro_batch_df: DataFrame holding the rows of the micro-batch.
        :param batch_id: identifier of the micro-batch, shown in the log line.
        """
        ts = time.localtime()
        print("Writting batch %s to kafka, at %s" % (batch_id, time.strftime("%Y-%m-%d %H:%M:%S", ts)))
        # The batch is consumed by two actions (show + write) and its columns are
        # built from non-deterministic expressions (uuid(), rand()). Caching it
        # keeps the rows printed identical to the rows sent to Kafka and avoids
        # computing the batch twice.
        micro_batch_df.persist()
        try:
            micro_batch_df.show(truncate=False)
            micro_batch_df.write \
                .format("kafka") \
                .option("kafka.bootstrap.servers", bootstrap_servers) \
                .option("topic", f"{username}-spark-events") \
                .save()
        finally:
            micro_batch_df.unpersist()

## Produce data to Kafka

In [None]:
# Send the synthetic events to Kafka, one micro-batch every 5 seconds.
# load() blocks while the stream is running, so this cell does not return on its own.
x = RateToKafkaApp("5 seconds")
x.load(output_mode="append")