In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit, from_csv, to_timestamp, when
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from time import sleep


# Spark configuration: standalone master URL, application name and
# driver/executor sizing, built as one fluent chain.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("stream_assignment_57")
    .set("spark.driver.memory", "4g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Point the gs:// scheme at the Google Cloud Storage connector classes
# in the Hadoop filesystem configuration of the running context.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set(
    "fs.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
)
conf.set(
    "fs.AbstractFileSystem.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
)

# Location of the static donors table in Google Cloud Storage
# (Donors.csv has to be uploaded to the bucket first).
gsc_file_path = 'gs://in_assignment_2/Donors.csv'

# Every donors column is declared as a nullable string.
dataSchema_donor = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "Donor_ID",
        "Donor_City",
        "Donor_State",
        "Donor_Is_Teacher",
        "Donor_Zip",
    )
])

# Read the CSV from Google Storage with the explicit schema; the first
# line of the file is treated as a header.
donors = (
    spark.read
    .schema(dataSchema_donor)
    .options(header="true")
    .csv(gsc_file_path)
)
donors.printSchema()


# Schema used to parse one donation record; all fields are nullable.
dataSchema_donation = (
    StructType()
    .add("Project_ID", StringType(), True)
    .add("Donation_ID", StringType(), True)
    .add("Donor_ID", StringType(), True)
    .add("Donation_Included_Optional_Donation", StringType(), True)
    .add("Donation_Amount", FloatType(), True)
    .add("Donor_Cart_Sequence", IntegerType(), True)
    .add("Donation_Received_Date", StringType(), True)
)

def read_kafka(topic, dataSchema, bootstrap_servers="kafka1:9093", starting_offsets="latest"):
    """Open a streaming DataFrame over a Kafka topic whose values are CSV lines.

    Args:
        topic: Name of the Kafka topic to subscribe to.
        dataSchema: StructType describing the CSV fields of each message value.
        bootstrap_servers: Kafka bootstrap server list; defaults to the
            broker this notebook has always used.
        starting_offsets: Value for the Kafka source ``startingOffsets``
            option; defaults to ``"latest"``.

    Returns:
        A streaming DataFrame with one column per field of ``dataSchema``.
    """
    kafkaStream = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("failOnDataLoss", "false")
        .option("subscribe", topic)
        .option("startingOffsets", starting_offsets)
        .load()
    )

    # Cast the message value to a string so it can be parsed as CSV.
    raw_lines = kafkaStream.selectExpr("CAST(value AS STRING)")

    # Give the parsed struct an explicit alias instead of relying on the
    # auto-generated column name ("from_csv(value)"), which depends on how
    # Spark renders the expression.
    parsed = raw_lines.select(
        from_csv(col("value"), dataSchema.simpleString()).alias("record")
    )

    # Flatten the struct into top-level columns.
    return parsed.select("record.*")

# read data from kafka topic
donations = read_kafka("data", dataSchema_donation)

# join the static donors table with the donation stream on Donor_ID
donation_donor = donors.join(donations, "Donor_ID")

# create the event time column by parsing the received-date string
# (the former selectExpr("*") projection was a no-op and has been dropped)
withEventTimedf = donation_donor.withColumn(
    "event_time", to_timestamp("Donation_Received_Date")
)

withEventTimedf.printSchema()

# query: average donation amount per donor city and state, computed over
# 10-second event-time windows
avgscoredf = withEventTimedf \
    .groupBy(window(col("event_time"), "10 seconds"), "Donor_City", "Donor_State") \
    .agg(avg('Donation_Amount').alias("value"))

# Shape the result for the Kafka sink: key = "<city> <state>", value = the
# average rendered as a string.
# NOTE(review): the window column is not part of the key, so rows that belong
# to different windows of the same city/state are written under the same key,
# and the sink runs in "complete" output mode -- confirm this is intended.
resultdf = avgscoredf.select(
    concat(col("Donor_City"), lit(" "), col("Donor_State")).alias("key"),
    col("value").cast("string"),
)

resultdf.printSchema()

# write the result to a kafka topic
query = resultdf \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("checkpointLocation", "/home/jovyan/checkpoint") \
    .option("topic", "cost_p") \
    .outputMode("complete") \
    .start()

# Block until the stream ends. Cleanup lives in `finally` so the query and
# the Spark application are stopped even when the query fails, not only
# when the kernel is interrupted.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end the stream;
    # swallow it so the cell finishes without a traceback.
    pass
finally:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")

root
 |-- Donor_ID: string (nullable = true)
 |-- Donor_City: string (nullable = true)
 |-- Donor_State: string (nullable = true)
 |-- Donor_Is_Teacher: string (nullable = true)
 |-- Donor_Zip: string (nullable = true)

root
 |-- Donor_ID: string (nullable = true)
 |-- Donor_City: string (nullable = true)
 |-- Donor_State: string (nullable = true)
 |-- Donor_Is_Teacher: string (nullable = true)
 |-- Donor_Zip: string (nullable = true)
 |-- Project_ID: string (nullable = true)
 |-- Donation_ID: string (nullable = true)
 |-- Donation_Included_Optional_Donation: string (nullable = true)
 |-- Donation_Amount: float (nullable = true)
 |-- Donor_Cart_Sequence: integer (nullable = true)
 |-- Donation_Received_Date: string (nullable = true)
 |-- event_time: timestamp (nullable = true)

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)

None
Stoped the streaming query and the spark context


In [2]:
# Stop the Spark session created in the first cell.
# NOTE(review): presumably kept as a manual fallback for runs where the
# streaming cell ends without reaching its own cleanup -- confirm.
spark.stop()
