In [1]:
import pandas as pd
import findspark
findspark.init()
import numpy as np
import json
import os
import pyspark
from pyspark.sql import SparkSession, Row, Window
from pyspark.sql.functions import udf, col, array, when, size, spark_partition_id, pandas_udf, PandasUDFType
import pyspark.sql.functions as func
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType, FloatType, DoubleType, LongType, StructType, StructField, IntegerType, TimestampType, BooleanType, DecimalType
import sqlalchemy
from sqlalchemy.dialects import postgresql 
import psutil

# Kafka topic consumed by this notebook (used for schema inference and for
# the streaming subscription).
TOPIC = "session_meta"


In [2]:
# Create (or reuse) the notebook's local Spark session. The Kafka SQL
# connector is pulled in through spark.jars.packages at start-up.
spark = (
    SparkSession.builder
    .appName("SoDa-TAP")
    # Run locally, using every available core.
    .master("local[*]")
    .config("spark.local.dir", "/home/jovyan/sodatap")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config('spark.sql.repl.eagerEval.enabled', True)
    # Kafka source for Structured Streaming (Scala 2.12 / Spark 3.2.0 build).
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .config('spark.streaming.kafka.consumer.cache.enabled', 'false')
    .config('spark.kryoserializer.buffer.max', '2000M')
    .config('spark.driver.maxResultSize', '1G')
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-17038993-de59-46ca-98b5-2198367467a9;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

In [3]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [4]:
def write_sink(df, epoch_id, table="sessions",
               url='crate://129.128.184.214:4200', chunksize=100):
    """Append one streaming micro-batch to a SQL table through pandas.

    Has the ``(df, epoch_id)`` shape expected of a ``foreachBatch`` callback.
    The destination used to be hard-coded; it is now configurable through
    keyword arguments whose defaults reproduce the previous behaviour
    (bind other values with ``functools.partial`` if needed).

    Args:
        df: Micro-batch DataFrame; only its ``toPandas()`` method is used.
        epoch_id: Batch identifier passed in by the caller; not used.
        table: Name of the destination table.
        url: SQLAlchemy connection URL of the destination database.
        chunksize: Number of rows written per insert batch.
    """
    # toPandas() materialises the whole micro-batch as a pandas DataFrame
    # before it is written out.
    batch_pdf = df.toPandas()
    batch_pdf.to_sql(table, url, if_exists='append', index=False, chunksize=chunksize)

In [5]:
def infer_topic_schema_json(topic):
    """Infer the schema of the JSON messages stored in a Kafka topic.

    Runs a batch read of ``topic`` starting at the earliest offset, casts
    each message value to a string and lets Spark's JSON reader infer a
    schema from those strings.

    Args:
        topic: Name of the Kafka topic to read.

    Returns:
        The inferred schema serialised as a JSON string (the result of
        ``schema.json()``).
    """
    # A batch Kafka read is not rate-limited: every available record of the
    # topic is scanned. The "maxOffsetsPerTrigger" option that used to be set
    # here only applies to streaming queries; Spark ignored it and logged
    # "maxOffsetsPerTrigger option ignored in batch queries", so it is gone.
    df_json = (
        spark.read.format("kafka")
        .option("kafka.bootstrap.servers", "broker:29092")
        .option("subscribe", topic)
        .option("startingOffsets", "earliest")
        .option("failOnDataLoss", "false")
        .load()
        # Kafka delivers the payload as binary; decode it to text.
        .withColumn("value", F.expr("string(value)"))
        .select("value")
    )

    # Feed the raw JSON strings to the JSON reader to obtain a schema.
    df_read = spark.read.json(df_json.rdd.map(lambda x: x.value), multiLine=True)
    return df_read.schema.json()

In [6]:
# Set to False to reuse the schema cached at `schema_location` instead of
# re-inferring it from Kafka on every run.
infer_schema = True
schema_location = "meta_schema.json"

if not infer_schema:
    try:
        with open(schema_location, 'r') as f:
            # The cache file holds the schema JSON *string* (it is written
            # with json.dump below), so json.load yields a str here.
            topic_schema_txt = json.load(f)
    except (OSError, ValueError):
        # Cache missing, unreadable or not valid JSON (json.JSONDecodeError
        # is a ValueError): fall back to inference. Anything else, such as
        # KeyboardInterrupt, is deliberately allowed to propagate.
        infer_schema = True

if infer_schema:
    topic_schema_txt = infer_topic_schema_json(TOPIC)
    # Refresh the cache so a later run can skip inference.
    with open(schema_location, 'w') as f:
        json.dump(topic_schema_txt, f)

# Rebuild the StructType from its JSON description.
topic_schema = StructType.fromJson(json.loads(topic_schema_txt))

22/09/26 21:44:47 WARN KafkaSourceProvider: maxOffsetsPerTrigger option ignored in batch queries
22/09/26 21:44:53 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
                                                                                

In [None]:
# Streaming read of the topic: decode each Kafka message value to text, parse
# it with the inferred schema and flatten the parsed struct into columns.
json_results = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", TOPIC)
    .option("maxOffsetsPerTrigger", "100000")
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "false")
    .load()
    .withColumn("value", F.expr("string(value)"))
    .select("value")
    .withColumn('value', F.from_json(col("value"), topic_schema))
    .select("value.*")
)

# Debugging tip: to inspect the stream instead of writing it out, replace the
# foreachBatch sink below with a console sink (.format("console")).
#
# NOTE(review): no checkpointLocation is configured and startingOffsets is
# "earliest"; presumably a restart replays the topic and appends the same
# rows to the sink again -- confirm whether that is intended.
debug_sink = (
    json_results.writeStream
    .foreachBatch(write_sink)
    .start()
)

# Block this cell until the streaming query stops or fails.
debug_sink.awaitTermination()