# Stream test  
hdfs 에 구매 요청 정보가 들어오는 경우, stream 방식으로 요청을 수신하고 처리  

In [1]:
# 필요 라이브러리 임포트  
import socket
import sys
import os
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
from os.path import abspath
import findspark
import time 

# 환경변수 정의  
scale = 1000 # 1000 만 건 수준
APP_NAME = 'stream-kafka'


# 스파크 생성 
def spark_creation_yarn():
    spark = SparkSession.builder.master('yarn').appName(APP_NAME)\
    .config('spark.driver.cores', '2').config('spark.driver.memory', '4g')\
    .config('spark.num.executors', '2')\
    .config('spark.executor.cores', '2').config('spark.executor.memory', '4g')\
    .config('spark.jars', '/hive-bin/lib/mysql-connector-java-5.1.49-bin.jar')\
    .config('spark.driver.extraClassPath', '/hive-bin/lib/mysql-connector-java-5.1.49-bin.jar')\
    .getOrCreate()
    sc = spark.sparkContext
    return spark

def spark_creation_local():
    kafka_jar_root = "/spark-git/spark/spark-local/notebooks/scenarios/kafka/jar"
    kafka_spark_jar_path = f"{kafka_jar_root}/commons-pool2-2.11.0.jar,{kafka_jar_root}/spark-token-provider-kafka-0-10_2.12-3.2.1.jar,{kafka_jar_root}/spark-streaming-kafka-0-10_2.12-3.2.1.jar,{kafka_jar_root}/spark-sql-kafka-0-10_2.12-3.2.1.jar,/{kafka_jar_root}/kafka-clients-3.2.0.jar"
    spark = SparkSession.builder.master('local[*]').appName(APP_NAME)\
    .config("spark.jars", kafka_spark_jar_path)\
    .config("spark.driver.extraClassPath", f"{kafka_jar_root}/*.jar")\
    .config("spark.executor.extraClassPath", f"{kafka_jar_root}/*.jar")\
    .getOrCreate()
# .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12-3.2.1")\
    #    .config("spark.driver.extraClassPath", kafka_spark_jar_path)\
    #     .config("spark.jars.packages", pydeequ.deequ_maven_coord) \
    # .config("spark.jars.excludes", pydeequ.f2j_maven_coord) \
    sc = spark.sparkContext
    # spark.conf.set("spark.driver.extraClassPath", kafka_spark_jar_path)
    return spark
def spark_creation():
    return spark_creation_local()

def display_output(outTarget, sqlString, interval = 1):
    from IPython.display import display, clear_output
    from pyspark.sql.streaming import StreamingQuery 
    import time 
    if isinstance(outTarget, StreamingQuery):
        while True:
            clear_output(wait=False)
            print(f"Query : {sqlString}")
            # display(spark.sql(sqlString).show())
            print(f"outTarget : {outTarget}")
            # print(f"temp : {(temp[0])}")
            #display(outTarget.select("type").show())
            time.sleep(interval)
    else:
        print("Not instance......")
        spark.sql(sqlString).show()
        
def define_schema():
    from pyspark.sql.types import StructType, StructField, StringType, LongType
    columns = [
        StructField("type", StringType())
        , StructField("qty", LongType())
    ]
    inven_schema = StructType(columns)
    return inven_schema

In [2]:
%%time
spark = spark_creation()

spark



CPU times: user 61 ms, sys: 17.9 ms, total: 78.8 ms
Wall time: 15.3 s


In [3]:
# Subscribe to 1 topic
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "localhost:9092") \
  .option("subscribe", "quickstart-events") \
  .load()

# df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

In [None]:
# df.createOrReplaceTempView("mytable")
query_name = "lines_12"
# lines.createOrReplaceTempView(query_name)
sql = f"select * from mytable "


In [5]:
# activityQuery = df.writeStream.queryName(query_name).format("memory").outputMode("append").start()
# activityQuery.awaitTermination() 
# spark.streams.active

# from time import sleep
# for x in range(3):
#     spark.sql(sql).show(3)
#     sleep(2)

In [4]:
outQ = df \
    .withWatermark("timestamp", "5 seconds")\
    .writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("mytable") \
    .start()

# outQ.awaitTermination()


In [8]:
from IPython.display import display, clear_output
import time  

while True:
    clear_output(wait=True)
    display(outQ.status)
    display(outQ.lastProgress)
    display(spark.sql(f'SELECT * FROM mytable').show())
    time.sleep(10)

{'message': 'Getting offsets from KafkaV2[Subscribe[quickstart-events]]',
 'isDataAvailable': False,
 'isTriggerActive': True}

{'id': '4cd1f631-4f6c-4b22-bde7-e7d17a20b258',
 'runId': '43ea08d1-c63b-4190-8626-80c0cdb56647',
 'name': 'mytable',
 'timestamp': '2022-06-22T02:29:46.688Z',
 'batchId': 12,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 1, 'triggerExecution': 1},
 'eventTime': {'watermark': '2022-06-22T02:28:40.020Z'},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[quickstart-events]]',
   'startOffset': {'quickstart-events': {'0': 38}},
   'endOffset': {'quickstart-events': {'0': 38}},
   'latestOffset': {'quickstart-events': {'0': 38}},
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0,
   'metrics': {'avgOffsetsBehindLatest': '0.0',
    'maxOffsetsBehindLatest': '0',
    'minOffsetsBehindLatest': '0'}}],
 'sink': {'description': 'MemorySink', 'numOutputRows': 0}}

+----+--------------------+-----------------+---------+------+--------------------+-------------+
| key|               value|            topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------------+---------+------+--------------------+-------------+
|null|       [61 61 61 61]|quickstart-events|        0|    16|2022-06-22 11:17:...|            0|
|null|       [62 62 62 62]|quickstart-events|        0|    17|2022-06-22 11:19:...|            0|
|null|                [61]|quickstart-events|        0|    18|2022-06-22 11:26:...|            0|
|null|                [62]|quickstart-events|        0|    19|2022-06-22 11:26:...|            0|
|null|          [76 76 76]|quickstart-events|        0|    20|2022-06-22 11:26:...|            0|
|null| [64 67 72 67 64 66]|quickstart-events|        0|    21|2022-06-22 11:26:...|            0|
|null|[72 67 66 78 62 6...|quickstart-events|        0|    22|2022-06-22 11:26:...|            0|
|null|[71 71 71 71 7

None

KeyboardInterrupt: 

In [9]:
outQ.processAllAvailable()
outQ.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [9]:
# help(outQ)

In [10]:
spark.stop()