In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON
# 공통 데이터 위치
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
work_dir=!pwd
work_dir = work_dir[0]


spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# 노트북에서 테이블 형태로 데이터 프레임 출력을 위한 설정을 합니다
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100) # display output columns size

# 로컬 환경 최적화
spark.conf.set("spark.sql.shuffle.partitions", 5) # the number of partitions to use when shuffling data for joins or aggregations.
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")

In [2]:
# Streaming source: subscribe to the raw-data Kafka topic, reading from the
# earliest available offsets.
topic_name = "rawdata"
kafkaReader = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9093",
        "subscribe": topic_name,
        "startingOffsets": "earliest",
    })
    .load()
)


# Schema used by from_json to parse the JSON document carried in each Kafka value.
kafkaSchema = StructType([
    StructField("status", StringType()),
    StructField("id", StringType()),
    StructField("screen_name", StringType()),
    # 64-bit: presumably a Twitter user id, which does not fit a 32-bit integer
    # -- TODO confirm against the producer.
    StructField("user_id", LongType()),
    StructField("profile", StringType()),
    # NOTE(review): DateType keeps only the calendar date. If the payload carries
    # a time of day that must be preserved, this needs TimestampType -- confirm
    # with the producer before changing, since it alters the emitted JSON.
    StructField("time", DateType()),
    StructField("text", StringType()),
    StructField("retweet_count", IntegerType()),
    StructField("favorite_count", IntegerType()),
    # Coordinates are fractional degrees, which an integer column cannot hold;
    # DoubleType still accepts whole-number and null values.
    StructField("lat", DoubleType()),
    StructField("long", DoubleType()),
])


# Step 1: decode the Kafka key/value bytes as strings and parse the JSON value
# into a struct column named "rawtweet" using kafkaSchema.
parsedTweets = kafkaReader.select(
    col("key").cast("string"),
    from_json(col("value").cast("string"), kafkaSchema).alias("rawtweet"),
)

# Step 2: flatten the struct into top-level columns; the tweet id replaces the
# original Kafka key as "key".
kafkaSelector = parsedTweets.selectExpr(
    "rawtweet.id as key",
    "rawtweet.status",
    "rawtweet.screen_name",
    "rawtweet.user_id",
    "rawtweet.profile",
    "rawtweet.time",
    "rawtweet.text",
    "rawtweet.retweet_count",
    "rawtweet.favorite_count",
    "rawtweet.lat",
    "rawtweet.long",
)



In [3]:
# Screens out tweets that are mostly written in other scripts (e.g. Chinese,
# Japanese, Thai): returns None for those, otherwise returns the text unchanged.
def processing(s, max_foreign_runs=10):
    """Return ``s`` unless it contains too many runs of disallowed characters.

    Allowed characters are: space, newline, Hangul (ㄱ-ㅣ, 가-힣), ASCII letters
    and digits, code points U+10000-U+10FFFF (the range the original author
    used for emoji), and the ASCII punctuation listed in the pattern below.
    Every maximal run of consecutive characters outside that set counts once,
    so a long unbroken foreign word is a single run.

    Args:
        s: Text to screen, or None.
        max_foreign_runs: Largest number of disallowed runs still accepted.

    Returns:
        ``s`` when it has at most ``max_foreign_runs`` disallowed runs,
        otherwise None. None input yields None.
    """
    import re

    # A null tweet text arrives as None; findall() would raise TypeError on it.
    if s is None:
        return None

    # Same character set as before, written as raw strings so the regex escapes
    # are no longer invalid Python string escapes. The middle piece is a normal
    # string so that \U... yields the actual astral-plane range endpoints.
    pattern = re.compile(
        r"[^ ㄱ-ㅣ가-힣a-zA-Z0-9\n"
        "\U00010000-\U0010FFFF"
        r"\[\]{}/?.,;:|()*~`!^\-_+<>@#$%&\\='\"]+",
        flags=re.UNICODE,
    )
    if len(pattern.findall(s)) > max_foreign_runs:
        return None
    return s
# Spark UDF wrapper around processing(); produces a nullable string column.
processing_udf = udf(processing, returnType=StringType())

# Determines which group a piece of text is about.
def find_group(s):
    """Return the group name mentioned in ``s``, or None.

    Matching is a case-insensitive substring search. 'blackpink' is checked
    before 'bts', so text mentioning both is labelled 'blackpink'.

    Args:
        s: Text to inspect, or None.

    Returns:
        'blackpink', 'bts', or None when neither occurs (or ``s`` is None).
    """
    # A null tweet text arrives as None; .lower() would raise AttributeError.
    if s is None:
        return None

    lowered = s.lower()
    # Order matters: the first match wins.
    for group in ("blackpink", "bts"):
        if group in lowered:
            return group
    return None
# Spark UDF wrapper around find_group(); produces a nullable string column.
find_group_udf = udf(find_group, returnType=StringType())

# Keep only tweets that were favourited at least once, then derive the
# screened text and the group label from the tweet text.
favoritedTweets = kafkaSelector.where("favorite_count>0")
kafkaSelector = (
    favoritedTweets
    .withColumn("processed_text", processing_udf(col("text")))
    .withColumn("group", find_group_udf(col("text")))
)

# Pack the fields to publish into a single JSON string column named "value".
p_kafkaSelector = kafkaSelector.withColumn(
    "value",
    to_json(
        struct(
            "screen_name",
            "profile",
            "time",
            "processed_text",
            "retweet_count",
            "favorite_count",
            "group",
        )
    ),
)

### write to kafka

In [4]:
def displayStatus(name, query, iterations, sleep_secs):
    """Periodically show the status and last progress of a streaming query.

    Each round clears the cell output, displays a status line and
    ``query.lastProgress``, then waits ``sleep_secs`` seconds. Polling ends
    after ``iterations`` rounds, or earlier once the query reports that it is
    no longer active, so a stopped or failed query is not polled needlessly.

    Args:
        name: Label printed in front of each status line.
        query: Streaming query exposing ``status['message']`` and
            ``lastProgress``; ``isActive`` is honoured when present.
        iterations: Maximum number of polling rounds.
        sleep_secs: Seconds to wait between rounds.
    """
    from time import sleep

    for iteration in range(1, iterations + 1):
        clear_output(wait=True)      # clear the previous contents of the output cell
        display('[' + name + '] Iteration: ' + str(iteration) + ', Status: ' + query.status['message'])
        display(query.lastProgress)  # progress of the most recently executed batch
        # Show the final state once, then stop instead of sleeping through the
        # remaining rounds of a query that has already ended.
        if not getattr(query, "isActive", True):
            break
        sleep(sleep_secs)            # wait before the next poll

In [None]:
import shutil

# Sink: publish the (key, value) pairs to the "processed" Kafka topic.
qname = "kafkaQ"
kafkaWriter_origin = (
    p_kafkaSelector.select("key", "value")
    .writeStream
    .queryName(qname)
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("topic", "processed")
    .outputMode("append")
)

# Start from a clean checkpoint directory on every run. Pure-Python removal
# avoids passing an unquoted path to the shell; a missing directory is ignored.
# NOTE(review): with the checkpoint wiped and the source reading from the
# earliest offsets, each run presumably republishes the whole topic -- confirm
# that this is intended.
checkpointLocation = f"{work_dir}/tmp/{qname}"
shutil.rmtree(checkpointLocation, ignore_errors=True)
kafkaTrigger = (
    kafkaWriter_origin
    .trigger(processingTime="5 second")
    .option("checkpointLocation", checkpointLocation)
)

kafkaQuery = kafkaTrigger.start()

# Always stop the query, including when the status loop is interrupted from the
# notebook or raises; otherwise the stream keeps running in the background.
try:
    displayStatus(qname, kafkaQuery, 1000, 5)
finally:
    kafkaQuery.stop()

'[kafkaQ] Iteration: 66, Status: Processing new data'

{'id': '66ebca71-a511-4b1e-ae2c-667273a2c6e4',
 'runId': 'b098a77b-cd3a-43ca-9221-67ee2aa8de82',
 'name': 'kafkaQ',
 'timestamp': '2022-12-15T13:46:05.000Z',
 'batchId': 63,
 'numInputRows': 1,
 'inputRowsPerSecond': 0.20004000800160032,
 'processedRowsPerSecond': 1.3297872340425532,
 'durationMs': {'addBatch': 610,
  'getBatch': 0,
  'latestOffset': 3,
  'queryPlanning': 21,
  'triggerExecution': 752,
  'walCommit': 50},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[rawdata]]',
   'startOffset': {'rawdata': {'0': 88}},
   'endOffset': {'rawdata': {'0': 89}},
   'latestOffset': {'rawdata': {'0': 89}},
   'numInputRows': 1,
   'inputRowsPerSecond': 0.20004000800160032,
   'processedRowsPerSecond': 1.3297872340425532,
   'metrics': {'avgOffsetsBehindLatest': '0.0',
    'maxOffsetsBehindLatest': '0',
    'minOffsetsBehindLatest': '0'}}],
 'sink': {'description': 'org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@70afd515',
  'numOutputRows': 1}}