스파크 세션을 생성하고 버전을 확인합니다.
상세한 가이드는 Spark Streaming + Kafka Integration Guide (Kafka broker version 0.10.0 or higher) - Spark 3.2.1 Documentation 을 참고합니다.

In [17]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from IPython.display import display, display_pretty, clear_output, JSON
import os
from urllib.parse import quote_plus

# Shared data locations
home_jovyan = "/home/jovyan"
work_data = f"{home_jovyan}/work/data"
# Directory the notebook runs in; checkpoint paths are built under it
work_dir = os.getcwd()

# MongoDB connection settings.
# Credentials are read from the environment so they do not have to live in the
# notebook; the defaults reproduce the previous hard-coded local values.
mongo_user = os.environ.get("MONGO_USER", "root")
mongo_password = os.environ.get("MONGO_PASSWORD", "123456")
mongo_host = os.environ.get("MONGO_HOST", "127.0.0.1")
# quote_plus keeps the URI valid when the user name or password contains
# reserved characters such as '@' or ':'
mongo_conn = (
    f"mongodb://{quote_plus(mongo_user)}:{quote_plus(mongo_password)}"
    f"@{mongo_host}:27017"
)

spark = (
    SparkSession
    .builder
    .config("spark.sql.session.timeZone", "Asia/Seoul")
    .getOrCreate()
)

# Render DataFrames as tables when they are displayed in the notebook
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)  # display enabled
spark.conf.set("spark.sql.repl.eagerEval.truncate", 100)  # display output columns size

# Tuning for a local environment
spark.conf.set("spark.sql.shuffle.partitions", 5)  # partitions used when shuffling for joins or aggregations
spark.conf.set("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")

# Default MongoDB write target
spark.conf.set("spark.mongodb.write.connection.uri", mongo_conn)
spark.conf.set("spark.mongodb.write.database", "tweetDB")
spark.conf.set("spark.mongodb.write.collection", "test")

# Display the session info (e.g. to find the port of the running Spark application)
spark

##### 노트북에서 스파크 스트리밍 상태 및 데이터 조회를 위한 함수

In [2]:
# 스트림 테이블을 주기적으로 조회하는 함수 (name: 이름, sql: Spark SQL, iterations: 반복횟수, sleep_secs: 인터벌)
# 기존 출력 cell을 지우고, 이름, 현재 반복 횟수, 현재 수행 쿼리를 출력합니다.
# 쿼리 결과를 출력하고 sleep sec초 만큼 대기하는 과정을 iteration 번 반복해서 출력합니다.
def displayStream(name, sql, iterations, sleep_secs):
    """Repeatedly run a Spark SQL query and show its result in the cell.

    Each round clears the cell output, shows a header line with the name,
    the 1-based round number and the query text, shows the query result,
    and then waits before the next round.

    Args:
        name: Label shown in the header line.
        sql: Spark SQL text passed to ``spark.sql``.
        iterations: Number of rounds to run.
        sleep_secs: Seconds to wait after each round.
    """
    from time import sleep

    for round_no in range(1, iterations + 1):
        clear_output(wait=True)  # wipe the previous round's output
        header = '[' + name + '] Iteration: ' + str(round_no) + ', Query: ' + sql
        display(header)
        result = spark.sql(sql)  # run the query
        display(result)
        sleep(sleep_secs)  # wait before the next round

# 스트림 쿼리의 상태를 주기적으로 조회하는 함수 (name: 이름, query: Streaming Query, iterations: 반복횟수, sleep_secs: 인터벌)
def displayStatus(name, query, iterations, sleep_secs):
    """Repeatedly show the status of a streaming query in the cell.

    Each round clears the cell output, shows a header line with the name,
    the 1-based round number and ``query.status['message']``, shows
    ``query.lastProgress``, and then waits before the next round.

    Args:
        name: Label shown in the header line.
        query: Streaming query whose ``status`` and ``lastProgress`` are read.
        iterations: Number of rounds to run.
        sleep_secs: Seconds to wait after each round.
    """
    from time import sleep

    for round_no in range(1, iterations + 1):
        clear_output(wait=True)  # wipe the previous round's output
        header = (
            '[' + name + '] Iteration: ' + str(round_no)
            + ', Status: ' + query.status['message']
        )
        display(header)
        display(query.lastProgress)  # progress of the most recent batch
        sleep(sleep_secs)  # wait before the next round

### kafka reader 생성

카프카로부터 메시지 수신을 위한 카프카 리더를 생성합니다.
earliest 옵션으로 kafka:9093 의 movies 토픽으로부터 메시지를 읽어와서 데이터 변환을 수행합니다.

In [18]:
# kafkaReader 생성 (spark.readStream) -> 카프카에서 메세지 수신
# server.properties 에서 advertised.listeners=INSIDE://kafka:9093,OUTSIDE://localhost:9092

# Kafka source settings: broker address, topic to subscribe to, and where to
# start reading when no offsets have been recorded yet
_kafka_source_settings = (
    ("kafka.bootstrap.servers", "kafka:9093"),
    ("subscribe", "movies"),
    ("startingOffsets", "earliest"),
)

_kafka_source = spark.readStream.format("kafka")
for _setting_name, _setting_value in _kafka_source_settings:
    _kafka_source = _kafka_source.option(_setting_name, _setting_value)

# Streaming DataFrame of raw Kafka records
kafkaReader = _kafka_source.load()
kafkaReader.printSchema()


root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



들어오는 데이터 스키마를 명시적으로 정의해줍니다.
kafkaSelector 를 추가해서 카프카에서 읽어들인 데이터(kafkaReader)의 value 를 JSON 으로 파싱하고, 트윗 id 를 key 컬럼으로 추가합니다.

In [19]:
# (field name, Spark type) pairs describing the JSON payload of each record.
# NOTE(review): lat/long and user_id are declared as IntegerType and time as
# DateType - confirm against the producer's payload (fractional coordinates,
# large ids or full timestamps would not fit these types).
_tweet_fields = [
    ("status", StringType()),
    ("id", StringType()),
    ("screen_name", StringType()),
    ("user_id", IntegerType()),
    ("profile", StringType()),
    ("time", DateType()),
    ("text", StringType()),
    ("retweet_count", IntegerType()),
    ("favorite_count", IntegerType()),
    ("lat", IntegerType()),
    ("long", IntegerType()),
]

kafkaSchema = StructType(
    [StructField(_field_name, _field_type) for _field_name, _field_type in _tweet_fields]
)

# Parse the Kafka value as JSON into a struct column named "movies"
_parsed_records = kafkaReader.select(
    col("key").cast("string"),
    from_json(col("value").cast("string"), kafkaSchema).alias("movies"),
)

# Flatten the struct: the tweet id becomes the key, followed by every other
# field in schema order
kafkaSelector = _parsed_records.selectExpr(
    "movies.id as key",
    *[f"movies.{_field_name}" for _field_name, _ in _tweet_fields if _field_name != "id"],
)

kafkaSelector.printSchema()

root
 |-- key: string (nullable = true)
 |-- status: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- profile: string (nullable = true)
 |-- time: date (nullable = true)
 |-- text: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- lat: integer (nullable = true)
 |-- long: integer (nullable = true)



In [20]:
import re

# Matches runs of characters that are NOT in the whitelist: space, Hangul jamo
# and syllables, ASCII letters and digits, the punctuation . / % # : and newline.
# Compiled once so the pattern is not rebuilt for every row.
_DISALLOWED_CHARS = re.compile(r'[^ ㄱ-ㅣ가-힣a-zA-Z0-9./%#:\n]+')


def processing(s):
    """Strip every character outside the whitelist from a tweet text.

    Args:
        s: Text to clean, or None.

    Returns:
        The text with all non-whitelisted characters removed, or None when
        ``s`` is None, so rows with a null text do not raise.
    """
    if s is None:
        return None
    return _DISALLOWED_CHARS.sub('', s)
    
# Spark UDF wrapper around processing()
processing_udf = udf(processing, StringType())

# Expose the parsed stream to Spark SQL under the name "kafkaSelector".
# No ORDER BY is applied here: sorting is not supported on a streaming
# DataFrame that has not been aggregated.
kafkaSelector.createOrReplaceTempView("kafkaSelector")

# Clean the tweet body: keeps Hangul, ASCII letters, digits and a few
# punctuation characters (see processing)
kafkaSelector = kafkaSelector.withColumn("processed_text", processing_udf(col("text")))

# Split the stream by tweet status; both are derived from kafkaSelector
quoted_tweet = kafkaSelector.where("status='QUOTED'")
original_tweet = kafkaSelector.where("status='ORIGINAL'")

디버깅을 위해 임시로 콘솔 출력을 통해 검증합니다
스트리밍 데이터는 디버깅이 상당히 어렵기 때문에 새로운 카프카 토픽 혹은 외부에 저장하기 전에 반드시 눈으로 확인해야 합니다

아래 셀은 스트림을 메모리 테이블로 기록한 뒤, 위에서 선언한 displayStream 함수와 같은 방식으로 주기적으로 조회하여 출력합니다.

### test

In [15]:
# 노트북 로그 콘솔로 출력
queryName = "consoleSink7"
kafkaWriter = (
    kafkaSelector.select("*")
    .writeStream
    .queryName(queryName)
    .format("memory")
    .outputMode("append")
)

checkpointLocation = f"{work_dir}/tmp/{queryName}"
!rm -rf $checkpointLocation

kafkaTrigger = (
    kafkaWriter
    .trigger(processingTime="5 second")
    .option("checkpointLocation", checkpointLocation)
)

# 파이썬의 경우 콘솔 디버깅이 노트북 표준출력으로 나오기 때문에, 별도 메모리 테이블로 조회
from time import sleep
import re

# Matches runs of characters that are NOT in the whitelist: space, Hangul jamo
# and syllables, ASCII letters and digits, the punctuation . / % # : and newline.
# Compiled once so the pattern is not rebuilt for every row.
_DISALLOWED_CHARS = re.compile(r'[^ ㄱ-ㅣ가-힣a-zA-Z0-9./%#:\n]+')


def processing(s):
    """Strip every character outside the whitelist from a tweet text.

    Args:
        s: Text to clean, or None.

    Returns:
        The text with all non-whitelisted characters removed, or None when
        ``s`` is None, so rows with a null text do not raise.
    """
    if s is None:
        return None
    return _DISALLOWED_CHARS.sub('', s)
# Spark UDF wrapper around processing()
processing_udf = udf(processing, StringType())
iterations = 4
sleep_secs = 5

kafkaQuery = kafkaTrigger.start()

# try/finally guarantees the streaming query is stopped even when the loop
# raises or the cell is interrupted; otherwise it would keep running
try:
    for _ in range(iterations):
        clear_output(wait=True)  # clear the output cell
        # Query the in-memory table that the streaming query writes to
        df = spark.sql(f"select * from {queryName}")
        df = df.withColumn("processed_text", processing_udf(col("text")))
        # Snapshots of the memory table split by status, ordered by key
        quoted_tweet = df.where("status='QUOTED'").orderBy("key")
        original_tweet = df.where("status='ORIGINAL'").orderBy("key")
        display(quoted_tweet)
        display(original_tweet)
        sleep(sleep_secs)  # wait sleep_secs seconds before the next round
finally:
    kafkaQuery.stop()

key,status,id,screen_name,user_id,profile,time,text,retweet_count,favorite_count,lat,long,processed_text
1594843339698098176,QUOTED,1594843339698098176,miusakamoto,40164217.0,http://pbs.twimg.com/profile_images/1645322449/image_normal.jpg,2009-05-15 03:20:06,BTSの人の歌よかったねー！と話題にのぼるたびとっても嬉しいけど、あのFIFAの曲はメロディー5音ほどで音域も狭いので、グクの歌の本当のテクニックや表現力は、繊細かつパワフルで伸びやかでもっとも...,603,10077,,,BTSFIFA5
1595021927747313665,QUOTED,1595021927747313665,stopbeingdelulu,,http://pbs.twimg.com/profile_images/1595007895925452800/5x4D60Oo_normal.jpg,2022-05-05 06:26:55,8. A BTS song you would like to get played in your funeral.,0,29,,,8. A BTS song you would like to get played in your funeral.
1595228479959728129,QUOTED,1595228479959728129,PopBase,,http://pbs.twimg.com/profile_images/1268086791443230737/BRGz4AiW_normal.jpg,2019-06-11 14:49:15,This year’s AMAs were officially the least watched in the ceremony’s history. (3.3m) https://t.co...,107,2115,,,This years AMAs were officially the least watched in the ceremonys history. 3.3m https://t.co/WMv...
1595231726565208064,QUOTED,1595231726565208064,sugaclair,290594645.0,http://pbs.twimg.com/profile_images/1579120340424589314/ldoW6u8x_normal.jpg,2011-04-30 15:09:43,머치팩#9 알람시계...\n미쳤음...\n달달해서 일어나겠냐...\n\n#머치팩후기 #BTS https://t.co/YbphRW3Nv7,788,1738,,,머치팩#9 알람시계...\n미쳤음...\n달달해서 일어나겠냐...\n\n#머치팩후기 #BTS https://t.co/YbphRW3Nv7
1595250813404073985,QUOTED,1595250813404073985,bts_bighit,1409798257.0,http://pbs.twimg.com/profile_images/1590721340772093952/hlqcqI5d_normal.jpg,2013-05-07 09:13:05,[공지] #RM 콘텐츠 녹화 참여 안내 (+ENG/JPN/CHN)\nhttps://t.co/HIdCB7A46I,28310,77341,,,공지 #RM 콘텐츠 녹화 참여 안내 ENG/JPN/CHN\nhttps://t.co/HIdCB7A46I


key,status,id,screen_name,user_id,profile,time,text,retweet_count,favorite_count,lat,long,processed_text
1595238151865643008,ORIGINAL,1595238151865643008,tata_chimi_95,,http://pbs.twimg.com/profile_images/1535996219352563713/HydW3S5X_normal.jpg,2022-11-23 02:09:48,このタグにすると皆に投票できるんですか？🗳\nもしそうだったら嬉しすぎる！💜\nI vote #jhope #Jimin #Jin #Jungkook #RM #Suga #V of #B...,0,0,,,\n\nI vote #jhope #Jimin #Jin #Jungkook #RM #Suga #V of #BTS for https://t.co/HyD39GDoUO
1595238162397282304,ORIGINAL,1595238162397282304,ojedamicaela5,,http://pbs.twimg.com/profile_images/1595070283244658688/_YHfTQ4o_normal.jpg,2022-11-23 02:09:50,"en qué momento esquizo dijimos con oriana y lucía ""jajaj compremos un juego de shots de bts""",0,0,,,en qu momento esquizo dijimos con oriana y luca jajaj compremos un juego de shots de bts
1595238164754739200,ORIGINAL,1595238164754739200,xyzm_,,http://pbs.twimg.com/profile_images/1490339931528785921/VjT6VH07_normal.jpg,2022-11-23 02:09:51,Hai haii army\nAda GO bts txt dan untuk grup lain juga yang suka drop jajanan murce nih 🤑 \nNama...,0,0,,,Hai haii army\nAda GO bts txt dan untuk grup lain juga yang suka drop jajanan murce nih \nNamany...
1595238165291622401,ORIGINAL,1595238165291622401,jiminluv1365244,,http://pbs.twimg.com/profile_images/1587261189721317378/8LCxdQeC_normal.jpg,2022-11-23 02:09:51,🗳King Choice(11/30)\nHOT 100 KPOP IDOLS RANKING 2022\n\n📍https://t.co/rX4L2rhThu\n\n⭐️지민 1위\n\n...,0,0,,,King Choice11/30\nHOT 100 KPOP IDOLS RANKING 2022\n\nhttps://t.co/rX4L2rhThu\n\n지민 1위\n\n7일 남았어요 ...
1595238166050373633,ORIGINAL,1595238166050373633,yexelyn2573,,http://pbs.twimg.com/profile_images/1581242339728625665/LgzY3Dh__normal.jpg,2022-11-23 02:09:51,JIMIN JIMIN \nI vote #JIMIN of BTS for #NETIZENSREPORT The Most Talented Kpop Idols @thenreport ...,0,0,,,JIMIN JIMIN \nI vote #JIMIN of BTS for #NETIZENSREPORT The Most Talented Kpop Idols thenreport h...
1595238167208230912,ORIGINAL,1595238167208230912,ArMoonBebe,,http://pbs.twimg.com/profile_images/1588722882284240897/ieaNDav__normal.jpg,2022-11-23 02:09:51,@allkpop wheen i hear well rounded and absolute perfection i only know bts jungkook and ONEUS seo...,0,0,,,allkpop wheen i hear well rounded and absolute perfection i only know bts jungkook and ONEUS seoh...
1595238168177291265,ORIGINAL,1595238168177291265,enhypen1226,,http://pbs.twimg.com/profile_images/1591292465444573184/xxN1yImD_normal.jpg,2022-11-23 02:09:52,譲　1.2枚目twice\n求　３枚目enhypenジョンウォン\n\n・🌎は初期傷あり\n・👑は出し惜しみ(レート配慮必ずお願いします)\n\n#newjeansトレカ交換 #enhype...,0,0,,,1.2twice\nenhypen\n\n\n\n\n#newjeans #enhypen #seventeen https://t.co/r8VjJzMnqb
1595238193410244608,ORIGINAL,1595238193410244608,genz_daily,,http://pbs.twimg.com/profile_images/1503065483142381568/GXF9_cUX_normal.jpg,2022-11-23 02:09:58,Apa yang Membuat Upaya Ketiga BTS di Grammy Awards Menjadi Istimewa? \n#BTS #Grmmay #GrammyAwards...,0,0,,,Apa yang Membuat Upaya Ketiga BTS di Grammy Awards Menjadi Istimewa \n#BTS #Grmmay #GrammyAwards ...
1595245983826575360,ORIGINAL,1595245983826575360,song_universal,,http://pbs.twimg.com/profile_images/1595240308652052483/Cn8gh_yq_normal.jpg,2022-11-23 02:40:55,"You,\nYou are,\nMy universe and I just want to put you first.” -Coldplay, BTS",0,0,,,You\nYou are\nMy universe and I just want to put you first. Coldplay BTS
1595245985688879104,ORIGINAL,1595245985688879104,BTS_Noona_ARMY,,http://pbs.twimg.com/profile_images/1538013629714341888/0f0jeeV7_normal.jpg,2022-11-23 02:40:55,@HeatherBenson75 I still see it??? HYBE usually has rights to all things BTS...pretty sure that's...,0,0,,,HeatherBenson75 I still see it HYBE usually has rights to all things BTS...pretty sure thats in e...


### 카프카로(writeStream)

In [None]:
# 카프카로 다시 저장
queryName1 = "kafkaSink_origin"
queryName2 = "kafkaSink_quoted"

kafkaWriter_origin = (
    original_tweet
    .writeStream
    .queryName(queryName1)
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("topic", "original")
    .outputMode("append")
)
kafkaWriter_quoted = (
    quoted_tweet
    .writeStream
    .queryName(queryName2)
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("topic", "quoted")
    .outputMode("append")
)

checkpointLocation = f"{work_dir}/tmp/{queryName1}"
!rm -rf $checkpointLocation
checkpointLocation = f"{work_dir}/tmp/{queryName2}"
!rm -rf $checkpointLocation

kafkaWriter_origin
    .trigger(processingTime="5 second")
    .option("checkpointLocation", checkpointLocation)
    .start()
kafkaWriter_quoted
    .trigger(processingTime="5 second")
    .option("checkpointLocation", checkpointLocation)
    .start()

spark.streams.awaitAnyTermination()

### write to mongoDB

In [None]:
kafkaSelector
    .write
    .format("mongo")
    .mode("append")
    .save()