In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import datetime
import re
import json
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from kafka import KafkaProducer, KafkaConsumer, TopicPartition

In [2]:
# Configuration: Kafka connection settings used by the cells below

# Broker address (host:port) passed as bootstrap_servers
KAFKA_HOSTS = "localhost:9092"
# Value passed to KafkaProducer as api_version
KAFKA_VERSION = 0, 10, 2

In [None]:
# Spark: get (or create) a local single-threaded session named
# "Followers Stream Producer", requesting the Kafka SQL connector package
# through spark.jars.packages.

spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Followers Stream Producer")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1",
    )
    .getOrCreate()
)

In [3]:
# Test Message
# Smoke test: publish one JSON-encoded string to the "test" topic.

producer = KafkaProducer(
    bootstrap_servers=KAFKA_HOSTS,
    api_version=KAFKA_VERSION,
    # Values are serialized as UTF-8 encoded JSON
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
)
try:
    producer.send("test", "Hello World!")
    # send() only queues the record; flush() blocks until queued records
    # have been sent.
    producer.flush()
finally:
    # Close the producer so its connections and background I/O thread do not
    # linger for the rest of the kernel session.
    producer.close()

In [None]:
# Load Data: read both JSON exports into Spark DataFrames
followers_df, posts_df = (
    spark.read.json(source_path)
    for source_path in (
        "shared_data/clickhouse_data/followers.json",
        "shared_data/clickhouse_data/posts.json",
    )
)

In [None]:
# Print the schema of the DataFrame read from followers.json
followers_df.printSchema()

In [None]:
# Print the schema of the DataFrame read from posts.json
posts_df.printSchema()

In [None]:
# Follower records: keep only the columns that are sent to Kafka
data_df1 = followers_df.select(["ctime", "follower_id", "profile_id"])

# Post records: keep only the columns that are sent to Kafka
data_df2 = posts_df.select(
    [
        "date",
        "comments_count",
        "from_id",
        "likes_count",
        "owner_id",
        "post_id",
        "post_type",
        "text",
        "views_count",
        "reposts_count",
    ]
)

In [None]:
# Send message

def _newProducer():
    """Build a KafkaProducer that serializes message values as UTF-8 JSON."""
    return KafkaProducer(
        bootstrap_servers=KAFKA_HOSTS,
        api_version=KAFKA_VERSION,
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    )


def sendMessage(row, topic=None):
    """Publish a single row to Kafka as a JSON object.

    Args:
        row: object exposing ``asDict()`` (a Spark Row in this notebook).
        topic: destination topic. When omitted, the module-level ``topics``
            variable is used, which keeps the original one-argument call
            form working.
    """
    producer = _newProducer()
    try:
        producer.send(topics if topic is None else topic, row.asDict())
        producer.flush()
    finally:
        # Always release the producer's connections, even if send() raises.
        producer.close()


def _sendPartition(rows, topic):
    """Publish every row of one partition to ``topic`` with a single producer.

    Creating one producer per partition (instead of one per row) avoids
    reconnecting to the broker and flushing for every record.

    Args:
        rows: iterable of objects exposing ``asDict()``.
        topic: destination Kafka topic name.
    """
    producer = _newProducer()
    try:
        for row in rows:
            producer.send(topic, row.asDict())
        # One flush for the whole partition: block until queued records are sent.
        producer.flush()
    finally:
        producer.close()


# The topic is bound as a lambda default so each job carries its own value
# rather than depending on the mutable global at serialization time.
topics = "followers"
data_df1.rdd.foreachPartition(lambda rows, topic=topics: _sendPartition(rows, topic))

topics = "posts"
data_df2.rdd.foreachPartition(lambda rows, topic=topics: _sendPartition(rows, topic))

In [4]:
# Consumer test 1
# Sanity check: print the first record available on partition 0 of 'posts'.

consumer = KafkaConsumer(
    'posts',
    bootstrap_servers=KAFKA_HOSTS,
    group_id=None,
    auto_offset_reset='earliest',
    # End the iteration after 10 s without a record instead of blocking
    # forever when the topic is empty or the broker is unreachable.
    consumer_timeout_ms=10000,
)
tp = TopicPartition('posts', 0)
print("Kafka Test Message")
try:
    for message in consumer:
        print(message)
        # The previous test `message.offset == consumer.position(tp) - 1` was
        # true for every partition-0 record (position() is the offset of the
        # next record to fetch), so it stopped at the first such record.
        # That intent is now stated directly.
        if message.partition == tp.partition:
            break
    else:
        print("No message received from partition 0 before the timeout")
finally:
    # Release the consumer's broker connections.
    consumer.close()


Kafka Test Message
ConsumerRecord(topic='posts', partition=0, offset=0, timestamp=1622530723056, timestamp_type=0, key=None, value=b'{"date":"2012-03-13 00:00:00","comments_count":-1,"from_id":"78062","likes_count":2,"owner_id":"-36559458","post_id":"5","post_type":"post","text":"","views_count":0,"reposts_count":0}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=184, serialized_header_size=-1)


In [5]:
# Consumer test 2
# Sanity check: print the first record available on partition 0 of 'followers'.

consumer = KafkaConsumer(
    'followers',
    bootstrap_servers=KAFKA_HOSTS,
    group_id=None,
    auto_offset_reset='earliest',
    # End the iteration after 10 s without a record instead of blocking
    # forever when the topic is empty or the broker is unreachable.
    consumer_timeout_ms=10000,
)
tp = TopicPartition('followers', 0)
print("Kafka Test Message")
try:
    for message in consumer:
        print(message)
        # The previous test `message.offset == consumer.position(tp) - 1` was
        # true for every partition-0 record (position() is the offset of the
        # next record to fetch), so it stopped at the first such record.
        # That intent is now stated directly.
        if message.partition == tp.partition:
            break
    else:
        print("No message received from partition 0 before the timeout")
finally:
    # Release the consumer's broker connections.
    consumer.close()

Kafka Test Message
ConsumerRecord(topic='followers', partition=0, offset=0, timestamp=1622530880617, timestamp_type=0, key=None, value=b'{"profile_id":"328582","follower_id":"4652928","ctime":"2020-03-07 10:09:50"}', headers=[], checksum=None, serialized_key_size=-1, serialized_value_size=77, serialized_header_size=-1)


In [None]:
# Stop the Spark session created earlier in the notebook
spark.stop()