In [1]:
import pandas as pd
import numpy as np
import time
import json
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from kafka import KafkaProducer

In [2]:
# Configuration

# Kafka settings shared by both publishing cells: the destination topic, the
# broker address, and the protocol version handed to the kafka-python producer.
TOPIC = "posts"
KAFKA_HOSTS = 'localhost:9092'
KAFKA_VERSION = (0, 10, 2)

# Local Spark session; the Kafka SQL connector is requested through
# spark.jars.packages so the "kafka" data source format is available.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Word Count")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")
    .getOrCreate()
)

In [3]:
# Load Data

# Read every JSON file under the posts dump directory into one DataFrame.
df = spark.read.json("shared_data/bigdata20/posts_api.json/*.json")

# Project onto a two-column (key, value) layout:
# the post id becomes the key and the nested likes.count becomes the value.
kdf = df.select([
    F.col(source).alias(target)
    for source, target in (("id", "key"), ("likes.count", "value"))
])
kdf.columns

['key', 'value']

In [4]:
# Write DF

# Batch-write the (key, value) frame to Kafka through Spark's "kafka" format.
# Both columns are cast to strings first, as the Kafka sink takes string or
# binary key/value columns.
# Note: save() does not return a value, so `ds` is bound to None afterwards.
ds = (
    kdf
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .write
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_HOSTS,
        "topic": TOPIC,
    })
    .save()
)

In [None]:
# Send as a collection

# Publish every post row to Kafka as a JSON object via the kafka-python producer.
# The value_serializer JSON-encodes whatever is passed to send(), so each row is
# handed over as a plain dict. Passing str(record) instead would publish a JSON
# *string* wrapping the Python Row(...) repr, which consumers cannot parse back
# into fields.
producer = KafkaProducer(
    bootstrap_servers=KAFKA_HOSTS,
    api_version=KAFKA_VERSION,
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
)

try:
    # toLocalIterator() streams rows to the driver one partition at a time
    # rather than materialising the whole dataset in memory like collect().
    for record in df.toLocalIterator():
        # recursive=True also converts nested Row structs into dicts.
        producer.send(TOPIC, record.asDict(recursive=True))

    # Flush once after everything is queued; flushing per message blocks on a
    # broker round-trip each time and defeats the producer's batching.
    producer.flush()
finally:
    # Always release the producer's sockets and background sender thread.
    producer.close()

In [None]:
# Shut down the Spark session created in the configuration cell.
# Run this last: cells that use `spark` or `df` will not work afterwards.
spark.stop()