In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import datetime
import json
import pyspark
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as F
from kafka import KafkaProducer

In [None]:
# Configuration

# Kafka bootstrap address and destination topic used by the writer cell.
KAFKA_HOSTS = "localhost:9092"
TOPIC = "profile1"

# Kafka protocol version tuple (not referenced by the other visible cells).
KAFKA_VERSION = (0, 10, 2)

# Glob for the JSON files holding the follower profile records.
dataDirectory = "shared_data/bigdata20/followers_info.json/*.json"

In [None]:
# Spark

# Build (or reuse) the session for this notebook. The master is "local[*]" and
# the spark-sql-kafka package is requested so the "kafka" format used by the
# writer cell is available.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Profile Stream Producer")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1",
    )
    .getOrCreate()
)

In [None]:
# Load Data

# Follower profiles: read through the `dataDirectory` glob from the
# configuration cell so the path is maintained in a single place instead of
# being repeated here as a literal.
profile_df = spark.read.json(dataDirectory)

# Posts of those followers; joined to the profiles on owner_id further down.
posts_df = spark.read.json("shared_data/bigdata20/followers_posts_api_final.json")

In [None]:
# Add age and sex of the owner_id to the post_info
#
# The age is derived from the last four characters of `bdate` when the value is
# long enough to carry a year; otherwise a random age in [18, 80) is assigned.
# Built-in column functions are used instead of Python UDFs so that:
#   * a year that cannot be parsed as an integer becomes NULL and falls back to
#     the random age instead of raising TypeError inside a UDF, and
#   * the random draw goes through F.rand(), which Spark treats as
#     non-deterministic (a plain UDF is assumed to be deterministic).
#
# NOTE: this cell rebinds `profile_df` and `posts_df`; re-running it requires
# re-running the load cell first.

RANDOM_AGE_MIN = 18            # inclusive lower bound for the fallback age
RANDOM_AGE_MAX_EXCLUSIVE = 80  # exclusive upper bound for the fallback age

# Evaluated once on the driver so every row uses the same reference year.
current_year = datetime.datetime.now().year

bdate = F.col("bdate")

# Values shorter than 7 characters are treated as having no year component.
# NULL is produced for missing, short or non-numeric values.
birth_year = F.when(
    bdate.isNotNull() & (F.length(bdate) >= 7),
    F.substring(bdate, -4, 4).cast(IntegerType()),
)

# Uniform integer in [RANDOM_AGE_MIN, RANDOM_AGE_MAX_EXCLUSIVE).
random_age = (
    F.floor(F.rand() * (RANDOM_AGE_MAX_EXCLUSIVE - RANDOM_AGE_MIN)) + RANDOM_AGE_MIN
).cast(IntegerType())

# A birth year of 0 is kept as an "unknown" marker, like a missing year.
age = (
    F.when(birth_year.isNull() | (birth_year == 0), random_age)
    .otherwise(F.lit(current_year) - birth_year)
    .cast(IntegerType())
)

profile_df = profile_df.select(
    F.col("id").alias("owner_id"),
    F.col("sex"),
    age.alias("age"),
)

# Left join keeps every post; posts without a matching profile get NULL age/sex.
posts_df = posts_df.join(profile_df, "owner_id", "left")

In [None]:
# Keep only the fields that are published and drop rows where any of them is null.
data_df = (
    posts_df
    .select(
        F.col("id").alias("post_id"),
        F.col("owner_id"),
        F.col("age"),
        F.col("sex"),
    )
    .na.drop()
)

# Serialise every row to a JSON string in a single "value" column and write the
# result to the configured Kafka topic. `ds` receives whatever save() returns;
# the write itself is the purpose of this statement.
ds = (
    data_df
    .select(
        F.to_json(
            F.struct(*(F.col(name).alias(name) for name in data_df.columns))
        )
        .cast("string")
        .alias("value")
    )
    .write
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOSTS)
    .option("topic", TOPIC)
    .save()
)

In [None]:
# Stop the Spark session created earlier in this notebook.
spark.stop()