In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import datetime
import re
import json
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from kafka import KafkaProducer

In [2]:
# Configuration

# Bootstrap server(s) every KafkaProducer in this notebook connects to.
KAFKA_HOSTS = "localhost:9092"
# Passed to kafka-python as `api_version` when a producer is created.
KAFKA_VERSION = (0, 10, 2)
# Topic that the per-post records are published to.
topics = "posts"

In [3]:
# Spark

# Local session with a single worker thread. The Kafka SQL connector is
# requested through `spark.jars.packages` so it is resolved at start-up.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Profile Stream Producer")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")
    .getOrCreate()
)

In [4]:
# Test Message

# Smoke test: publish one string to the "test" topic and block until the
# producer has flushed it, so connection problems surface immediately.
producer = KafkaProducer(
    bootstrap_servers=KAFKA_HOSTS,
    api_version=KAFKA_VERSION,
    # Every value is sent as UTF-8 encoded JSON.
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
)
producer.send("test", "Hello World!")
producer.flush()

In [5]:
# Load Data

# Follower profiles: every *.json file inside the followers_info.json directory.
profile_df = spark.read.format("json").load("shared_data/bigdata20/followers_info.json/*.json")
# Posts written by those followers.
posts_df = spark.read.format("json").load("shared_data/bigdata20/followers_posts_api_final.json")

In [6]:
# Add age and sex of the post owner to the posts dataset

def _birth_year(bdate):
    """Return the last four characters of ``bdate`` (the year part).

    Falls back to the string '0' for an empty/None value so the result
    always matches the UDF's declared StringType.
    """
    return bdate[-4:] if bdate else '0'


def _age_from_year(year):
    """Convert a birth year into an age in years.

    A year of 0 (the "unknown" sentinel) or None (the string-to-int cast
    failed) yields a random age in [15, 80) instead of raising a TypeError.
    """
    if not year:
        # int() keeps the value a plain Python int, which is what an
        # IntegerType UDF expects.
        return int(np.random.randint(15, 80))
    return datetime.datetime.now().year - year


find_year = F.udf(_birth_year, StringType())
# The random fallback makes this UDF non-deterministic; telling Spark so
# prevents the optimizer from re-evaluating it and producing different ages
# for the same row.
find_age = F.udf(_age_from_year, IntegerType()).asNondeterministic()


profile_df = profile_df.withColumn(
    "year",
    # Missing or short birthdays are mapped to the '0' sentinel.
    # NOTE(review): presumably bdate looks like "D.M.YYYY", so anything under
    # 7 characters has no year - confirm against the data.
    F.when(F.col('bdate').isNull() | (F.length(F.col('bdate')) < 7), '0')
     .otherwise(find_year(F.col('bdate'))),
)
profile_df = (
    profile_df
    .withColumn('age', find_age(profile_df["year"].cast(IntegerType())))
    .select(F.col('id').alias('owner_id'), F.col('sex'), F.col('age'))
)

# Attach the owner's sex/age to each post, then keep only fully populated rows.
posts_df = posts_df.join(profile_df, "owner_id", "left")
data_df = (
    posts_df
    .withColumnRenamed('id', 'post_id')
    .na.drop(subset=['age', 'sex', 'post_id', 'owner_id'])
)

In [7]:
# Remove Stopwords

# Bring the stopword file onto the driver; each line of the file becomes one
# entry of `wordlist`, and `x` is the same list as a comma-separated string.
stopwords = spark.read.text('data/stopwords.txt')
val = stopwords.select('value').collect()
wordlist = [row['value'] for row in val]
x = ",".join(wordlist)

# Tokenise every post and subtract the stopword array from its tokens.
# NOTE(review): newlines are removed without inserting a space, so the words
# on either side of a line break are glued together - confirm this is intended.
# NOTE(review): per the Spark docs array_except returns distinct elements, so
# repeated words in a post are collapsed in `filteredText` - confirm.
text_df = (
    data_df
    .select('post_id', 'text')
    .withColumn("text", F.regexp_replace(F.col("text"), "\n", ""))
    .withColumn("unfilteredText", F.split("text", " "))
    # The stopword list travels as a literal string and is split back into an array.
    .withColumn("filter_col", F.split(F.lit(x), ","))
    .withColumn("filteredText", F.array_except("unfilteredText", "filter_col"))
    # Drop rows for which no filtered token array could be computed.
    .na.drop(subset=['filteredText'])
)

In [8]:
# Filter data

# Reduce each post to the fields that are streamed: owner sex/age plus the
# token counts before and after stopword removal.
# NOTE(review): the left join keeps posts that have no row in text_df; their
# array columns are null - check what F.size yields for those rows.
data_df = (
    data_df
    .join(
        text_df.select('post_id', 'filteredText', 'unfilteredText'),
        on='post_id',
        how='left',
    )
    .withColumn('totalwords', F.size('unfilteredText'))
    .withColumn('totalfiltwords', F.size('filteredText'))
    .select("post_id", "sex", "age", "totalwords", "totalfiltwords")
)

In [None]:
# Send message

def _makeProducer():
    """Create a KafkaProducer that serialises values as UTF-8 encoded JSON."""
    return KafkaProducer(
        bootstrap_servers=KAFKA_HOSTS,
        api_version=KAFKA_VERSION,
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    )


def sendMessage(row, producer=None):
    """Publish one row, converted with ``row.asDict()``, to the ``topics`` topic.

    Parameters:
        row: a Spark Row.
        producer: optional KafkaProducer to reuse. When omitted, a temporary
            producer is created, flushed and closed for this single message
            (the original one-producer-per-call behaviour, minus the leak).
    """
    ownsProducer = producer is None
    if ownsProducer:
        producer = _makeProducer()
    try:
        producer.send(topics, row.asDict())
        if ownsProducer:
            producer.flush()
    finally:
        # Only close what this call created; a shared producer belongs to the caller.
        if ownsProducer:
            producer.close()


def sendPartition(rows):
    """Publish every row of one partition through a single shared producer.

    Opening a producer per row is slow and leaves connections open, so one
    producer is created per partition, flushed once all rows are queued, and
    always closed afterwards.
    """
    producer = _makeProducer()
    try:
        for row in rows:
            sendMessage(row, producer)
        producer.flush()
    finally:
        producer.close()


data_df.rdd.foreachPartition(sendPartition)

In [None]:
# Stop the Spark session created earlier in this notebook.
spark.stop()