In [1]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import json
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType
import pyspark.sql.functions as F
from kafka import KafkaConsumer

In [2]:
# Configuration

KAFKA_HOSTS = 'localhost:9092'
KAFKA_VERSION = (0, 10)
TOPIC = "profile1"

In [3]:
# Spark

spark = SparkSession.builder.master("local[*]") \
    .appName("Profile Stream Consumer") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1") \
    .getOrCreate()

In [4]:
# Read DF

df = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", KAFKA_HOSTS) \
  .option("subscribe", TOPIC) \
  .load()
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

DataFrame[key: string, value: string]

In [5]:
df.count()

694639

In [6]:
sdf = df.select('topic', col('value').cast("string"))

find_post = F.udf(lambda y: json.loads(y)['post_id'])
find_age = F.udf(lambda y: json.loads(y)['age'])
find_sex = F.udf(lambda y: json.loads(y)['sex'])

sdf = sdf.withColumn('post_id', find_post(sdf.value))\
    .withColumn('sex', find_sex(sdf.value))\
    .withColumn('age', find_age(sdf.value))\
    .na.drop()\
    .select(col('topic'), col('post_id'), col('sex'), col('age').cast('int'))

In [7]:
print("Posts by a male: ", sdf.filter(sdf.sex == "1").count())
print("Posts by a female: ", sdf.filter(sdf.sex == "2").count())
print("Posts (owner age <18)",sdf.filter("age<18").count())
print("Posts (owner age 18-27)",sdf.filter(col("age").between(18,27)).count())
print("Posts (owner age 27-40)",sdf.filter(col("age").between(27,40)).count())
print("Posts (owner age 40-60)",sdf.filter(col("age").between(40,60)).count())
print("Posts (owner age >60)",sdf.filter("age>60").count())

Posts by a male:  354019
Posts by a female:  340519
Posts (owner age <18) 3736
Posts (owner age 18-27) 141804
Posts (owner age 27-40) 192625
Posts (owner age 40-60) 238317
Posts (owner age >60) 157108


In [8]:
spark.stop()