In [1]:
# Import Libraries

import pyspark
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import json
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *
from apscheduler.schedulers.blocking import BlockingScheduler

In [2]:
# Create the local Spark session used by the rest of the notebook

# Build (or reuse) the notebook's Spark session: local master using all
# available cores, with the Kafka SQL connector package added to the session.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Profile Stream Consumer Log")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")
    .getOrCreate()
)

In [3]:
# Schema of the JSON payload carried by each record: five integer fields.
schema = StructType([
    StructField(field_name, IntegerType())
    for field_name in ("post_id", "sex", "age", "totalwords", "totalfiltwords")
])


def timed_job():
    """Print a timestamped summary of all records currently in ``output/*.csv``.

    Each CSV row is expected to carry one JSON document in its first column
    (``_c0``); it is parsed with the module-level ``schema``. The report
    contains the total record count, counts by ``sex`` (1 and 2), counts by
    ``age`` bracket, and the sums of ``totalwords`` and ``totalfiltwords``.

    All statistics are computed in a single aggregation so that they describe
    one consistent snapshot of the input files and need only one scan.

    Prints "No records received!!!" when no input files exist yet or when the
    files contain no rows. Returns None.
    """
    # Explicit namespace instead of relying on the notebook's star-import of
    # pyspark.sql.functions (which shadows builtins such as ``sum``).
    from pyspark.sql import functions as F
    from pyspark.sql.utils import AnalysisException

    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

    try:
        raw = spark.read.csv('output/*.csv')
    except AnalysisException:
        # Spark raises (rather than returning an empty frame) when the glob
        # matches no files, e.g. before the first output file is written.
        print("No records received!!!")
        return

    # Files that exist but are all empty yield a frame without any columns.
    if "_c0" not in raw.columns:
        print("No records received!!!")
        return

    parsed = raw.select(F.from_json(F.col("_c0"), schema).alias("json"))
    sex = F.col("json.sex")
    age = F.col("json.age")

    def count_if(condition):
        # count() skips nulls, so only rows where the condition is true count.
        return F.count(F.when(condition, 1))

    stats = parsed.agg(
        F.count(F.lit(1)).alias("records"),
        count_if(sex == 1).alias("males"),
        count_if(sex == 2).alias("females"),
        count_if(age < 18).alias("age_lt_18"),
        count_if((age >= 18) & (age < 27)).alias("age_18_27"),
        count_if((age >= 27) & (age < 40)).alias("age_27_40"),
        count_if((age >= 40) & (age < 60)).alias("age_40_60"),
        count_if(age >= 60).alias("age_ge_60"),
        F.sum("json.totalwords").alias("words"),
        F.sum("json.totalfiltwords").alias("filtwords"),
    ).first()

    if stats["records"] == 0:
        print("No records received!!!")
        return

    print("Total Number of Records = ", stats["records"])
    print("Total Posts by Males = ", stats["males"])
    print("Total Posts by Females = ", stats["females"])
    print("Total Posts by age < 18 = ", stats["age_lt_18"])
    print("Total Posts by age >= 18 & < 27 = ", stats["age_18_27"])
    print("Total Posts by age >= 27 & < 40 = ", stats["age_27_40"])
    print("Total Posts by age >= 40 & < 60 = ", stats["age_40_60"])
    print("Total Posts by age >= 60 = ", stats["age_ge_60"])
    print("Total Number of words = ", stats["words"])
    print("Total Number of words (except stopwords) = ", stats["filtwords"])

In [None]:
# Run the report once immediately, then every 10 minutes.
# BlockingScheduler.start() does not return until it is interrupted, so this
# cell keeps running; interrupt the kernel to stop the periodic reports.
scheduler = BlockingScheduler()
timed_job()
scheduler.add_job(timed_job, 'interval', minutes=10)
try:
    scheduler.start()
except (KeyboardInterrupt, SystemExit):
    # A kernel interrupt is the expected way to end this cell; swallow it so
    # the cell finishes cleanly instead of showing a traceback.
    pass
finally:
    # Release the scheduler's executor so no further runs fire in the
    # background once the cell has ended.
    if scheduler.running:
        scheduler.shutdown(wait=False)

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()