In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timezone
import re
from pyspark.sql.functions import col, udf, size
from pyspark.sql.types import StringType, ArrayType
from operator import add

# Build (or reuse) the Spark session for this analysis. The options below
# switch on dynamic allocation, the shuffle service and event logging
# (written to the HDFS directory given), and set spark.cores.max to 8.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.242:7077")
    .appName("reddit_analysis_t11")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.eventLog.enabled", True)
    .config("spark.eventLog.dir", "hdfs://192.168.2.242:9000/user/shared/spark-logs")
    .config("spark.cores.max", 8)
    .getOrCreate()
)

# Handle to the underlying SparkContext (old RDD API)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("INFO")

In [2]:
#cores = 10
# Load every JSON file under the strong-scaling directory into one DataFrame.
df = spark_session.read.json("hdfs://192.168.2.242:9000/user/reddit_data/big/strong_scaling/*")

sample_rows = df.take(2)
print("Input Sample:\n\n", sample_rows)

partition_count = df.rdd.getNumPartitions()
print("\nNumber of partitions:", partition_count)

print("\nReddit post schema:\n")
df.printSchema()

# Timestamps of the first and last rows as Spark returns them (row position,
# not the minimum/maximum of created_utc). created_utc is passed to
# datetime.fromtimestamp, i.e. treated as epoch seconds, and rendered in UTC.
first_row = df.first()
first_timestamp = datetime.fromtimestamp(first_row["created_utc"], tz=timezone.utc)
last_row = df.tail(1)[0]
last_timestamp = datetime.fromtimestamp(last_row["created_utc"], tz=timezone.utc)

print("First Timestamp of this batch: ", first_timestamp)
print("Last Timestamp of this batch", last_timestamp)

# Time window (epoch seconds) used to filter posts on created_utc.
# Currently wide enough to encompass the entire data; narrow the dates to
# filter on smaller time frames.
# The bounds are timezone-aware UTC datetimes: .timestamp() on a naive
# datetime interprets it in the machine's local timezone, so the window
# edges would land on local midnight (and vary from host to host) instead
# of UTC midnight.
start_ts = int(datetime(2000, 1, 1, 0, 0, tzinfo=timezone.utc).timestamp())
end_ts = int(datetime(2030, 1, 31, 0, 0, tzinfo=timezone.utc).timestamp())

# Keep just the two columns the rest of the analysis uses
df = df.select('created_utc', 'body')

In [3]:
# Keep posts whose created_utc lies in [start_ts, end_ts] (both inclusive)
# and whose body is neither the '[deleted]' nor the '[removed]' placeholder.
df = (
    df
    .filter(df.created_utc.between(start_ts, end_ts))
    .filter(~df.body.isin('[deleted]', '[removed]'))
)
# Mark the filtered frame for caching so later computations can reuse it
df.cache()

DataFrame[created_utc: bigint, body: string]

In [4]:
def normalize(string):
    """
    UDF that lower-cases the input string and removes every character
    other than ASCII letters and the space character.

    Characters are deleted, not replaced: digits, punctuation, tabs and
    newlines vanish and the text on either side of them is joined together.

    string -- the string to normalize; may be None (a null column value
              reaches a Python UDF as None)

    Returns the normalized string, or None when the input is None.
    """
    if string is None:
        # .lower() on None would raise AttributeError inside the UDF;
        # pass the null through instead.
        return None
    return re.sub(r'[^A-Za-z ]', '', string.lower())

# Wrap normalize as a Spark UDF that produces a string column.
udf_normalize = udf(f=normalize, returnType=StringType())

# Overwrite the 'body' column with its normalized form.
df = df.withColumn('body', udf_normalize(df['body']))

In [None]:
# the names to filter on
pres_cand = ["donald trump", "hillary clinton"]

def return_candidate(string):
    """
    UDF that returns the string representation of a list
    of the candidate names from pres_cand that are present in
    the input string, e.g. "['donald trump']". Names appear in the
    order they are listed in pres_cand.

    string -- the string to look for candidates in; may be None

    Returns the empty string when no candidate is present or when the
    input is None.
    """
    if string is None:
        # `cand in None` would raise TypeError; a missing body simply
        # mentions no candidate.
        return ''
    present = [cand for cand in pres_cand if cand in string]
    return str(present) if present else ''

# Wrap return_candidate as a Spark UDF that produces a string column.
udf_rc = udf(return_candidate, StringType())

# Replace each body with the string form of the candidates it mentions,
# then drop the rows where no candidate was found (empty string).
df = df.withColumn('body', udf_rc(col('body')))
df = df.filter(df.body != '')

# Count rows per distinct candidate combination.
df = df.groupBy('body').count()

# Collapse to one partition before writing. mode='overwrite' replaces the
# output of a previous run; with the default save mode the write raises
# when 'save.csv' already exists, so this cell could not be re-run.
df.repartition(1).write.csv('save.csv', mode='overwrite')
