In [None]:
%pylab inline
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql import Row
from time import time
from pandas import DataFrame

# Introduction

Spark will usually split your data into a set of partitions automatically, but there are cases where you want to do this manually to improve performance. You can use the repartition() method to define the number of partitions in your RDD. In this tutorial we explore the effect of the number of partitions on performance.

We are going to be using the Reddit comments dataset for this tutorial. More information on this dataset can be found [here](https://sites.google.com/a/insightdatascience.com/spark-lab/s3-data/reddit-comments).

# Load in the data

In [None]:
# Explicit schema for the Reddit comments data, written as (column name, Spark type)
# pairs in column order. Every column is declared nullable. The schema is only
# needed by the JSON reader kept below for reference; the Parquet reader is
# called without it.
column_types = [
    ("archived", BooleanType()),
    ("author", StringType()),
    ("author_flair_css_class", StringType()),
    ("body", StringType()),
    ("controversiality", LongType()),
    ("created_utc", StringType()),
    ("day", LongType()),
    ("distinguished", StringType()),
    ("downs", LongType()),
    ("edited", StringType()),
    ("gilded", LongType()),
    ("id", StringType()),
    ("link_id", StringType()),
    ("month", LongType()),
    ("name", StringType()),
    ("parent_id", StringType()),
    ("retrieved_on", LongType()),
    ("score", LongType()),
    ("score_hidden", BooleanType()),
    ("subreddit", StringType()),
    ("subreddit_id", StringType()),
    ("ups", LongType()),
    ("year", LongType()),
]
fields = [StructField(column, spark_type, True) for column, spark_type in column_types]
#rawDF = sqlContext.read.json("s3a://reddit-comments/2008", StructType(fields))
comments_2008_path = "s3a://reddit-comments-parquet/year=2008"
rawDF = sqlContext.read.parquet(comments_2008_path)
# Mark the raw data for on-disk persistence so later cells do not re-read it from S3.
rawDF.persist(StorageLevel.DISK_ONLY)
# count() is an action: it triggers the read (and the persist) and displays the row count.
rawDF.count()

# Parallelism - example 1

To analyze the performance of a simple Spark job with different numbers of partitions, let's split our data into varying numbers of partitions. We'll persist the partitions to save recomputing down the road.

In [None]:
# Build copies of the raw DataFrame with 2, 4, 8, 16 and 32 partitions.
# Each copy is marked for on-disk persistence (StorageLevel.DISK_ONLY) so the same
# job can be run on every one of them to compare the effect of the partition count.
def _repartition_on_disk(num_partitions):
    """Return rawDF split into `num_partitions` partitions, marked to persist on disk."""
    return rawDF.repartition(num_partitions).persist(StorageLevel.DISK_ONLY)

repart_2_df = _repartition_on_disk(2)
repart_4_df = _repartition_on_disk(4)
repart_8_df = _repartition_on_disk(8)
repart_16_df = _repartition_on_disk(16)
repart_32_df = _repartition_on_disk(32)

Now let's perform a simple job (count the number of elements in each DataFrame) and compare how long the operation takes for various numbers of partitions.

In [None]:
# Pair every repartitioned DataFrame with its partition count, then count the
# rows of each one. count() is the first action on these DataFrames, so the
# measured time covers the repartition itself plus persisting the result
# (to disk, per StorageLevel.DISK_ONLY) as well as the count.
df_arr = [(repart_2_df, 2),
          (repart_4_df, 4),
          (repart_8_df, 8),
          (repart_16_df, 16),
          (repart_32_df, 32)]
for part_df, num_partitions in df_arr:
    start_time = time()
    part_df.count()
    end_time = time()
    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print("{} partitions took {} seconds to repartition and count".format(num_partitions, end_time - start_time))

### What does the runtime indicate about the scheduling and distribution of tasks to worker nodes?

### Look at the Spark UI (port 4040) to see how the duration of the count action varied with an increased number of partitions

# Parallelism - example 2

In [None]:
def calc_median(row):
    """Compute the median time between consecutive comments of one subreddit.

    Args:
        row: A ``(subreddit, timestamps)`` pair. ``timestamps`` is a list of
            comment creation times in seconds, in any order (a heap as built
            by ``merger``/``merge_combiner`` works too). The list is not
            modified.

    Returns:
        A ``Row`` with ``subreddit``, ``median_time_days`` (median gap between
        consecutive comments, in days; ``0.0`` when there are fewer than two
        comments) and ``num_comments`` (total number of comments).
    """
    # Imported locally so the function does not depend on names injected by %pylab.
    from numpy import median

    subreddit, timestamps = row[0], row[1]
    num_comments = len(timestamps)

    if num_comments < 2:
        # Return a float (not the int 0) so the column has a single type in
        # every Row when the RDD is turned into a DataFrame.
        return Row(subreddit=subreddit, median_time_days=0.0, num_comments=num_comments)

    # Sort a copy instead of draining a heap: correct for any input order and
    # leaves the caller's list untouched.
    ordered = sorted(timestamps)
    deltas = [later - earlier for earlier, later in zip(ordered, ordered[1:])]
    # seconds -> minutes -> hours -> days
    return Row(subreddit=subreddit, median_time_days=float(median(deltas)) / 60 / 60 / 24, num_comments=num_comments)
        

### The following code is very inefficient. Can you figure out why (before attempting to run it)?

In [None]:
def combiner(value):
    """Create the initial combiner for a key from the first value seen for it.

    The value is used unchanged (the very same object is returned), so the
    first value of a key doubles as the start of its accumulated heap.
    """
    initial_heap = value
    return initial_heap

def merger(x, value):
    """Fold one more value for a key into that key's accumulated heap.

    ``x`` is the heap (a list) built so far and ``value`` is a sequence whose
    first element is the item to add; only ``value[0]`` is pushed. The heap is
    updated in place and returned.
    """
    import heapq

    heapq.heappush(x, value[0])
    return x

def merge_combiner(x, y):
    """Merge two accumulated heaps for the same key.

    Every element of ``y`` is popped (smallest first) and pushed onto ``x``.
    ``x`` is updated in place and returned; ``y`` is left empty.
    """
    import heapq

    # One pop/push per element initially in y drains it completely.
    for _ in range(len(y)):
        heapq.heappush(x, heapq.heappop(y))
    return x

In [None]:
# Run the same job on every repartitioned DataFrame: compute, per subreddit, the
# median time between comments, and report how long the job took.
# NOTE: this version is deliberately left unoptimized -- it is the subject of the
# exercise stated above this cell.
for curr_df, num_partitions in df_arr:
    start_time = time()

    # Key every comment by subreddit; the value is a one-element list holding
    # the creation time, which becomes the initial heap in combineByKey.
    subreddit_comment_times = curr_df.rdd.map(lambda r: (r.subreddit, [int(r.created_utc)]))
    median_time_between_posts_df = subreddit_comment_times.combineByKey(combiner, merger, merge_combiner)\
                                                          .map(calc_median)\
                                                          .toDF()

    # count() is the action that actually runs the job
    total_cnt = median_time_between_posts_df.count()
    end_time = time()
    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print("{} partitions took {} seconds to process, {} final records".format(num_partitions, end_time - start_time, total_cnt))

In [None]:
# Revised code: select only the two columns the job needs (column pruning)
# before converting the DataFrame to an RDD, so far less data flows into the
# map transformation.
for curr_df, num_partitions in df_arr:
    start_time = time()

    # Key every comment by subreddit; the value is a one-element list holding
    # the creation time, which becomes the initial heap in combineByKey.
    subreddit_comment_times = curr_df.select(['subreddit', 'created_utc']).rdd.map(lambda r: (r.subreddit, [int(r.created_utc)]))
    median_time_between_posts_df = subreddit_comment_times.combineByKey(combiner, merger, merge_combiner)\
                                                          .map(calc_median)\
                                                          .toDF()

    # count() is the action that actually runs the job
    total_cnt = median_time_between_posts_df.count()
    end_time = time()
    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print("{} partitions took {} seconds to process, {} final records".format(num_partitions, end_time - start_time, total_cnt))

Spark DataFrames support optimizations such as predicate pushdown and column pruning, which let users filter rows and select only the columns they need before running other map transformations. This reduces the total amount of data going into the map transformation, which is particularly useful for DataFrames with many columns. Here we simply need to select the 2 columns of interest, "subreddit" and "created_utc", before mapping the DataFrame back to an RDD.

In [None]:
# Scatter plot with one point per subreddit: number of comments (x) against the
# median time between comments in days (y). Subreddits with more than 40 comments
# tend to also have more frequent comment activity throughout the year.
median_time_between_posts_pd = median_time_between_posts_df.toPandas()
scatter_options = dict(
    kind='scatter',
    x='num_comments',
    y='median_time_days',
    xlim=(-5, 100),
    ylim=(-5, 60),
)
median_time_between_posts_pd.plot(**scatter_options)

# Next Steps

## Task 1: What is the optimal number of partitions for calculating the authors who have written the longest comments per subreddit?

In [None]:
# UDF returning the length of a comment body. The body column is nullable and a
# NULL reaches Python as None, so return None for it (SQL MAX ignores NULLs)
# instead of letting len(None) fail the whole job.
sqlContext.udf.register('length_text', lambda x: len(x) if x is not None else None, LongType())

partition_list = [16,50,100,500,1000,2500,5000]
elapsed_time = []

# For each partition count, repartition the raw data and time the same job:
# find, per subreddit, the author(s) of the longest comment.
for partitions in partition_list:
    repart_df = rawDF.repartition(partitions).persist(StorageLevel.DISK_ONLY)
    repart_df.registerTempTable("repart")
    # Trigger the repartition + persist before the timer starts, so only the
    # queries below are measured.
    repart_df.count()
    
    start_time = time()

    # Step 1: longest comment of every (subreddit, author) pair
    max_comment_len_per_subreddit_author = sqlContext.sql("""
        SELECT subreddit, 
            author,
            MAX(length_text(body)) as longest_comment
        FROM repart
        GROUP BY subreddit, author
    """)
    # Step 2 reads this intermediate result twice (join + subquery), so persist it
    max_comment_len_per_subreddit_author.persist(StorageLevel.MEMORY_AND_DISK_SER)
    max_comment_len_per_subreddit_author.registerTempTable("max_comment")
    # Step 2: keep, per subreddit, the author(s) whose longest comment equals
    # the subreddit-wide maximum
    max_comment_len_per_subreddit = sqlContext.sql("""
        SELECT max_comment.subreddit, 
               max_comment.author, 
               all_time_longest
        FROM max_comment, (
            SELECT subreddit, 
                   MAX(longest_comment) as all_time_longest
            FROM max_comment
            GROUP BY subreddit
        ) as all_max_comment
        WHERE max_comment.longest_comment = all_max_comment.all_time_longest
          AND max_comment.subreddit = all_max_comment.subreddit
        ORDER BY all_time_longest DESC
    """)
    # count() is the action that actually runs both queries
    num_records = max_comment_len_per_subreddit.count()
    elapsed_time.append(time()-start_time)
    
    # clean up before next iteration
    repart_df.unpersist()
    max_comment_len_per_subreddit_author.unpersist()
    

In [None]:
# Collect the timings into a pandas DataFrame and plot elapsed time against the
# number of partitions (log-scaled x axis, since the counts span 16 to 5000).
benchmark = DataFrame({"partitions": partition_list,
                       "elapsed_time": elapsed_time})
# Label the axes returned by plot() rather than relying on pyplot's "current axes"
ax = benchmark.plot(x='partitions', y='elapsed_time', logx=True)
ax.set_ylabel('time (s)')
ax.set_xlabel('number of partitions')
# print(...) with a single argument behaves the same on Python 2 and Python 3
print(benchmark)