In [36]:

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, month
import os
from pyspark.sql.functions import broadcast
import time


# Reuse the active Spark session, or start one named 'Optimize I'.
spark = SparkSession.builder.appName('Optimize I').getOrCreate()

# The project root is taken to be the working directory with its last three
# path components removed.
base_path = os.getcwd()
project_path = '/'.join(base_path.split('/')[:-3])

answers_input_path = os.path.join(
    project_path,
    'Springboard/Spark_optimizationproject/Optimization/data/answers',
)
questions_input_path = os.path.join(
    project_path,
    'Springboard/Spark_optimizationproject/Optimization/data/questions',
)

# No format is specified, so the reader falls back to Spark's configured
# default data source.
answersDF = spark.read.option('path', answers_input_path).load()
questionsDF = spark.read.option('path', questions_input_path).load()

# Answers aggregation
#
# Here we: get the number of answers per question per month.

# Wall-clock timer: covers building the query and the show() action below.
start = time.time()

# One row per (question_id, month) with the number of answers in 'cnt'.
answers_month = (
    answersDF
    .withColumn('month', month('creation_date'))
    .groupBy('question_id', 'month')
    .agg(count('*').alias('cnt'))
)

# Attach the monthly counts to each question (inner join on question_id).
resultDF = (
    questionsDF
    .join(answers_month, 'question_id')
    .select('question_id', 'creation_date', 'title', 'month', 'cnt')
)

resultDF.orderBy('question_id', 'month').show()

print("Processing time: %s seconds" % (time.time() - start))

# Print the physical plan of the join (without the final ordering).
resultDF.explain()




                                                                                

+-----------+--------------------+--------------------+-----+---+
|question_id|       creation_date|               title|month|cnt|
+-----------+--------------------+--------------------+-----+---+
|     155989|2014-12-31 20:59:...|Frost bubble form...|    1|  1|
|     155989|2014-12-31 20:59:...|Frost bubble form...|    2|  1|
|     155990|2014-12-31 21:51:...|The abstract spac...|    1|  2|
|     155992|2014-12-31 22:44:...|centrifugal force...|    1|  1|
|     155993|2014-12-31 22:56:...|How can I estimat...|    1|  1|
|     155995|2015-01-01 00:16:...|Why should a solu...|    1|  3|
|     155996|2015-01-01 01:06:...|Why do we assume ...|    1|  2|
|     155996|2015-01-01 01:06:...|Why do we assume ...|    2|  1|
|     155996|2015-01-01 01:06:...|Why do we assume ...|   11|  1|
|     155997|2015-01-01 01:26:...|Why do square sha...|    1|  3|
|     155999|2015-01-01 02:01:...|Diagonalizability...|    1|  1|
|     156008|2015-01-01 03:48:...|Capturing a light...|    1|  2|
|     1560

In [11]:
# Preview the answers data: first 20 rows, long cell values truncated.
answersDF.show(n=20, truncate=True)

+-----------+---------+--------------------+--------+-------+-----+
|question_id|answer_id|       creation_date|comments|user_id|score|
+-----------+---------+--------------------+--------+-------+-----+
|     226592|   226595|2015-12-29 18:46:...|       3|  82798|    2|
|     388057|   388062|2018-02-22 13:52:...|       8|    520|   21|
|     293286|   293305|2016-11-17 16:35:...|       0|  47472|    2|
|     442499|   442503|2018-11-22 01:34:...|       0| 137289|    0|
|     293009|   293031|2016-11-16 08:36:...|       0|  83721|    0|
|     395532|   395537|2018-03-25 01:51:...|       0|   1325|    0|
|     329826|   329843|2017-04-29 11:42:...|       4|    520|    1|
|     294710|   295061|2016-11-26 20:29:...|       2| 114696|    2|
|     291910|   291917|2016-11-10 05:56:...|       0| 114696|    2|
|     372382|   372394|2017-12-03 21:17:...|       0| 172328|    0|
|     178387|   178394|2015-04-25 13:31:...|       6|  62726|    0|
|     393947|   393948|2018-03-17 18:22:...|    

In [12]:
# Preview the questions data: first 20 rows, long cell values truncated.
questionsDF.show(n=20, truncate=True)

+-----------+--------------------+--------------------+--------------------+------------------+--------+-------+-----+
|question_id|                tags|       creation_date|               title|accepted_answer_id|comments|user_id|views|
+-----------+--------------------+--------------------+--------------------+------------------+--------+-------+-----+
|     382738|[optics, waves, f...|2018-01-28 02:22:...|What is the pseud...|            382772|       0|  76347|   32|
|     370717|[field-theory, de...|2017-11-25 04:09:...|What is the defin...|              null|       1|  75085|   82|
|     339944|[general-relativi...|2017-06-17 16:32:...|Could gravitation...|              null|      13| 116137|  333|
|     233852|[homework-and-exe...|2016-02-04 16:19:...|When does travell...|              null|       9|  95831|  185|
|     294165|[quantum-mechanic...|2016-11-22 06:39:...|Time-dependent qu...|              null|       1| 118807|   56|
|     173819|[homework-and-exe...|2015-04-02 11:

## Optimized Query: Using a broadcast join hint and filtering out questions with a null `accepted_answer_id`

In [37]:

import os
import time  # needed for the timer below; previously only imported by an earlier cell

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, month
from pyspark.sql.functions import broadcast


# Reuse the active Spark session, or start one named 'Optimize I'.
spark = SparkSession.builder.appName('Optimize I').getOrCreate()

# The project root is taken to be the working directory with its last three
# path components removed.
base_path = os.getcwd()

project_path = ('/').join(base_path.split('/')[0:-3])

answers_input_path = os.path.join(project_path, 'Springboard/Spark_optimizationproject/Optimization/data/answers')

questions_input_path = os.path.join(project_path, 'Springboard/Spark_optimizationproject/Optimization/data/questions')


answersDF = spark.read.option('path', answers_input_path).load()

# Keep only questions that have an accepted answer.
# NOTE(review): this filter changes the result set, not just the execution
# plan -- questions without an accepted answer are dropped, so this query
# returns fewer rows than the same join run on the unfiltered questions.
# Any timing comparison against the unfiltered query is therefore not
# like-for-like.
questionsDF = spark.read.option('path', questions_input_path).load().filter(col("accepted_answer_id").isNotNull())

# Answers aggregation
#
# Here we: get the number of answers per question per month.

# Wall-clock timer: covers building the query and the show() action below.
start = time.time()

# broadcast() attaches a broadcast *join hint* to the DataFrame (it does not
# create a broadcast variable). The hint is set here, before the aggregation;
# presumably it carries through to the join below -- confirm in the
# explain() output at the end of this cell.
answers = broadcast(answersDF)


# One row per (question_id, month) with the number of answers in 'cnt'.
answers_month = answers.withColumn('month', month('creation_date')).groupBy('question_id', 'month').agg(count('*').alias('cnt'))


# Attach the monthly counts to each remaining question (inner join on question_id).
resultDF = questionsDF.join((answers_month), 'question_id').select('question_id', 'creation_date', 'title', 'month', 'cnt')

resultDF.orderBy('question_id', 'month').show()

print("Processing time: %s seconds" % (time.time() - start))

# Print the physical plan so the join strategy actually chosen can be checked,
# mirroring the explain() call made for the unoptimized query.
resultDF.explain()



                                                                                

+-----------+--------------------+--------------------+-----+---+
|question_id|       creation_date|               title|month|cnt|
+-----------+--------------------+--------------------+-----+---+
|     155989|2014-12-31 20:59:...|Frost bubble form...|    1|  1|
|     155989|2014-12-31 20:59:...|Frost bubble form...|    2|  1|
|     155990|2014-12-31 21:51:...|The abstract spac...|    1|  2|
|     155995|2015-01-01 00:16:...|Why should a solu...|    1|  3|
|     155996|2015-01-01 01:06:...|Why do we assume ...|    1|  2|
|     155996|2015-01-01 01:06:...|Why do we assume ...|    2|  1|
|     155996|2015-01-01 01:06:...|Why do we assume ...|   11|  1|
|     155997|2015-01-01 01:26:...|Why do square sha...|    1|  3|
|     156008|2015-01-01 03:48:...|Capturing a light...|    1|  2|
|     156008|2015-01-01 03:48:...|Capturing a light...|   11|  1|
|     156022|2015-01-01 06:55:...|Advice on Major S...|    1|  1|
|     156025|2015-01-01 07:32:...|Deriving the Cano...|    1|  1|
|     1560

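As can be observed from the processing times, the optimized code took 1.39 seconds, while the original query took 1.67 seconds. Note that the optimized query also filters out questions without an accepted answer, so it returns fewer rows than the original (for example, question 155992 appears in the original output but not in the optimized one); part of the speed-up may therefore come from processing less data rather than from the broadcast join alone.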