In [1]:
# findspark.init() runs before pyspark is imported; keep this ordering
# (presumably it is what makes the pyspark package importable here - confirm).
import findspark
findspark.init()

import pyspark

from pyspark.sql import SparkSession

# Reuse the current session if one exists, otherwise create one; the
# application is registered under the name 'sandbox'.
spark = (
    SparkSession.builder
    .appName('sandbox')
    .getOrCreate()
)

In [2]:
# DataFrame built from spark.range(1000), with its column named "number".
myRange = (
    spark
    .range(1000)
    .toDF("number")
)

In [3]:
# Keep only the rows of myRange whose "number" value is even.
divisBy2 = (
    myRange
    .where("number % 2 = 0")
)

In [4]:
# Load the 2015 flight summary CSV: the first line is treated as the header
# and column types are inferred from the data.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("gs://is843/notebooks/data/flight-data/csv/2015-summary.csv")
)

In [5]:
# Print the physical plan for sorting the flight data by "count".
(
    flightData2015
    .sort("count")
    .explain()
)

== Physical Plan ==
*(2) Sort [count#16 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#16 ASC NULLS FIRST, 200)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#14,ORIGIN_COUNTRY_NAME#15,count#16] Batched: false, Format: CSV, Location: InMemoryFileIndex[gs://is843/notebooks/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [6]:
# Set the number of shuffle partitions to 5 for this session (the plan
# printed before this cell shows 200 range partitions; later plans show 5).
spark.conf.set(
    "spark.sql.shuffle.partitions",
    "5",
)

In [8]:
# First four rows after sorting by "count"; left as the cell's last
# expression so the notebook displays the resulting list of Row objects.
(
    flightData2015
    .sort("count")
    .take(4)
)

[Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='Malta', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [13]:
# Register the DataFrame as the temp view "flight_data_2015" so that the
# spark.sql(...) queries in later cells can refer to it by name.
flightData2015.createOrReplaceTempView(
    "flight_data_2015"
)


In [14]:
# The same per-destination row count, written once as SQL against the
# "flight_data_2015" temp view ...
sqlWay = spark.sql(
    """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
)

# ... and once with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans (SQL first, then DataFrame) for comparison.
sqlWay.explain()
dataFrameWay.explain()


== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#31, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#31] Batched: false, Format: CSV, Location: InMemoryFileIndex[gs://dataproc-bucket-is843-demo/notebooks/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>
== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#31, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#31] Batched: false, Format: CSV, Location: InMemoryFileIndex[gs://dataproc-bucket-is843-demo/notebooks/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>

In [15]:
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import max`: that form rebinds the name `max`
# in the notebook namespace and hides Python's built-in max() in every cell
# that runs afterwards.
from pyspark.sql import functions as F

# Largest value of the "count" column, returned as a one-element list of Row
# (the aggregate column is still named "max(count)").
flightData2015.select(F.max("count")).take(1)


[Row(max(count)=370002)]

In [16]:
# Top five destination countries by summed "count", queried through the
# "flight_data_2015" temp view.
maxSql = spark.sql(
    """
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
"""
)

# Render the result as a text table.
maxSql.show()


+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [17]:
from pyspark.sql.functions import desc

# DataFrame-API version of the top-five query: sum "count" per destination,
# rename the generated "sum(count)" column, sort descending, keep five rows
# and print them.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)


+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [18]:
# Same top-five pipeline as the previous cell, but printing the physical plan
# instead of the rows. Uses `desc`, which an earlier cell imports from
# pyspark.sql.functions.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)


== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#114L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#31,destination_total#114L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[sum(cast(count#33 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#31, 5)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#31], functions=[partial_sum(cast(count#33 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#31,count#33] Batched: false, Format: CSV, Location: InMemoryFileIndex[gs://dataproc-bucket-is843-demo/notebooks/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>
