In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f, types as t
import os

In [2]:
# Both input files live under ./datasets, resolved against the current working directory.
airport_codes_dataset, delays_dataset = (
    os.path.join(os.getcwd(), relative_path)
    for relative_path in (
        "datasets/airport-codes-na.txt",
        "datasets/departuredelays.csv",
    )
)

In [3]:
# Create (or reuse) the SparkSession for this notebook, with the driver bound to 127.0.0.1.
spark = (
    SparkSession.builder
    .appName("spark-sql")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/17 21:45:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the two source files as CSV with a header row. The airport codes file is
# tab-delimited; the delays file uses the default delimiter. No schema or inferSchema
# option is set here.
df_airport_codes = (
    spark.read
    .format("csv")
    .options(header="true", delimiter="\t")
    .load(airport_codes_dataset)
)

df_delays = (
    spark.read
    .format("csv")
    .options(header="true")
    .load(delays_dataset)
)



In [5]:
# Preview both DataFrames; these are show()'s defaults (first 20 rows, long values truncated).
df_airport_codes.show(n=20, truncate=True)
df_delays.show(n=20, truncate=True)

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Abbotsford|   BC| Canada| YXX|
|   Aberdeen|   SD|    USA| ABR|
|    Abilene|   TX|    USA| ABI|
|      Akron|   OH|    USA| CAK|
|    Alamosa|   CO|    USA| ALS|
|     Albany|   GA|    USA| ABY|
|     Albany|   NY|    USA| ALB|
|Albuquerque|   NM|    USA| ABQ|
| Alexandria|   LA|    USA| AEX|
|  Allentown|   PA|    USA| ABE|
|   Alliance|   NE|    USA| AIA|
|     Alpena|   MI|    USA| APN|
|    Altoona|   PA|    USA| AOO|
|   Amarillo|   TX|    USA| AMA|
|Anahim Lake|   BC| Canada| YAA|
|  Anchorage|   AK|    USA| ANC|
|   Appleton|   WI|    USA| ATW|
|     Arviat|  NWT| Canada| YEK|
|  Asheville|   NC|    USA| AVL|
|      Aspen|   CO|    USA| ASE|
+-----------+-----+-------+----+
only showing top 20 rows

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|    6|     602|   ABE|        ATL|
|0

In [6]:
# Using expr(), convert the delay and distance columns from STRING to INT.
# withColumn() already names the resulting column, so the SQL snippet only needs the
# cast itself; a trailing "as <name>" alias inside expr() is redundant.
df_delays = (
    df_delays
    .withColumn("delay", f.expr("cast(delay as INT)"))
    .withColumn("distance", f.expr("cast(distance as INT)"))
)

In [7]:
# Print the schema to confirm that delay and distance are now integer columns.
df_delays.printSchema()

root
 |-- date: string (nullable = true)
 |-- delay: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [8]:
# Build a small DataFrame, df_filtered, to use in the demo examples below. It keeps only
# delayed flights (delay > 0) from Seattle (SEA) to San Francisco (SFO) whose date string
# starts with '01010', i.e. a short time range.
df_filtered = df_delays.filter(
    f.expr("""origin == 'SEA' and destination == 'SFO' and 
                        delay > 0 and date like '01010%'""")
)

## Unions
A common pattern in Apache Spark is to union two DataFrames that have the same schema.
This can be achieved using the `union()` method:

In [9]:
# df_union appends the rows of df_filtered to df_delays. Every row of df_filtered was
# selected from df_delays, so applying the same filter to the union shows each of those
# rows twice, as expected.
df_union = df_delays.union(df_filtered)
df_union.createOrReplaceTempView("union_data")

(
    df_union
    .where(f.expr(""" origin == 'SEA' and destination == 'SFO' and 
                        delay > 0 and date like '01010%' """))
    .show()
)

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
|01010710|   31|     590|   SEA|        SFO|
|01010955|  104|     590|   SEA|        SFO|
|01010730|    5|     590|   SEA|        SFO|
+--------+-----+--------+------+-----------+



## Joins
A common DataFrame operation is to join two DataFrames (or tables) together. By
default, a Spark SQL join is an inner join, with the options being inner, cross,
outer, full, full_outer, left, left_outer, right, right_outer, left_semi, and
left_anti. More information is available in the documentation (this is applicable to
Scala as well as Python).

The following code sample performs the default inner join between the airport codes
(`df_airport_codes`) and filtered flights (`df_filtered`) DataFrames:

In [10]:
# Inner join (the default join type): match each filtered flight's origin code against
# the airport's IATA code to attach the origin City and State to the flight.
# show() only prints and returns None, so keep the joined DataFrame in join_df and
# display it as a separate step; otherwise join_df would be None.
join_df = (
    df_airport_codes
    .join(df_filtered, on=df_filtered.origin == df_airport_codes.IATA)
    .select("City", "State", "date", "delay", "distance", "destination")
)
join_df.show()

+-------+-----+--------+-----+--------+-----------+
|   City|State|    date|delay|distance|destination|
+-------+-----+--------+-----+--------+-----------+
|Seattle|   WA|01010710|   31|     590|        SFO|
|Seattle|   WA|01010955|  104|     590|        SFO|
|Seattle|   WA|01010730|    5|     590|        SFO|
+-------+-----+--------+-----+--------+-----------+



## Windowing

A window function uses values from the rows in a window (a range of input rows) to
return a set of values, typically in the form of another row. With window functions, it
is possible to operate on a group of rows while still returning a single value for every
input row. In this section, we will show how to use the dense_rank() window function;
there are many other functions, as noted in Table 5-5.

In [11]:
# For each of the origin airports SEA, SFO and JFK, find the three destinations with the
# largest total delay.
#
# Steps:
#   1. route_delays  - total delay per (origin, destination) pair.
#   2. ranked_routes - dense_rank() of those totals within each origin, largest first.
#   3. Keep ranks 1-3 for the three origins of interest.
# dense_rank() gives tied totals the same rank, so an origin can return more than three
# rows when totals tie. The final ORDER BY makes the displayed row order deterministic.

df_delays.createOrReplaceTempView("delays")

spark.sql("""
    WITH route_delays AS (
        SELECT origin, destination, SUM(delay) AS total_delay
        FROM delays
        GROUP BY origin, destination
    ),
    ranked_routes AS (
        SELECT origin, destination, total_delay,
               DENSE_RANK() OVER (PARTITION BY origin ORDER BY total_delay DESC) AS drank
        FROM route_delays
    )
    SELECT origin, destination, total_delay
    FROM ranked_routes
    WHERE drank <= 3
      AND origin IN ('SEA', 'SFO', 'JFK')
    ORDER BY origin, total_delay DESC
""").show()



+------+-----------+-----------+
|origin|destination|total_delay|
+------+-----------+-----------+
|   JFK|        LAX|      35755|
|   JFK|        SFO|      35619|
|   JFK|        MCO|      28419|
|   SEA|        SFO|      22293|
|   SEA|        DEN|      13645|
|   SEA|        ORD|      10041|
|   SFO|        LAX|      40798|
|   SFO|        LAS|      30030|
|   SFO|        ORD|      27412|
+------+-----------+-----------+



                                                                                

### TODO: learn about `pivot`