#### References:<br>
https://docs.azuredatabricks.net/user-guide/importing-data.html<br>
https://docs.azuredatabricks.net/spark/latest/faq/join-two-dataframes-duplicated-column.html<br>

https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame<br>

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import broadcast, lit

In [3]:
# Input CSV locations under the /mnt/hack mount, referenced by the reader cells below.
# Flight records (joined to airports via OriginAirportID / DestAirportID further down).
source_file_flights_raw = "/mnt/hack/csv/sample/dat202/raw-flight-data.csv"
# Airport lookup table.
source_file_airports = "/mnt/hack/csv/sample/dat202/airports.csv"

#### With header and schema inference

In [5]:
# Load the airports CSV, treating the first row as column names and letting
# Spark infer each column's type from the data.
df_airports = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load(source_file_airports)
)

In [6]:
# Load the raw flight CSV the same way: header row supplies the column names,
# column types are inferred by Spark.
df_flights_raw = spark.read.load(
    source_file_flights_raw,
    format="csv",
    header="true",
    inferSchema="true",
)

#### With header and explicit schema

In [8]:
# Explicit schema for the airports CSV: an integer id followed by three string
# columns. Every field is declared nullable.
schema_airports = (
    StructType()
    .add("airport_id", IntegerType(), True)
    .add("city", StringType(), True)
    .add("state", StringType(), True)
    .add("name", StringType(), True)
)

In [9]:
# Explicit schema for the raw flight CSV. Column order matches the list below;
# every field is declared nullable.
schema_flights = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("DayofMonth", IntegerType()),
        ("DayOfWeek", IntegerType()),
        ("Carrier", StringType()),
        ("OriginAirportID", IntegerType()),
        ("DestAirportID", IntegerType()),
        ("DepDelay", IntegerType()),
        ("ArrDelay", IntegerType()),
    )
])

In [10]:
# Re-read the airports CSV, this time applying schema_airports instead of
# inferring the column types. The header row is still consumed as column names.
df_airports = (
    spark.read
    .format("csv")
    .schema(schema_airports)
    .option("header", "true")
    .load(source_file_airports)
)

In [11]:
# Re-read the raw flight CSV with the explicit schema_flights schema
# (header row present, no type inference).
df_flights_raw = spark.read.load(
    source_file_flights_raw,
    format="csv",
    schema=schema_flights,
    header="true",
)

In [12]:
# broadcast() does not modify its argument: it returns a new DataFrame that carries
# the broadcast-join hint. The original cell discarded that result, so the hint never
# reached the joins. Rebind the name so the later joins on df_airports use the hinted frame.
df_airports = broadcast(df_airports)

In [13]:
# DataFrame.cache() is lazy: it only marks df_airports for caching, and the data is
# stored when an action first evaluates it. (The Spark SQL equivalent, CACHE TABLE,
# would cache eagerly.)

df_airports.cache()

In [14]:
# count() is an action, so it forces df_airports to be evaluated - which is what
# actually populates the lazy cache requested in the previous cell.
df_airports.count()

#### Join flights and airports

In [16]:
# Enrich each flight with its origin airport's details. A left outer join keeps
# flights whose OriginAirportID has no match in df_airports. The lookup key is
# dropped afterwards and the airport columns are prefixed with "Origin" so they
# do not collide with the destination columns added in the next join.
df2 = (
    df_flights_raw
    .join(
        df_airports,
        on=df_flights_raw["OriginAirportID"] == df_airports["airport_id"],
        how="leftouter",
    )
    .drop("airport_id")
    .withColumnRenamed("city", "OriginCity")
    .withColumnRenamed("state", "OriginState")
    .withColumnRenamed("name", "OriginAirportName")
)

In [17]:
# Second lookup against df_airports, this time keyed on the destination airport.
# Same pattern as the origin join: left outer join, drop the lookup key, then
# rename the airport columns with a "Dest" prefix.
df_flights_full = (
    df2
    .join(
        df_airports,
        on=df2["DestAirportID"] == df_airports["airport_id"],
        how="leftouter",
    )
    .drop("airport_id")
    .withColumnRenamed("city", "DestCity")
    .withColumnRenamed("state", "DestState")
    .withColumnRenamed("name", "DestAirportName")
)

In [18]:
# Mark the fully joined frame for caching. cache() is lazy, so the data is stored
# when the first action runs against it (the describe() in the next cell).
df_flights_full.cache()

In [19]:
# Show summary statistics (count, mean, stddev, min, max) for every column of the
# joined frame. Non-null counts below the row total reveal columns with missing values.
display(df_flights_full.describe())

summary,DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,DepDelay,ArrDelay,OriginCity,OriginState,OriginAirportName,DestCity,DestState,DestAirportName
count,2719418.0,2719418.0,2719418,2719418.0,2719418.0,2691974.0,2690385.0,2719418,2719418,2719418,2719418,2719418,2719418
mean,15.79747468024408,3.8983907586108497,,12742.26441172339,12742.455345592329,10.53686662649788,6.63768791455498,,,,,,
stddev,8.799860168985386,1.985988139037331,,1501.9729397025624,1501.9692528927906,36.0995280664314,38.64881489390076,,,,,,
min,1.0,1.0,9E,10140.0,10140.0,-63.0,-94.0,Albuquerque,AK,Albuquerque International Sunport,Albuquerque,AK,Albuquerque International Sunport
max,31.0,7.0,YV,15376.0,15376.0,1863.0,1845.0,West Palm Beach/Palm Beach,WI,William P Hobby,West Palm Beach/Palm Beach,WI,William P Hobby


In [20]:
# Render the joined flights table: the original flight columns plus the
# Origin*/Dest* airport columns added by the two joins above.
display(df_flights_full)

DayofMonth,DayOfWeek,Carrier,OriginAirportID,DestAirportID,DepDelay,ArrDelay,OriginCity,OriginState,OriginAirportName,DestCity,DestState,DestAirportName
19,5,DL,11433,13303,-3.0,1.0,Detroit,MI,Detroit Metro Wayne County,Miami,FL,Miami International
19,5,DL,14869,12478,0.0,-8.0,Salt Lake City,UT,Salt Lake City International,New York,NY,John F. Kennedy International
19,5,DL,14057,14869,-4.0,-15.0,Portland,OR,Portland International,Salt Lake City,UT,Salt Lake City International
19,5,DL,15016,11433,28.0,24.0,St. Louis,MO,Lambert-St. Louis International,Detroit,MI,Detroit Metro Wayne County
19,5,DL,11193,12892,-6.0,-11.0,Cincinnati,OH,Cincinnati/Northern Kentucky International,Los Angeles,CA,Los Angeles International
19,5,DL,10397,15016,-1.0,-19.0,Atlanta,GA,Hartsfield-Jackson Atlanta International,St. Louis,MO,Lambert-St. Louis International
19,5,DL,15016,10397,0.0,-1.0,St. Louis,MO,Lambert-St. Louis International,Atlanta,GA,Hartsfield-Jackson Atlanta International
19,5,DL,10397,14869,15.0,24.0,Atlanta,GA,Hartsfield-Jackson Atlanta International,Salt Lake City,UT,Salt Lake City International
19,5,DL,10397,10423,33.0,34.0,Atlanta,GA,Hartsfield-Jackson Atlanta International,Austin,TX,Austin - Bergstrom International
19,5,DL,11278,10397,323.0,322.0,Washington,DC,Ronald Reagan Washington National,Atlanta,GA,Hartsfield-Jackson Atlanta International
