# Airport Codes Data Processing

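Staging script for the Airport Codes processing Spark job: it filters the raw airport codes to US airports, builds the ports fact table, and writes a state/city lookup table.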

In [1]:
# import libraries
import datetime
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Spark Job
---

In [2]:
def initialize_spark():
    """
    Returns a spark session, creating one if none is active

    Returns
    -------
    spark session
        The session obtained from the builder's getOrCreate()
    """
    # NOTE(review): the app name mentions "immigration" although this notebook
    # processes airport codes - confirm whether that is intentional
    builder = SparkSession.builder.appName("immigration-data-preprocessing")

    return builder.getOrCreate()

In [3]:
def preprocess_ports_data(spark, input_fp):
    """
    Preprocesses the airport codes data to be used by other functions

    Loads the raw CSV, keeps only the rows whose iso_country is "US", derives
    a state_id column from iso_region (the text after the last "-") and drops
    the rows whose state_id is not exactly 2 characters long.

    Params
    ------
    spark: spark session
        An initialized spark session
    input_fp: str
        The location of the airport codes raw data file

    Returns
    -------
    spark dataframe
        The US airport records with an added state_id column
    """

    # load raw data (the first row of the file is the header)
    ports = spark.read.csv(input_fp, header=True)

    # filter out the non us airports
    ports = ports.where(ports["iso_country"] == "US")

    # extract the US states from the iso_region (e.g. "US-CA" -> "CA").
    # Native column functions are used instead of a Python UDF: they propagate
    # a null iso_region as a null state_id (the UDF raised on None and failed
    # the whole job) and avoid shipping every row through a Python worker.
    region_suffix = F.substring_index(ports["iso_region"], "-", -1)
    # strip leading/trailing whitespace, mirroring str.strip()
    state_id = F.regexp_replace(region_suffix, r"^\s+|\s+$", "")
    ports = ports.withColumn("state_id", state_id)

    # drop any records with state codes that aren't 2 characters long
    # (a null state_id is dropped too, since the comparison evaluates to null)
    ports = ports.where(F.length(ports["state_id"]) == 2)

    return ports

In [4]:
def create_ports_fact(ports, output_fp, mode="overwrite"):
    """
    Creates the ports fact table

    Counts the ports per (state_id, type) pair, adds a record_id column and
    writes the result as parquet to output_fp + "fact_ports/".

    Params
    ------
    ports: spark dataframe
        The preprocessed airport codes data
    output_fp: str
        The location where the final fact table should be stored
        (expected to end with a path separator, it is concatenated as is)
    mode: str, optional
        The save mode handed to the parquet writer. Defaults to "overwrite"
        because the table is rebuilt from the full input on every run;
        appending would store every row again, double the port counts and
        repeat record_id values. Pass "append" to keep previously written rows.
    """
    # count ports per state/type, tag each row with an id and give the
    # columns their final names
    fact_trans_ports = (
        ports
        .groupby("state_id", "type")
        .count()
        # ids are unique and increasing, but not guaranteed to be consecutive
        .withColumn("record_id", F.monotonically_increasing_id())
        .withColumnRenamed("type", "port_type")
        .withColumnRenamed("count", "num_of_ports")
    )

    fact_trans_ports.write.parquet(output_fp + "fact_ports/", mode=mode)

In [5]:
def create_state_city_lookup(ports, preprocessed_fp, mode="overwrite"):
    """
    Creates a state city lookup table that we can use for the temperature data (to assign state codes)

    Params
    ------
    ports: spark dataframe
        The preprocessed airport codes data (needs the state_id and
        municipality columns)
    preprocessed_fp: str
        The folder where the lookup table should be stored
        (expected to end with a path separator, it is concatenated as is)
    mode: str, optional
        The save mode handed to the parquet writer. Defaults to "overwrite"
        so that re-running the job does not re-add the same pairs and undo
        the de-duplication. Pass "append" to keep previously written rows.
    """
    # get state id and municipality and drop duplicates
    state_city_lookup = ports.select("state_id", "municipality").dropDuplicates()

    # write to preprocessed folder
    state_city_lookup.write.parquet(preprocessed_fp + "state_city_lookup_ports/", mode=mode)

In [6]:
def main():
    """
    The main function that runs the airport codes spark job

    Preprocesses the raw airport codes file, then writes the ports fact table
    and the state city lookup table. The spark session is stopped when the
    job ends, whether it succeeded or raised.
    """
    # hardcoded variables
    input_fp = "../data/airport-codes_csv.csv"
    output_fp = "../data/output_files/"
    preprocessed_fp = "../data/preprocessed_files/"

    # run spark job
    spark = initialize_spark()
    try:
        ports = preprocess_ports_data(spark, input_fp)
        create_ports_fact(ports, output_fp)
        create_state_city_lookup(ports, preprocessed_fp)
    finally:
        # always release the session, even if one of the steps above fails
        spark.stop()


In [7]:
# run the full airport codes job defined above: preprocess the raw file, then
# write the ports fact table and the state city lookup table
main()

22/04/07 02:42:09 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.1.9 instead (on interface wlo1)
22/04/07 02:42:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/07 02:42:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/07 02:42:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/07 02:42:10 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/04/07 02:42:10 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/04/07 02:42:10 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
                      

# Testing
---

In [8]:
# initialize spark again for testing (main() stops the session it used)
spark = initialize_spark()

# load the fact table written by create_ports_fact
fact_trans = spark.read.parquet("../data/output_files/fact_ports/")

22/04/07 02:42:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/07 02:42:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/04/07 02:42:24 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/04/07 02:42:24 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


In [9]:
# preview the first 10 rows of the fact table as a pandas dataframe
fact_trans.limit(10).toPandas()

Unnamed: 0,state_id,port_type,num_of_ports,record_id
0,ME,seaplane_base,44,0
1,UT,small_airport,95,1
2,VA,closed,22,2
3,MI,medium_airport,16,3
4,ID,medium_airport,9,4
5,NC,small_airport,329,5
6,SD,heliport,39,6
7,MO,seaplane_base,4,7
8,NY,heliport,184,8
9,ND,closed,6,9


In [10]:
# check total: sum num_of_ports over every (state, port type) row
fact_trans.agg({"num_of_ports": "sum"}).collect()

[Row(sum(num_of_ports)=22747)]

In [11]:
# check lookup table: load the state city lookup written by create_state_city_lookup
state_city_lookup = spark.read.parquet("../data/preprocessed_files/state_city_lookup_ports/")

In [12]:
# check head: preview up to 20 rows of the lookup table as a pandas dataframe
state_city_lookup.limit(20).toPandas()

Unnamed: 0,state_id,municipality
0,MI,Grant
1,IL,Bonfield
2,NE,Wilber
3,IN,Indianapolis
4,KS,Ottawa
5,LA,Morrow
6,MI,Hulbert
7,MO,Kansas City
8,PA,Elizabethville
9,NC,Youngsville
