## Ingest airport_data.json file

In [None]:
# Declare a text widget named "data_source" with an empty-string default.
dbutils.widgets.text("data_source", "")
# Read the widget's current value; it is later stamped onto every row as the "source" column.
data_source = dbutils.widgets.get("data_source")

In [None]:
# Declare a text widget named "file_date"; "2000-01-01" is the default when no value is supplied.
dbutils.widgets.text("file_date", "2000-01-01")
# The value selects the dated sub-folder to read from and is later stamped onto every row as "file_date".
file_date = dbutils.widgets.get("file_date")

In [None]:
%run "../includes/configurations"

##### Imports

In [None]:
from pyspark.sql.functions import (
    split,
    current_timestamp,
    trim,
    col,
    lit
)

##### Step 1 - Read the JSON file using the spark dataframe reader

In [None]:
# DDL-based schema definition: one "<column> <TYPE>" entry per field of the
# source JSON, joined into the single comma-separated string the reader expects.
airports_schema = ", ".join(
    [
        "ident STRING",
        "type STRING",
        "name STRING",
        "elevation_ft INT",
        "continent STRING",
        "iso_country STRING",
        "iso_region STRING",
        "municipality STRING",
        "gps_code STRING",
        "iata_code STRING",
        "local_code STRING",
        "coordinates STRING",
    ]
)

**Note:**
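By default, Spark's JSON reader expects one complete JSON object per line (JSON Lines). To read a file whose records span multiple lines, the `multiLine` option must be enabled on the reader.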

In [None]:
# Load the airport file from the dated sub-folder of the raw layer.
# The explicit schema is applied instead of letting Spark infer one, and
# multiLine lets the reader parse records that span several lines.
airports_df = (
    spark.read
    .schema(airports_schema)
    .option("multiLine", True)
    .json(f"{raw_folder_path}/{file_date}/airport_data.json")
)

In [None]:
# Print the DataFrame's schema so the applied column names and types can be checked.
airports_df.printSchema()

In [None]:
# Render the freshly loaded DataFrame for visual inspection; the result is not used downstream.
display(airports_df)

##### Step 2 - Select the required columns

In [None]:
# Columns carried forward from the raw data, wrapped as Column objects.
# ident, gps_code and local_code are intentionally left out.
req_cols = [
    col(col_name)
    for col_name in (
        "type",
        "name",
        "elevation_ft",
        "continent",
        "iso_country",
        "iso_region",
        "municipality",
        "iata_code",
        "coordinates",
    )
]

In [None]:
# Project the raw DataFrame down to the required columns only.
airports_selected_df = airports_df.select(*req_cols)

##### Step 3 - Rename the required columns

In [None]:
# Prefix the generic "type" and "name" columns with "airport_" so their
# meaning is explicit.
airports_renamed_df = (
    airports_selected_df
    .withColumnRenamed("type", "airport_type")
    .withColumnRenamed("name", "airport_name")
)

##### Step 4 - Add new columns

In [None]:
# Column expression that breaks the comma-separated "coordinates" string into
# an array of parts; it is only evaluated when used in a transformation.
split_coordinates = split(
    airports_renamed_df.coordinates,
    ",",
)

In [None]:
import os
import sys

# Make the parent directory importable so the project-local `src` package
# resolves. This replaces the previous `os.chdir("..")`, which changed the
# process-wide working directory and climbed one more level every time the
# cell was re-run, eventually breaking the import below.
_project_root = os.path.abspath("..")
if _project_root not in sys.path:
    # Inserted at the front so `src` is resolved from the project first,
    # mirroring the lookup priority the working directory used to have.
    sys.path.insert(0, _project_root)

from src import utils as ut

In [None]:
# Derive latitude/longitude from the split coordinates, let the shared helper
# add its ingestion column(s), then drop the now-redundant source column.
# NOTE(review): this assumes "coordinates" is ordered "<latitude>, <longitude>"
# -- confirm against the source file, since some airport datasets list
# longitude first.
# NOTE(review): ut.add_ingestion_date presumably appends an ingestion
# timestamp column -- verify in src/utils.
airports_new_cols_df = ut.add_ingestion_date(
    airports_renamed_df
    .withColumn("latitude", trim(split_coordinates.getItem(0)))
    .withColumn("longitude", trim(split_coordinates.getItem(1)))
).drop(col("coordinates"))

In [None]:
# Cast the trimmed coordinate strings to doubles and stamp each row with the
# notebook parameters so its origin and load date are recorded.
airports_final_df = (
    airports_new_cols_df
    .withColumn("latitude", col("latitude").cast("double"))
    .withColumn("longitude", col("longitude").cast("double"))
    .withColumn("source", lit(data_source))
    .withColumn("file_date", lit(file_date))
)

In [None]:
# Render the final DataFrame for visual inspection before it is written; the result is not used downstream.
display(airports_final_df)

##### Step 5 - Write output as a Parquet table

In [None]:
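# Alternative (disabled): write plain parquet files to the processed folder instead of saving a table.
# airports_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}airports")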

In [None]:
# Persist the result as the parquet-backed table
# dev_air_travel_processed.airports; "overwrite" mode replaces any existing
# contents on every run.
(
    airports_final_df.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("dev_air_travel_processed.airports")
)

In [None]:
# End the notebook run and hand "Success" back as its exit value (presumably checked by a calling notebook or job).
dbutils.notebook.exit("Success")