## Ingest airline_data.json file

In [0]:
# Notebook parameter `data_source` (default: empty string). Its value is later
# written into the `source` column of the output.
dbutils.widgets.text("data_source", "")
data_source = dbutils.widgets.get("data_source")

In [0]:
# Notebook parameter `file_date` (default: "2000-01-01"). It is used below as the
# sub-folder of the raw path to read from and is stored in the `file_date` column.
dbutils.widgets.text("file_date", "2000-01-01")
file_date = dbutils.widgets.get("file_date")

In [0]:
%run "../includes/configurations"

##### Imports

In [0]:
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType
)
from pyspark.sql.functions import col, current_timestamp, lit

##### Step 1 - Read the JSON file using the spark dataframe reader

In [0]:
# Explicit schema for the airline JSON records. Every attribute is read as a
# string; `id` is the only field declared non-nullable.
airlines_schema = StructType([
    StructField("id", StringType(), nullable=False),
    *(
        StructField(field_name, StringType(), nullable=True)
        for field_name in (
            "name",
            "alias",
            "iata",
            "icao",
            "callsign",
            "country",
            "active",
        )
    ),
])

**Note:**
By default, Spark's JSON reader expects one complete JSON record per line. This file is a multi-line JSON document, so the `multiLine` option must be enabled on the reader.

In [0]:
# Location of the airline file inside the raw folder for the requested file date.
# NOTE(review): `raw_folder_path` is presumably defined by the `%run` of
# ../includes/configurations -- confirm.
airline_json_path = f"{raw_folder_path}/{file_date}/airline_data.json"

# Read the file with the explicit schema; multiLine is enabled for this JSON file.
airlines_df = (
    spark.read
    .schema(airlines_schema)
    .option("multiLine", True)
    .json(airline_json_path)
)

In [0]:
# Print the schema of the DataFrame that was just read, as a quick sanity check.
airlines_df.printSchema()

In [0]:
# Render the raw airline records for visual inspection.
display(airlines_df)

##### Step 2 - Drop unwanted columns

In [0]:
# Remove the two columns that are not carried forward: `alias` and `callsign`.
airlines_dropped_df = airlines_df.drop("alias", "callsign")

##### Step 3 - Rename the required columns

In [0]:
# Prefix the generic column names so they stay unambiguous:
# `id` -> `airline_id`, `country` -> `airline_country`.
airlines_renamed_df = (
    airlines_dropped_df
    .withColumnRenamed("id", "airline_id")
    .withColumnRenamed("country", "airline_country")
)

##### Step 4 - Add new columns

In [0]:
import os

# Switch the working directory to the parent folder so the `src` package can be
# imported (assumes `src` lives one level above this notebook -- TODO confirm).
# The guard keeps this cell idempotent: an unconditional os.chdir("..") climbs
# one more directory on every re-run of the cell, after which `src` is no
# longer importable.
if not os.path.isdir("src"):
    os.chdir("..")

from src import utils as ut

In [0]:
# Add the audit columns in a single chain:
#   - whatever `ut.add_ingestion_date` adds (helper defined in src/utils),
#   - `source`: the value of the `data_source` widget,
#   - `file_date`: the value of the `file_date` widget.
airlines_final_df = (
    ut.add_ingestion_date(airlines_renamed_df)
    .withColumn("source", lit(data_source))
    .withColumn("file_date", lit(file_date))
)

In [0]:
# Render the final DataFrame (dropped, renamed and with the added columns) before writing it.
display(airlines_final_df)

##### Step 5 - Write output to a table in Parquet format

In [0]:
# Disabled alternative: write the Parquet files to a path instead of saving as a table.
# airlines_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}airlines")

In [0]:
# Save the result as the table dev_air_travel_processed.airlines in Parquet
# format, using overwrite mode.
airlines_writer = airlines_final_df.write.format("parquet").mode("overwrite")
airlines_writer.saveAsTable("dev_air_travel_processed.airlines")

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")