In [0]:
import pytz
from datetime import datetime, timedelta

In [0]:
# Root folders that hold the raw Parquet subfolders, one per flight direction
departures_base_path = '/mnt/raw/departures/'
arrivals_base_path = '/mnt/raw/arrivals/'

# Appending the "*/" glob makes the reader pick up the files of every subfolder
df_departures = spark.read.parquet(departures_base_path + '*/')
df_arrivals = spark.read.parquet(arrivals_base_path + '*/')

# Number of rows loaded for each dataset
departure_row = df_departures.count()
arrivals_row = df_arrivals.count()

# One count per line: departures first, then arrivals
print(departure_row, arrivals_row, sep='\n')

196
202


In [0]:
# Order the departures by their actual departure time, earliest first
df_departures_sorted = df_departures.orderBy("actual_departure_myt", ascending=True)

# Show every row of the sorted DataFrame. The limit is the row count computed
# when the data was loaded (departure_row) rather than a hard-coded number, so
# the listing is not cut off when more departures are ingested.
df_departures_sorted.show(departure_row)

+---------+-------------+-------------+-----------------------+--------------------+------------------+--------------------+----------------+
|flight_id|flight_number|aircraft_type|scheduled_departure_myt|actual_departure_myt|            origin|         destination|gate_destination|
+---------+-------------+-------------+-----------------------+--------------------+------------------+--------------------+----------------+
|   MAS204|          204|        B738 |    2024-09-03 23:10:00| 2024-09-04 00:00:38|Kuala Lumpur Int'l|Thiruvananthapura...|            NULL|
|   KAL672|          672|        A333 |    2024-09-03 23:30:00| 2024-09-04 00:06:28|Kuala Lumpur Int'l|       Incheon Int'l|             248|
|  CSC3994|         3994|        A321 |    2024-09-04 00:15:00| 2024-09-04 00:09:25|Kuala Lumpur Int'l|Chengdu Tianfu Int'l|            NULL|
|   RMY382|          382|         NULL|    2024-09-04 00:39:25| 2024-09-04 00:12:17|Kuala Lumpur Int'l|    Singapore Changi|            NULL|
|   MX

In [0]:
# Order the arrivals by their actual arrival time, earliest first
df_arrivals_sorted = df_arrivals.orderBy("actual_arrival_myt", ascending=True)

# Show every row of the sorted DataFrame. The limit is the row count computed
# when the data was loaded (arrivals_row) rather than a hard-coded number, so
# the listing is not cut off when more arrivals are ingested.
df_arrivals_sorted.show(arrivals_row)

+---------+-------------+-------------+---------------------+-------------------+--------------------+------------------+----------------+
|flight_id|flight_number|aircraft_type|scheduled_arrival_myt| actual_arrival_myt|              origin|       destination|gate_destination|
+---------+-------------+-------------+---------------------+-------------------+--------------------+------------------+----------------+
|   KXP210|          210|         NULL|  2024-09-04 04:20:22|2024-09-04 00:15:05|       Chennai Int'l|Kuala Lumpur Int'l|            NULL|
|  CSN5079|         5079|        A320 |  2024-09-03 23:56:00|2024-09-04 00:16:33|Shenzhen Bao'an I...|Kuala Lumpur Int'l|            NULL|
|   MXD693|          693|        B38M |                  N/A|2024-09-04 00:21:08|                NULL|Kuala Lumpur Int'l|            NULL|
|  CES6021|         6021|        A359 |  2024-09-04 00:13:00|2024-09-04 00:21:26|Shanghai Pudong I...|Kuala Lumpur Int'l|            NULL|
|  AXM5752|         5752|  

In [0]:
def _find_part_file(files):
    """Return the path of the first entry whose name starts with "part-", or None."""
    for entry in files:
        if entry.name.startswith("part-"):
            return entry.path
    return None


def _rename_part_file(folder_path, target_path):
    """Rename the part file found in `folder_path` to `target_path`.

    Args:
        folder_path: Folder that was just written by the Parquet writer.
        target_path: Full destination path for the renamed part file.

    Returns:
        The directory listing of `folder_path` taken before the rename.

    Raises:
        FileNotFoundError: If the folder contains no file starting with "part-".
    """
    files = dbutils.fs.ls(folder_path)
    part_path = _find_part_file(files)
    if part_path is None:
        # Previously this case was skipped silently, leaving no compiled file
        # behind and no indication that anything went wrong.
        raise FileNotFoundError(f"No part file found in {folder_path}")
    # Move (rename) the part file to the desired file name
    dbutils.fs.mv(part_path, target_path)
    return files


# Output folders for the compiled datasets
folder_path_arrivals = '/mnt/compiled/arrivals/'
folder_path_departures = '/mnt/compiled/departures/'

# Final file names the part files are renamed to
corrected_file_path_arrivals = f"{folder_path_arrivals}arrivals_compiled.parquet"
corrected_file_path_departures = f"{folder_path_departures}departures_compiled.parquet"

# Coalesce each DataFrame to a single partition before writing
df_arrivals_coalesced = df_arrivals.coalesce(1)
df_departures_coalesced = df_departures.coalesce(1)

# Write the coalesced DataFrames as Parquet into their folders (overwrite mode)
df_arrivals_coalesced.write.mode('overwrite').parquet(folder_path_arrivals)
df_departures_coalesced.write.mode('overwrite').parquet(folder_path_departures)

# Rename the written part file of each dataset; the listings are kept under
# the same names as before so later cells can still inspect them.
files_arrivals = _rename_part_file(folder_path_arrivals, corrected_file_path_arrivals)
files_departures = _rename_part_file(folder_path_departures, corrected_file_path_departures)
