# Preparing data for Power Bi Dashboard

In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
departures = pd.read_csv('../../data/Departure.csv',encoding='latin1',low_memory=False)

In [61]:
arrivals = pd.read_csv('../../data/Arrivals.csv', encoding='utf-8-sig', low_memory=False)

In [62]:
cancellations = pd.read_csv('../../data/Airlines_Cancellation.csv',encoding='utf-8-sig',low_memory=False)

In [63]:
departures.shape

(497696, 16)

In [6]:
arrivals.shape

(513001, 16)

In [7]:
cancellations.shape

(10370, 6)

In [15]:
departures.rename(columns={'Date (MM/DD/YYYY)':'Flight Date'},inplace =True)

In [31]:
arrivals.rename(columns={'Date (MM/DD/YYYY)':'Flight Date'},inplace =True)

In [32]:
cancellations.rename(columns={'Date (MM/DD/YYYY)':'Flight Date'},inplace =True)

In [33]:
departures['Flight Date'] = pd.to_datetime(departures['Flight Date'])

In [34]:
### Converting Date columns to datetime
departures['Flight Date'] = pd.to_datetime(departures['Flight Date'])
arrivals['Flight Date'] = pd.to_datetime(arrivals['Flight Date'])  # adjust if your column name is 'Date'
cancellations['Flight Date'] = pd.to_datetime(cancellations['Flight Date'])

In [35]:
### Function to classify COVID Phase
def covid_phase(date):
    if date < pd.Timestamp('2020-01-01'):
        return 'Pre-COVID'
    elif date <= pd.Timestamp('2021-12-31'):
        return 'During-COVID'
    else:
        return 'Post-COVID'

In [36]:
### Applying the covid_phase function to each dataframe
departures['COVID Phase'] = departures['Flight Date'].apply(covid_phase)
arrivals['COVID Phase'] = arrivals['Flight Date'].apply(covid_phase)
cancellations['COVID Phase'] = cancellations['Flight Date'].apply(covid_phase)

In [37]:
departures

Unnamed: 0,Carrier Code,Airlines,Flight Date,Flight Number,Destination Airport,Destination City,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes),Date,COVID Phase
0,AA,American Airlines,2018-01-01,469,PHL,"Philadelphia, PA",9:24:00 AM,9:35:00 AM,124,109,11,0,0,0,0,0,2018-01-01,Pre-COVID
1,AA,American Airlines,2018-01-01,602,DFW,"DallasFort Worth, TX",8:51:00 PM,9:13:00 PM,139,132,22,0,0,0,0,15,2018-01-01,Pre-COVID
2,AA,American Airlines,2018-01-01,820,PHL,"Philadelphia, PA",7:00:00 AM,6:59:00 AM,134,124,-1,0,0,0,0,0,2018-01-01,Pre-COVID
3,AA,American Airlines,2018-01-01,1065,DFW,"DallasFort Worth, TX",9:28:00 AM,9:49:00 AM,136,127,21,0,0,0,0,0,2018-01-01,Pre-COVID
4,AA,American Airlines,2018-01-01,1439,DFW,"DallasFort Worth, TX",3:20:00 PM,3:21:00 PM,134,119,1,0,0,0,0,0,2018-01-01,Pre-COVID
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497691,WN,Southwest Airlines,2024-12-31,4966,PHX,"Phoenix, AZ",5:50:00 AM,5:49:00 AM,240,258,-1,0,0,17,0,0,2024-12-31,Post-COVID
497692,WN,Southwest Airlines,2024-12-31,5185,MCI,"Kansas City, MO",8:35:00 AM,8:44:00 AM,105,100,9,0,0,0,0,0,2024-12-31,Post-COVID
497693,WN,Southwest Airlines,2024-12-31,5253,DEN,"Denver, CO",2:50:00 PM,2:47:00 PM,180,186,-3,0,0,0,0,0,2024-12-31,Post-COVID
497694,WN,Southwest Airlines,2024-12-31,5302,AUS,"Austin, TX",8:50:00 AM,8:54:00 AM,145,144,4,0,0,0,0,0,2024-12-31,Post-COVID


In [38]:
dep_agg = departures.groupby(['Airlines','Flight Date','COVID Phase']).agg({
    'Departure delay (Minutes)':'mean',
    'Delay Carrier (Minutes)':'sum',
    'Delay Weather (Minutes)':'sum',
    'Delay National Aviation System (Minutes)':'sum',
    'Delay Security (Minutes)':'sum',
    'Delay Late Aircraft Arrival (Minutes)':'sum',
    'Flight Number':'count'   # Total depatures flights
}).reset_index().rename(columns={'Flight Number':'Total Flights'})

In [57]:
arr_agg = arrivals.groupby(['Airlines','Flight Date','COVID Phase']).agg({
    'Arrival Delay (Minutes)':'mean',
    'Delay Carrier (Minutes)':'sum',
    'Delay Weather (Minutes)':'sum',
    'Delay National Aviation System (Minutes)':'sum',
    'Delay Security (Minutes)':'sum',
    'Delay Late Aircraft Arrival (Minutes)':'sum',
    'Flight Number':'count' ## Total Arrivals
}).reset_index().rename(columns={'Flight Number':'Total Flights'})

In [58]:
cancel_agg = cancellations.groupby(['Airlines','Flight Date','COVID Phase']).agg({
    'Flight_Number':'count'   # Number of cancelled flights
}).reset_index().rename(columns={'Flight_Number':'Cancelled Flights'})

In [41]:
cancellations

Unnamed: 0,Carrier Code,Airlines,Flight Date,Flight_Number,Destination Airport,Destination City,Date,COVID Phase
0,AA,American Airlines,2018-01-04,469,PHL,"Philadelphia, PA",2018-01-04,Pre-COVID
1,AA,American Airlines,2018-01-08,1899,PHL,"Philadelphia, PA",2018-01-08,Pre-COVID
2,AA,American Airlines,2018-01-11,1439,DFW,"Dallas–Fort Worth, TX",2018-01-11,Pre-COVID
3,AA,American Airlines,2018-01-17,1694,CLT,"Charlotte, NC",2018-01-17,Pre-COVID
4,AA,American Airlines,2018-01-17,1736,CLT,"Charlotte, NC",2018-01-17,Pre-COVID
...,...,...,...,...,...,...,...,...
10365,WN,Southwest Airlines,2025-07-31,1131,LAS,"Las Vegas, NV",2025-07-31,Post-COVID
10366,WN,Southwest Airlines,2025-07-31,1139,LGA,"New York, NY",2025-07-31,Post-COVID
10367,WN,Southwest Airlines,2025-07-31,1143,LGA,"New York, NY",2025-07-31,Post-COVID
10368,WN,Southwest Airlines,2025-07-31,1166,MCO,"Orlando, FL",2025-07-31,Post-COVID


In [50]:
dep_cancel = dep_agg.merge(cancel_agg, on=['Airlines','Flight Date','COVID Phase'], how='left')
dep_cancel['Cancelled Flights'] = dep_cancel['Cancelled Flights'].fillna(0)
dep_cancel['Cancellation Rate (%)'] = dep_cancel['Cancelled Flights'] / dep_cancel['Total Flights'] * 100

In [53]:
merged = dep_cancel.merge(arr_agg, on=['Airlines','Flight Date','COVID Phase'], how='left', suffixes=('_Dep','_Arr'))

In [54]:
merged

Unnamed: 0,Airlines,Flight Date,COVID Phase,Departure delay (Minutes),Delay Carrier (Minutes)_Dep,Delay Weather (Minutes)_Dep,Delay National Aviation System (Minutes)_Dep,Delay Security (Minutes)_Dep,Delay Late Aircraft Arrival (Minutes)_Dep,Total Flights_Dep,Cancelled Flights,Cancellation Rate (%),Arrival Delay (Minutes),Delay Carrier (Minutes)_Arr,Delay Weather (Minutes)_Arr,Delay National Aviation System (Minutes)_Arr,Delay Security (Minutes)_Arr,Delay Late Aircraft Arrival (Minutes)_Arr,Total Flights_Arr
0,Alaska Airlines,2018-01-01,Pre-COVID,-1.000000,0,0,0,0,0,1,0.0,0.000000,-21.000000,0.0,0.0,0.0,0.0,0.0,1.0
1,Alaska Airlines,2018-01-02,Pre-COVID,-4.000000,0,0,0,0,0,1,0.0,0.000000,-1.000000,0.0,0.0,0.0,0.0,0.0,1.0
2,Alaska Airlines,2018-01-03,Pre-COVID,-2.000000,0,0,0,0,0,1,0.0,0.000000,-16.000000,0.0,0.0,0.0,0.0,0.0,1.0
3,Alaska Airlines,2018-01-04,Pre-COVID,1.000000,0,0,0,0,0,1,0.0,0.000000,-1.000000,0.0,0.0,0.0,0.0,0.0,1.0
4,Alaska Airlines,2018-01-05,Pre-COVID,5.000000,0,0,0,0,0,1,0.0,0.000000,-33.000000,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20950,United Airlines,2025-07-27,Post-COVID,50.954545,137,0,793,0,171,22,1.0,4.545455,16.045455,2.0,70.0,46.0,0.0,367.0,22.0
20951,United Airlines,2025-07-28,Post-COVID,36.809524,85,0,524,0,109,21,1.0,4.761905,1.428571,65.0,0.0,45.0,0.0,79.0,21.0
20952,United Airlines,2025-07-29,Post-COVID,4.705882,0,0,233,0,9,17,1.0,5.882353,-2.764706,11.0,0.0,20.0,0.0,29.0,17.0
20953,United Airlines,2025-07-30,Post-COVID,15.600000,57,0,155,0,217,20,0.0,0.000000,18.850000,73.0,0.0,188.0,0.0,212.0,20.0


In [56]:
merged.to_csv('Flight_Delay_merged_powerbi.csv', index=False)

In [67]:
merged['Total Flights_Dep'].sum()

np.int64(497696)