In [77]:
import pandas as pd

In [78]:
airports_to_filter = ('DFW','CLT','ORD','DEN','ALT','SEA','PHX','LAX','DTW','IAH','SLC','PHL','MDW','SFO','BWI')
columns_to_use = ("Year",
          "Quarter",
          "Month",
          "DayofMonth",
          "FlightDate",
          "OriginAirportID",
          "Origin",
          "DestAirportID",
          "Dest",
          "CRSDepTime",
          "DepTime",
          "DepDelayMinutes",
          "DepDel15",
          "CRSArrTime",
          "ArrTime",
          "ArrDelayMinutes",
          "ArrDel15",
          "Cancelled")

Filter the dataset using <b>usecols</b> while reading it into pandas.

In [79]:
df = pd.read_csv('../dataset/Flights_2020_5.csv', usecols=columns_to_use, index_col=False, low_memory=False)

In [80]:
df.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,FlightDate,OriginAirportID,Origin,DestAirportID,Dest,CRSDepTime,DepTime,DepDelayMinutes,DepDel15,CRSArrTime,ArrTime,ArrDelayMinutes,ArrDel15,Cancelled
0,2020,2,5,31,2020-05-31,14570,RNO,14107,PHX,1500,1454.0,0.0,0.0,1645,1636.0,0.0,0.0,0.0
1,2020,2,5,31,2020-05-31,14570,RNO,14679,SAN,810,810.0,0.0,0.0,940,937.0,0.0,0.0,0.0
2,2020,2,5,31,2020-05-31,14576,ROC,10821,BWI,1355,1350.0,0.0,0.0,1505,1454.0,0.0,0.0,0.0
3,2020,2,5,31,2020-05-31,14576,ROC,10821,BWI,1030,1028.0,0.0,0.0,1140,1130.0,0.0,0.0,0.0
4,2020,2,5,31,2020-05-31,14635,RSW,10397,ATL,1715,1708.0,0.0,0.0,1900,1845.0,0.0,0.0,0.0


In [81]:
df.shape

(192412, 18)

# Remove flights that were cancelled

In [82]:
cancelled_condition = (df['Cancelled']==1.0)
df.drop(df[cancelled_condition].index,inplace=True)

In [83]:
df.shape

(180151, 18)

# Filter Origin and Destination
We have filtered the dataset to use only the top 15 busiest airport in the United States.

In [84]:
df = df[df["Origin"].isin(airports_to_filter)]
df = df[df["Dest"].isin(airports_to_filter)]

In [85]:
df.shape

(18064, 18)

# Function 

In [86]:
def eda(csv_file):
    df = pd.read_csv('./Flights_2021/'+csv_file+'.csv', usecols=columns_to_use, index_col=False, low_memory=False)
    # Remove cancelled flights
    cancelled_condition = (df['Cancelled']==1.0)
    df.drop(df[cancelled_condition].index,inplace=True)
    # Filter by airport
    df = df[df["Origin"].isin(airports_to_filter)]
    df = df[df["Dest"].isin(airports_to_filter)]
    df.drop(['Cancelled'], axis=1, inplace=True)
    df.to_csv('./Flights_2021-eda1/'+csv_file+'.csv')
    return csv_file

In [87]:
files = [f'Flights_2021_{i}' for i in range(1, 13)]
print(files)

['Flights_2021_1', 'Flights_2021_2', 'Flights_2021_3', 'Flights_2021_4', 'Flights_2021_5', 'Flights_2021_6', 'Flights_2021_7', 'Flights_2021_8', 'Flights_2021_9', 'Flights_2021_10', 'Flights_2021_11', 'Flights_2021_12']


In [88]:
for f in files:
    eda(f)

# Merge all csv to one

In [89]:
file_paths = [f'./Flights_2021-eda1/Flights_2021_{i}.csv' for i in range(1,13)]
print(file_paths)

['./Flights_2021-eda1/Flights_2021_1.csv', './Flights_2021-eda1/Flights_2021_2.csv', './Flights_2021-eda1/Flights_2021_3.csv', './Flights_2021-eda1/Flights_2021_4.csv', './Flights_2021-eda1/Flights_2021_5.csv', './Flights_2021-eda1/Flights_2021_6.csv', './Flights_2021-eda1/Flights_2021_7.csv', './Flights_2021-eda1/Flights_2021_8.csv', './Flights_2021-eda1/Flights_2021_9.csv', './Flights_2021-eda1/Flights_2021_10.csv', './Flights_2021-eda1/Flights_2021_11.csv', './Flights_2021-eda1/Flights_2021_12.csv']


In [90]:
_df = [pd.read_csv(f) for f in file_paths]

In [91]:
merged_df = pd.concat(_df, ignore_index=True)

In [92]:
merged_df.shape

(538051, 18)

In [98]:
merged_df.columns

Index(['Unnamed: 0', 'Year', 'Quarter', 'Month', 'DayofMonth', 'FlightDate',
       'OriginAirportID', 'Origin', 'DestAirportID', 'Dest', 'CRSDepTime',
       'DepTime', 'DepDelayMinutes', 'DepDel15', 'CRSArrTime', 'ArrTime',
       'ArrDelayMinutes', 'ArrDel15'],
      dtype='object')

In [99]:
merged_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [100]:
merged_df.shape

(538051, 17)

In [101]:
merged_df.to_csv('flights_2021_all.csv')