In [15]:
import pandas as pd
import numpy as np

path_import = "../../../Thesis_data/raw_data/"
path_export = "../../../Thesis_data/processed_data/"

In [16]:
#Define pre-selected columns from all datasets
cols_ontime_reporting = ["MONTH", #month of departure, 1 = January, ... 12 = December
                         "DAY_OF_MONTH", #Day of month of departure
                         "DAY_OF_WEEK", #Day of week of departure, 1 = Monday, ... 7 = Sunday
                         "TAIL_NUM", #Unique tail number of aircraft
                         "ORIGIN_AIRPORT_ID", #Unique airport id, matches with ORIGIN
                         "ORIGIN", #International Air Transport Association's (IATA) Location Identifier code, unique 3 letter code matches to departure location
                         "ORIGIN_CITY_NAME", #City name with state abbreviation used to match with us_cities 'City'
                         "DEST", #International Air Transport Association's (IATA) Location Identifier code, unique 3 letter code matches to destination location
                         "DISTANCE_GROUP", #Miles between ORIGIN and DESTINATION, grouped together by integers,
                         "CRS_DEP_TIME", #4 digit military time formatting of the planned departure time
                         "DEP_DEL15", #Binary number that classifies a delay (1) as: a aircraft departing 15 minutes later than planned
                         "OP_UNIQUE_CARRIER"
                         ]

cols_aircraft_inventory = ["TAIL_NUM", #Unique tail number of aircraft
                           "MANUFACTURE_YEAR", #Manufacturing year of the plane
                           "NUMBER_OF_SEATS" #N of seats on a plane
                           ]

cols_airport_list = ["ORIGIN_AIRPORT_ID", #Unique airport id, matches with ORIGIN_AIRPORT_ID from ontime_reporting
                     "NAME" #Location of weather reading, matches with NAME from airport_weather
                     ]

cols_airport_weather = ["NAME", #Location of weather reading
                        "DATE",
                        "PRCP",
                        "SNOW",
                        "SNWD",
                        "TMAX", #Maximum temperature that day in Fahrenheit
                        "AWND" #Maximum wind speed that day in Miles per Hour
                        ]

cols_us_cities = ["City",
                  "Median Age",
                  "Total Population",
                  "Average Household Size"
                  ]

## Loading on-time reporting data for each month

In [17]:
#Loading the ontime_reporting data for each month and concatanating them on an empty DataFrame
ontime_reporting_all = pd.DataFrame() #Define empty dataframe

#Going through all the CSV files (12, for each month one) related to On-Time Airplane Reporting and concatenating them
for i in range(1,13): #13
    if i == 1:
        ontime_reporting_montly = pd.read_csv(path_import + "ONTIME_REPORTING_" + str(i) + ".csv", usecols=cols_ontime_reporting)
        ontime_reporting_all = ontime_reporting_montly
        print(ontime_reporting_all.shape)
    else:
        ontime_reporting_montly = pd.read_csv(path_import + "ONTIME_REPORTING_" + str(i) + ".csv", usecols=cols_ontime_reporting)
        ontime_reporting_all = pd.concat([ontime_reporting_all, ontime_reporting_montly])
        print(ontime_reporting_all.shape)

(583985, 12)
(1117160, 12)
(1749234, 12)
(2361257, 12)
(2997647, 12)
(3634338, 12)
(4293367, 12)
(4951828, 12)
(5557807, 12)
(6193821, 12)
(6796274, 12)
(7422037, 12)


## Loading aircraft inventory data and merging with on-time data

In [18]:
#Loading aircraft inventory list and merging it with ontime_reporting_all
aircraft_inventory_data = pd.read_csv(path_import + "B43_AIRCRAFT_INVENTORY.csv", encoding='latin1', usecols=cols_aircraft_inventory)
print("Shape of aircraft_inventory_data: ", aircraft_inventory_data.shape)
print("Shape of ontime_reporting_all before merge: ", ontime_reporting_all.shape)
ontime_reporting_all = ontime_reporting_all.merge(aircraft_inventory_data, on='TAIL_NUM', how="left")
print("Shape of ontime_reporting_all after merge: ", ontime_reporting_all.shape)

Shape of aircraft_inventory_data:  (7383, 3)
Shape of ontime_reporting_all before merge:  (7422037, 12)
Shape of ontime_reporting_all after merge:  (7441892, 14)


## Loading airport and weather data then merging them

In [19]:
#Loading airport_list and airport_weather and meging it with ontime_reporting_all
airport_list_data = pd.read_csv(path_import + "airports_list.csv", usecols=cols_airport_list)
print("Shape of airport_list_data: ", airport_list_data.shape)
airport_weather_data = pd.read_csv(path_import + "airport_weather_2019.csv", usecols=cols_airport_weather)
print("Shape of airport_weather_data: ", airport_weather_data.shape)

Shape of airport_list_data:  (97, 2)
Shape of airport_weather_data:  (38675, 7)


In [20]:
#Converting DATE to datetime dtype and extracting Month and Day for merging with ontime_reporting_all
airport_weather_data['DATE'] = pd.to_datetime(airport_weather_data['DATE'])
airport_weather_data['MONTH'] = pd.DatetimeIndex(airport_weather_data['DATE']).month
airport_weather_data['DAY_OF_MONTH'] = pd.DatetimeIndex(airport_weather_data['DATE']).day

In [21]:
#Merging airport_list with airport_weather for linking ORIGIN_AIRPORT_ID to NAME
#An left join was made because if an inner join would have been made we would have lost relevant weather data
print("Shape of airport_weather_data before merge: ", airport_weather_data.shape)
airport_weather_data = airport_list_data.merge(airport_weather_data, on="NAME", how="left")
print("Shape of airport_weather_data after merge: ", airport_weather_data.shape)

#Dropping redundant columns
airport_weather_data = airport_weather_data.drop(columns=["NAME", "DATE"])

Shape of airport_weather_data before merge:  (38675, 9)
Shape of airport_weather_data after merge:  (35025, 10)


## Merging Weather and on-time data

In [22]:
#Merging airport_weather_data with ontime_reporting_all
#An inner join has been chosen here because when weather was missing it was almost completly missing from 
print("Shape of ontime_reporting_all before merge: ", ontime_reporting_all.shape)
ontime_reporting_all = ontime_reporting_all.merge(airport_weather_data, how='inner', on=['ORIGIN_AIRPORT_ID', 'MONTH', 'DAY_OF_MONTH'])
print("Shape of ontime_reporting_all after merge: ", ontime_reporting_all.shape)

Shape of ontime_reporting_all before merge:  (7441892, 14)
Shape of ontime_reporting_all after merge:  (6708260, 19)


## Loading and merging US city and on-time data

In [23]:
#Loading US Cities data using predefined columns
us_cities_data = pd.read_csv(path_import + "us-cities-demographics-2015.csv", usecols=cols_us_cities, delimiter=";")
print("Shape of us_cities_data: ", us_cities_data.shape)

Shape of us_cities_data:  (2891, 4)


In [24]:
#Redefining US cities column names to match City with ORIGIN_CITY_NAME from on-time reporting data
us_cities_data.rename(columns = {"City":"ORIGIN_CITY_NAME", "Median Age":"MEDIAN_AGE", "Total Population":"TOT_POP", "Average Household Size":"AVG_HOUSEHOLD_SIZE"}, inplace = True)

In [25]:
#Removing duplicates as the demograhics are devided into race with a seperate count but general statistics are the same but just repeating per city
us_cities_data = us_cities_data.drop_duplicates(subset='ORIGIN_CITY_NAME')
print("Shape of us_cities_data: ", us_cities_data.shape)

Shape of us_cities_data:  (567, 4)


In [26]:
#Splitting City name and state from on-time reporting data and dropping state abbreviation
ontime_reporting_all["ORIGIN_CITY_NAME"] = ontime_reporting_all['ORIGIN_CITY_NAME'].str.split(',').str[0]

In [27]:
#Merging on-time reporting with US cities data via an inner join,
#The inner join has been chosen because it is difficult to impute the missing values for each city when doing a left join and would only generate noise and a skewed image
print("Shape of ontime_reporting_all before merge: ", ontime_reporting_all.shape)
ontime_reporting_all = ontime_reporting_all.merge(us_cities_data, how='inner', on='ORIGIN_CITY_NAME')
print("Shape of ontime_reporting_all before merge: ", ontime_reporting_all.shape)

Shape of ontime_reporting_all before merge:  (6708260, 19)
Shape of ontime_reporting_all before merge:  (6001976, 22)


## Exporting the dataset

In [28]:
#Exporting the whole dataset
ontime_reporting_all.to_csv(path_export + "ontime_reporting_export.csv", index=False)