In [1]:
from pathlib import Path

import pandas as pd

def import_file(file_name, file_type, data_dir="../data", **read_csv_kwargs):
    """
    Imports a CSV file into a pandas DataFrame.

    The file is read from ``<data_dir>/<file_type>/<file_name>``.

    Args:
        file_name (str): The name of the CSV file to import.
        file_type (str): The type of data to import (raw, processed, etc.);
            used as the sub-directory of ``data_dir`` that holds the file.
        data_dir (str or os.PathLike): Root directory of the data folders.
            Defaults to "../data", which is interpreted relative to the
            current working directory.
        **read_csv_kwargs: Extra keyword arguments forwarded unchanged to
            ``pd.read_csv`` (for example ``na_values=["\\\\N"]``).

    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV file.

    Any exception raised by ``pd.read_csv`` propagates to the caller.
    """
    # Build the path from its parts instead of concatenating strings so the
    # separators are handled by pathlib.
    csv_path = Path(data_dir) / file_type / file_name
    return pd.read_csv(csv_path, **read_csv_kwargs)

# Load the raw airports table
airports_df = import_file(file_name="airports.csv", file_type="raw")


In [2]:
# Keep only the airports whose country is "United States", ordered by IATA code
us_airports_df = (
    airports_df
    .loc[airports_df["country"].eq("United States")]
    .sort_values("iata")
)
us_airports_df


Unnamed: 0,id,name,city,country,iata,icao,lat,lon,alt,tz,dst,timezone,type,source
6042,8252,Apalachicola Regional Airport,Apalachicola,United States,AAF,KAAF,29.727501,-85.027496,20,-5,A,America/New_York,airport,OurAirports
6512,9225,Andrau Airpark,Houston,United States,AAP,KAAP,29.722500,-95.588303,79,-6,A,America/Chicago,airport,OurAirports
4085,4355,Lehigh Valley International Airport,Allentown,United States,ABE,KABE,40.652100,-75.440804,393,-5,A,America/New_York,airport,OurAirports
3518,3718,Abilene Regional Airport,Abilene,United States,ABI,KABI,32.411301,-99.681900,1791,-6,A,America/Chicago,airport,OurAirports
5415,7177,Ambler Airport,Ambler,United States,ABL,PAFM,67.106300,-157.856989,334,-9,A,America/Anchorage,airport,OurAirports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6399,8931,Longview Ranch Airport,Longview,United States,\N,OG39,44.661701,-119.652000,2080,-8,A,America/Los_Angeles,airport,OurAirports
6398,8928,Windom Municipal Airport,Windom,United States,\N,KMWM,43.913399,-95.109398,1410,-6,A,America/Chicago,airport,OurAirports
3457,3657,Emanuel County Airport,Santa Barbara,United States,\N,KSBO,32.609100,-82.369904,327,-5,A,America/New_York,airport,OurAirports
6574,9409,Desert Aire Regional Airport,Mattawa,United States,\N,KM94,46.687401,-119.920998,586,-8,A,America/Los_Angeles,airport,OurAirports


In [3]:
# Keep only the identifier, location and altitude columns; every other column is dropped
us_airports_df = us_airports_df.loc[
    :, ["id", "name", "city", "country", "iata", "lat", "lon", "alt"]
]
us_airports_df


Unnamed: 0,id,name,city,country,iata,lat,lon,alt
6042,8252,Apalachicola Regional Airport,Apalachicola,United States,AAF,29.727501,-85.027496,20
6512,9225,Andrau Airpark,Houston,United States,AAP,29.722500,-95.588303,79
4085,4355,Lehigh Valley International Airport,Allentown,United States,ABE,40.652100,-75.440804,393
3518,3718,Abilene Regional Airport,Abilene,United States,ABI,32.411301,-99.681900,1791
5415,7177,Ambler Airport,Ambler,United States,ABL,67.106300,-157.856989,334
...,...,...,...,...,...,...,...,...
6399,8931,Longview Ranch Airport,Longview,United States,\N,44.661701,-119.652000,2080
6398,8928,Windom Municipal Airport,Windom,United States,\N,43.913399,-95.109398,1410
3457,3657,Emanuel County Airport,Santa Barbara,United States,\N,32.609100,-82.369904,327
6574,9409,Desert Aire Regional Airport,Mattawa,United States,\N,46.687401,-119.920998,586


In [4]:
# Print the column dtypes and non-null counts of the US airports frame
us_airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1512 entries, 6042 to 6205
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       1512 non-null   int64  
 1   name     1512 non-null   object 
 2   city     1512 non-null   object 
 3   country  1512 non-null   object 
 4   iata     1512 non-null   object 
 5   lat      1512 non-null   float64
 6   lon      1512 non-null   float64
 7   alt      1512 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 106.3+ KB


In [5]:
# The dataset marks missing values with the literal string \N;
# swap every occurrence for pd.NA so pandas recognises it as missing.
us_airports_df = us_airports_df.replace(to_replace='\\N', value=pd.NA)

# Number of missing values per column
us_airports_df.isna().sum()

id           0
name         0
city         0
country      0
iata       261
lat          0
lon          0
alt          0
dtype: int64

In [6]:
# Drop every row that has at least one missing value, then confirm none remain
us_airports_df = us_airports_df.dropna(axis=0, how="any")
us_airports_df.isna().sum()

id         0
name       0
city       0
country    0
iata       0
lat        0
lon        0
alt        0
dtype: int64

In [7]:
# Re-check the row count and dtypes after the rows with missing values were dropped
us_airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1251 entries, 6042 to 7105
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       1251 non-null   int64  
 1   name     1251 non-null   object 
 2   city     1251 non-null   object 
 3   country  1251 non-null   object 
 4   iata     1251 non-null   object 
 5   lat      1251 non-null   float64
 6   lon      1251 non-null   float64
 7   alt      1251 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 88.0+ KB


In [10]:
# Load the raw airline delay records, ordered by year, month, carrier and airport
delay_df = (
    import_file("Airline_Delay_Cause.csv", "raw")
    .sort_values(by=["year", "month", "carrier", "airport"])
)
delay_df.head()

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
405358,2003,6,AA,American Airlines Inc.,ABQ,"Albuquerque, NM: Albuquerque International Sun...",307.0,56.0,14.68,10.79,...,1.48,9.96,1.0,1.0,2530.0,510.0,621.0,676.0,25.0,698.0
405359,2003,6,AA,American Airlines Inc.,ANC,"Anchorage, AK: Ted Stevens Anchorage Internati...",90.0,27.0,7.09,2.0,...,0.0,7.16,0.0,0.0,1390.0,271.0,83.0,581.0,0.0,455.0
405360,2003,6,AA,American Airlines Inc.,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",752.0,186.0,33.99,27.82,...,1.9,17.53,5.0,0.0,8314.0,1367.0,1722.0,3817.0,139.0,1269.0
405361,2003,6,AA,American Airlines Inc.,AUS,"Austin, TX: Austin - Bergstrom International",842.0,174.0,60.24,20.54,...,4.69,40.75,9.0,1.0,8344.0,3040.0,1032.0,1835.0,115.0,2322.0
405362,2003,6,AA,American Airlines Inc.,BDL,"Hartford, CT: Bradley International",383.0,55.0,14.9,8.91,...,0.0,16.61,0.0,0.0,3137.0,815.0,574.0,555.0,0.0,1193.0


In [9]:
# Keep the US airports whose IATA code appears in the delay data's "airport" column
airports_late = us_airports_df.loc[
    us_airports_df["iata"].isin(delay_df["airport"])
]
airports_late

Unnamed: 0,id,name,city,country,iata,lat,lon,alt
4085,4355,Lehigh Valley International Airport,Allentown,United States,ABE,40.652100,-75.440804,393
3518,3718,Abilene Regional Airport,Abilene,United States,ABI,32.411301,-99.681900,1791
3805,4019,Albuquerque International Sunport,Albuquerque,United States,ABQ,35.040199,-106.609001,5355
4406,5714,Aberdeen Regional Airport,Aberdeen,United States,ABR,45.449100,-98.421799,1302
4407,5715,Southwest Georgia Regional Airport,Albany,United States,ABY,31.535500,-84.194504,197
...,...,...,...,...,...,...,...,...
4086,4356,Northwest Arkansas Regional Airport,Bentonville,United States,XNA,36.281898,-94.306801,1287
5183,6803,Yakutat Airport,Yakutat,United States,YAK,59.503300,-139.660004,33
4470,5779,Yakima Air Terminal McAllister Field,Yakima,United States,YKM,46.568199,-120.543999,1099
3681,3881,Youngstown Warren Regional Airport,Youngstown,United States,YNG,41.260700,-80.679100,1192
