In [20]:
from pathlib import Path

import pandas as pd

def import_file(file_name, file_type, data_dir="../data", **read_csv_kwargs):
    """
    Imports a CSV file into a pandas DataFrame.

    The file is read from ``<data_dir>/<file_type>/<file_name>``.

    Args:
        file_name (str): The name of the CSV file to import.
        file_type (str): The type of data to import (raw, processed, etc.);
            used as the sub-directory name under ``data_dir``.
        data_dir (str | os.PathLike): Base data directory. Defaults to
            ``"../data"``, which is resolved relative to the current
            working directory.
        **read_csv_kwargs: Extra keyword arguments forwarded unchanged to
            ``pd.read_csv`` (for example ``na_values`` or ``dtype``).

    Returns:
        pd.DataFrame: The DataFrame containing the data from the CSV file.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when the resolved
            path does not exist.
    """
    # Build the path with pathlib instead of string concatenation so that
    # separators are handled consistently and the base directory is tunable.
    csv_path = Path(data_dir) / file_type / file_name
    return pd.read_csv(csv_path, **read_csv_kwargs)

# Load the airports CSV from the "raw" data folder
airports_df = import_file(file_name="airports.csv", file_type="raw")


In [21]:
# Keep only the rows whose country is "United States", ordered by IATA code
us_airports_df = (
    airports_df
    .loc[airports_df["country"].eq("United States")]
    .sort_values("iata")
)
us_airports_df


Unnamed: 0,id,name,city,country,iata,icao,lat,lon,alt,tz,dst,timezone,type,source
6042,8252,Apalachicola Regional Airport,Apalachicola,United States,AAF,KAAF,29.727501,-85.027496,20,-5,A,America/New_York,airport,OurAirports
6512,9225,Andrau Airpark,Houston,United States,AAP,KAAP,29.722500,-95.588303,79,-6,A,America/Chicago,airport,OurAirports
4085,4355,Lehigh Valley International Airport,Allentown,United States,ABE,KABE,40.652100,-75.440804,393,-5,A,America/New_York,airport,OurAirports
3518,3718,Abilene Regional Airport,Abilene,United States,ABI,KABI,32.411301,-99.681900,1791,-6,A,America/Chicago,airport,OurAirports
5415,7177,Ambler Airport,Ambler,United States,ABL,PAFM,67.106300,-157.856989,334,-9,A,America/Anchorage,airport,OurAirports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6399,8931,Longview Ranch Airport,Longview,United States,\N,OG39,44.661701,-119.652000,2080,-8,A,America/Los_Angeles,airport,OurAirports
6398,8928,Windom Municipal Airport,Windom,United States,\N,KMWM,43.913399,-95.109398,1410,-6,A,America/Chicago,airport,OurAirports
3457,3657,Emanuel County Airport,Santa Barbara,United States,\N,KSBO,32.609100,-82.369904,327,-5,A,America/New_York,airport,OurAirports
6574,9409,Desert Aire Regional Airport,Mattawa,United States,\N,KM94,46.687401,-119.920998,586,-8,A,America/Los_Angeles,airport,OurAirports


In [22]:
# Narrow the frame to the eight columns listed below; all others are dropped
us_airports_df = us_airports_df.loc[
    :, ["id", "name", "city", "country", "iata", "lat", "lon", "alt"]
]
us_airports_df


Unnamed: 0,id,name,city,country,iata,lat,lon,alt
6042,8252,Apalachicola Regional Airport,Apalachicola,United States,AAF,29.727501,-85.027496,20
6512,9225,Andrau Airpark,Houston,United States,AAP,29.722500,-95.588303,79
4085,4355,Lehigh Valley International Airport,Allentown,United States,ABE,40.652100,-75.440804,393
3518,3718,Abilene Regional Airport,Abilene,United States,ABI,32.411301,-99.681900,1791
5415,7177,Ambler Airport,Ambler,United States,ABL,67.106300,-157.856989,334
...,...,...,...,...,...,...,...,...
6399,8931,Longview Ranch Airport,Longview,United States,\N,44.661701,-119.652000,2080
6398,8928,Windom Municipal Airport,Windom,United States,\N,43.913399,-95.109398,1410
3457,3657,Emanuel County Airport,Santa Barbara,United States,\N,32.609100,-82.369904,327
6574,9409,Desert Aire Regional Airport,Mattawa,United States,\N,46.687401,-119.920998,586


In [23]:
# Summarise the trimmed frame: row count, column dtypes and non-null counts
us_airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1512 entries, 6042 to 6205
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       1512 non-null   int64  
 1   name     1512 non-null   object 
 2   city     1512 non-null   object 
 3   country  1512 non-null   object 
 4   iata     1512 non-null   object 
 5   lat      1512 non-null   float64
 6   lon      1512 non-null   float64
 7   alt      1512 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 106.3+ KB


In [24]:
# Turn the literal "\N" placeholder strings into real missing values (pd.NA)
us_airports_df = us_airports_df.replace(to_replace="\\N", value=pd.NA)

# Per-column count of missing values now that "\N" is treated as null
us_airports_df.isna().sum()

id           0
name         0
city         0
country      0
iata       261
lat          0
lon          0
alt          0
dtype: int64

In [25]:
# Discard every row that has a missing value in any column, then re-check the counts
us_airports_df = us_airports_df.dropna(axis="index", how="any")
us_airports_df.isna().sum()

id         0
name       0
city       0
country    0
iata       0
lat        0
lon        0
alt        0
dtype: int64

In [26]:
# Re-inspect the frame after dropna() to see the remaining row count and per-column non-null counts
us_airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1251 entries, 6042 to 7105
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       1251 non-null   int64  
 1   name     1251 non-null   object 
 2   city     1251 non-null   object 
 3   country  1251 non-null   object 
 4   iata     1251 non-null   object 
 5   lat      1251 non-null   float64
 6   lon      1251 non-null   float64
 7   alt      1251 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 88.0+ KB
