# Pandas Clean-up

In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Load the raw US airports listing from disk and preview the first five rows
airports = pd.read_csv(filepath_or_buffer='Resources/us-airports.csv')
airports.head(n=5)

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,country_name,iso_country,...,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords,score,last_updated
0,#meta +id,#meta +code,#loc +airport +type,#loc +airport +name,#geo +lat,#geo +lon,#geo +elevation +ft,#region +continent +code,#country +name,#country +code +iso2,...,#loc +municipality +name,#status +scheduled,#loc +airport +code +gps,#loc +airport +code +iata,#loc +airport +code +local,#meta +url +airport,#meta +url +wikipedia,#meta +keywords,#meta +score,#date +updated
1,3632,KLAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125,,United States,US,...,Los Angeles,1,KLAX,LAX,LAX,https://www.flylax.com/,https://en.wikipedia.org/wiki/Los_Angeles_Inte...,Tom Bradley,1335475,2023-12-21T12:31:02+00:00
2,3754,KORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,680,,United States,US,...,Chicago,1,KORD,ORD,ORD,https://www.flychicago.com/ohare/home/pages/de...,https://en.wikipedia.org/wiki/O'Hare_Internati...,"CHI, Orchard Place",1503175,2024-03-09T23:28:49+00:00
3,3622,KJFK,large_airport,John F Kennedy International Airport,40.639447,-73.779317,13,,United States,US,...,New York,1,KJFK,JFK,JFK,https://www.jfkairport.com/,https://en.wikipedia.org/wiki/John_F._Kennedy_...,"Manhattan, New York City, NYC, Idlewild, IDL, ...",1052075,2022-10-18T18:49:55+00:00
4,3384,KATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026,,United States,US,...,Atlanta,1,KATL,ATL,ATL,http://www.atlanta-airport.com/,https://en.wikipedia.org/wiki/Hartsfield–Jacks...,,2002475,2018-09-19T14:50:01+00:00


In [3]:
# Show the column labels of the raw `airports` frame
airports.columns

Index(['id', 'ident', 'type', 'name', 'latitude_deg', 'longitude_deg',
       'elevation_ft', 'continent', 'country_name', 'iso_country',
       'region_name', 'iso_region', 'local_region', 'municipality',
       'scheduled_service', 'gps_code', 'iata_code', 'local_code', 'home_link',
       'wikipedia_link', 'keywords', 'score', 'last_updated'],
      dtype='object')

In [4]:
# Narrow the raw frame to the fields of interest, held in an independent frame
wanted_columns = [
    'iata_code', 'type', 'name',
    'latitude_deg', 'longitude_deg', 'elevation_ft',
    'region_name', 'local_region',
]
airport_columns_df = airports[wanted_columns].copy()
airport_columns_df.head()

Unnamed: 0,iata_code,type,name,latitude_deg,longitude_deg,elevation_ft,region_name,local_region
0,#loc +airport +code +iata,#loc +airport +type,#loc +airport +name,#geo +lat,#geo +lon,#geo +elevation +ft,#adm1 +name,#adm1 +code +local
1,LAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125,California,CA
2,ORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,680,Illinois,IL
3,JFK,large_airport,John F Kennedy International Airport,40.639447,-73.779317,13,New York,NY
4,ATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026,Georgia,GA


In [5]:
# Row label 0 carries the '#...' tag descriptors rather than an airport record;
# remove it and renumber the remaining rows from 0. Column headers are untouched.
airports_df = (
    airport_columns_df
    .drop(index=0)
    .reset_index(drop=True)
)
airports_df.head()

Unnamed: 0,iata_code,type,name,latitude_deg,longitude_deg,elevation_ft,region_name,local_region
0,LAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125,California,CA
1,ORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,680,Illinois,IL
2,JFK,large_airport,John F Kennedy International Airport,40.639447,-73.779317,13,New York,NY
3,ATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026,Georgia,GA
4,SFO,large_airport,San Francisco International Airport,37.61899948120117,-122.375,13,California,CA


In [6]:
# Count how many rows fall under each airport category
airports_df.loc[:, 'type'].value_counts()

type
small_airport     15018
heliport           7935
closed             6484
medium_airport      806
seaplane_base       631
large_airport        67
balloonport          29
Name: count, dtype: int64

In [7]:
# Remove rows we do not want: non-airport facility types and rows lacking an IATA code.
# A missing `type` never matches the list, so such rows are not excluded by type.
excluded_types = ['balloonport', 'closed', 'seaplane_base', 'heliport']
erroneous = airports_df['type'].isin(excluded_types) | airports_df['iata_code'].isna()

# Keep everything not flagged above and renumber the rows from 0
cleaned_df = airports_df[~erroneous].reset_index(drop=True)
cleaned_df.head()

Unnamed: 0,iata_code,type,name,latitude_deg,longitude_deg,elevation_ft,region_name,local_region
0,LAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125,California,CA
1,ORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,680,Illinois,IL
2,JFK,large_airport,John F Kennedy International Airport,40.639447,-73.779317,13,New York,NY
3,ATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026,Georgia,GA
4,SFO,large_airport,San Francisco International Airport,37.61899948120117,-122.375,13,California,CA


In [8]:
# Confirm which airport categories remain after filtering
cleaned_df.loc[:, 'type'].value_counts()

type
small_airport     1047
medium_airport     767
large_airport       66
Name: count, dtype: int64

In [9]:
# Count the remaining airports per region
cleaned_df['region_name'].value_counts()

region_name
Alaska                  268
California              142
Texas                   115
Florida                  75
Michigan                 48
Washington               47
Arizona                  46
New York                 43
Wisconsin                43
Iowa                     42
Oklahoma                 41
Pennsylvania             40
Illinois                 39
Colorado                 37
Georgia                  37
Minnesota                36
Kansas                   36
Oregon                   36
Indiana                  35
Ohio                     35
North Carolina           34
Arkansas                 32
Virginia                 32
South Carolina           31
Nebraska                 31
Nevada                   29
New Mexico               29
Missouri                 28
Wyoming                  28
Utah                     28
Alabama                  27
Mississippi              27
Montana                  26
Tennessee                25
Louisiana                22
Hawaii  

In [10]:
# Inspect the last rows of the cleaned frame
cleaned_df.tail()

Unnamed: 0,iata_code,type,name,latitude_deg,longitude_deg,elevation_ft,region_name,local_region
1875,UMT,small_airport,Umiat Airport,69.37110138,-152.1360016,267,Alaska,AK
1876,UWA,small_airport,Ware Airport,42.282001495361,-72.214797973633,483,Massachusetts,MA
1877,EAN,small_airport,Phifer Airfield,42.0555,-104.929001,4776,Wyoming,WY
1878,WSM,small_airport,Wiseman Airport,67.4046020508,-150.123001099,1180,Alaska,AK
1879,LGF,small_airport,Laguna Army Airfield,32.86000061,-114.3970032,433,Arizona,AZ


In [14]:
# Write the cleaned frame to disk, leaving the row index out of the file
cleaned_df.to_csv(path_or_buf='Resources/us-airports-cleaned.csv', index=False)

In [15]:
# Keep only large airports that have an IATA code.
# The wanted type is selected directly instead of excluding every other type by
# name, so a type value missing from an exclusion list (or a missing type) cannot
# slip into the "large airports" subset.
is_large_with_iata = airports_df['type'].eq('large_airport') & airports_df['iata_code'].notna()
erroneous2 = ~is_large_with_iata  # exclusion mask, kept under its existing name
cleaned_df2 = airports_df[is_large_with_iata]
cleaned_df2['type'].value_counts()

type
large_airport    66
Name: count, dtype: int64

In [16]:
# Build a coordinates table for the large airports: code, name, latitude, longitude
coordinate_fields = ['iata_code', 'name', 'latitude_deg', 'longitude_deg']
large_airports_lat_long = cleaned_df2[coordinate_fields].copy()
large_airports_lat_long.head()

Unnamed: 0,iata_code,name,latitude_deg,longitude_deg
0,LAX,Los Angeles International Airport,33.942501,-118.407997
1,ORD,Chicago O'Hare International Airport,41.9786,-87.9048
2,JFK,John F Kennedy International Airport,40.639447,-73.779317
3,ATL,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101
4,SFO,San Francisco International Airport,37.61899948120117,-122.375


In [17]:
# Write the large-airport coordinates to disk, leaving the row index out of the file
large_airports_lat_long.to_csv(path_or_buf='Resources/large_airports_lat_long.csv', index=False)