<div class="alert alert-block alert-info">This notebook extracts the US airport tables from Wikipedia, state by state, and combines them into a single table that can be saved as a CSV file. This can be useful for categorizing airports (e.g., civil, military, other; global, local, regional, etc.).</div>

<div class="alert alert-block alert-warning"> <b>Warning:</b> The airport category (APT_CAT) still needs to be mapped to standardized codes. For example, 'Other military/government airports' and 'Other government/military airports' refer to the same category but are written differently depending on the Wikipedia page. These variations should be harmonized to ensure consistency. </div>

# Import

In [None]:
import pandas as pd
import polars as pl
import requests
from urllib.parse import quote
from io import StringIO
from polars import col as d

# Code

In [None]:
## the 50 US states, spelled the way they appear in Wikipedia page titles
## (underscores instead of spaces); one whitespace-separated name per entry
states = """
    Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware
    Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas Kentucky
    Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi
    Missouri Montana Nebraska Nevada New_Hampshire New_Jersey New_Mexico
    New_York North_Carolina North_Dakota Ohio Oklahoma Oregon Pennsylvania
    Rhode_Island South_Carolina South_Dakota Tennessee Texas Utah Vermont
    Virginia Washington West_Virginia Wisconsin Wyoming
""".split()

In [None]:
## request settings shared by every state
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0"} ## pretend to be a browser to avoid request blocking
REQUEST_TIMEOUT = 30 ## seconds; requests has no default timeout, so one stalled connection would hang the whole loop

## states whose Wikipedia page title carries a disambiguation suffix
SPECIAL_PAGES = {
    "Washington": "Washington_(state)",
    "New_York": "New_York_(state)",
    "Georgia": "Georgia_(U.S._state)",
}

## Wikipedia column -> output column (the city column is handled per state, see the loop)
BASE_RENAME = {'Airport name':'APT_NAME', 'Role':'FAA_ROLE', 'IATA':'IATA_CODE', 'ICAO':'ICAO_CODE'}
OUTPUT_COLS = ['IATA_CODE', 'ICAO_CODE', 'APT_NAME', 'APT_CAT', 'FAA_ROLE', 'STATE', 'CITY_SERVED']


def wikipedia_url(state):
    """Return the URL of the Wikipedia "List of airports in <state>" page.

    States listed in SPECIAL_PAGES use their disambiguated page title as-is;
    every other state name is URL-quoted.
    """
    page = SPECIAL_PAGES.get(state, quote(state))
    return f"https://en.wikipedia.org/wiki/List_of_airports_in_{page}"


def tag_airport_categories(df, airport_col):
    """Return a copy of `df` with an APT_CAT column holding each row's airport category.

    A row is treated as a category header when `airport_col` is filled and every
    other column (STATE aside) is NaN. Each row receives the label of the closest
    header at or above it; rows located above the first header get NaN.
    """
    other_cols = [c for c in df.columns if c not in ('STATE', airport_col)]
    is_header = df[airport_col].notna() & df[other_cols].isna().all(axis=1)
    ## keep the label on header rows only, then propagate it downwards
    return df.assign(APT_CAT=df[airport_col].where(is_header).ffill())


all_airports = [] ## one Polars DataFrame per state, concatenated in a later cell

## loop for all states
for state in states:
    url = wikipedia_url(state)

    try:
        ## send HTTP GET request to Wikipedia
        response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status() ## raise an error if request fails

        ## read all HTML tables on the page with pandas (needs a file-like object)
        tables = pd.read_html(StringIO(response.text))

        ## keep only tables that have both "Airport name" and "Role": both columns
        ## are required further down, so a table with only one of them could only fail
        ## this also avoids parsing unwanted tables like banners or notices
        tables = [t for t in tables if 'Airport name' in t.columns and 'Role' in t.columns]

        if not tables:
            print(f"No airport table found for {state}")
            continue ## skip this state if no valid table is found

        ## take the first valid table and add a column with the state name
        df = tables[0].assign(STATE=state)

        ## the "Airport name" column holds both the airport names and the category
        ## header rows (weird structure from Wikipedia); look it up by name, not position
        df = tag_airport_categories(df, 'Airport name')

        ## remove rows where the first column is NaN (category headers, empty or banner rows)
        df = df[df[df.columns[0]].notna()].reset_index(drop=True)

        ## Hawaii has a different column name for the city
        city_col = 'City served, Island' if state == 'Hawaii' else 'City served'

        df_pl = (pl.from_pandas(df) ## convert Pandas DataFrame to Polars
                .rename({city_col: 'CITY_SERVED', **BASE_RENAME})
                .select(OUTPUT_COLS) ## process naming & keep certain columns
        )

        all_airports.append(df_pl) ## add processed Polars DataFrame to the list
        print(f"{state} : {len(df)} added lines")

    except Exception as e:
        ## best effort: report the failing state and carry on with the next one
        print(f"Error for {state} : {e}")




In [None]:
## the scraping loop only prints errors, so the list can be empty if every
## request failed (e.g. no network); fail with an explicit message in that case
if not all_airports:
    raise ValueError("No airport table was collected: check the errors printed by the scraping loop")

all_airports_df = pl.concat(all_airports) ## stack the per-state frames into one DataFrame
all_airports_df.head(2)

# Save CSV

In [None]:
## export disabled by default: uncomment the two lines below to write the
## combined table to "airports_us_wikipedia.csv" in the current working directory
# all_airports_df.write_csv("airports_us_wikipedia.csv")
# print("CSV saved!")

In [None]:
## distinct raw category labels found across all states
## (these still have to be mapped to standardized codes)
all_airports_df.get_column('APT_CAT').unique().to_list()