# TSA passenger checkpoint throughput

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import datetime as dt
import numpy as np

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

In [4]:
today = pd.to_datetime("today")

In [5]:
url = "https://www.tsa.gov/coronavirus/passenger-throughput"

In [6]:
header = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
  Chrome/50.0.2661.75 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

In [7]:
r = requests.get(url, headers=header)

In [8]:
tsa_dfs = pd.read_html(r.text)
tsa_tables = pd.DataFrame(tsa_dfs[0])

In [9]:
tsa_tables.head()

Unnamed: 0,Date,2022,2021,2020,2019
0,1/9/2022,1693518.0,886536,2183734,1955200
1,1/8/2022,1449698.0,709444,1687974,1739642
2,1/7/2022,1502714.0,772471,2072543,1733739
3,1/6/2022,1533544.0,771734,2034472,2044043
4,1/5/2022,1493235.0,665855,1815040,2229391


In [10]:
tsa_tables.rename(
    columns={
        "Date": "date",
        "2022": "2021 Traveler Throughput",
        "2021": "2021 Traveler Throughput",
        "2020": "2020 Traveler Throughput",
        "2019": "2019 Traveler Throughput",
    },
    inplace=True,
)
tsa_tables.columns = ["date", "2022", "2021", "2020", "2019"]

In [11]:
tsa_tables.head()

Unnamed: 0,date,2022,2021,2020,2019
0,1/9/2022,1693518.0,886536,2183734,1955200
1,1/8/2022,1449698.0,709444,1687974,1739642
2,1/7/2022,1502714.0,772471,2072543,1733739
3,1/6/2022,1533544.0,771734,2034472,2044043
4,1/5/2022,1493235.0,665855,1815040,2229391


In [12]:
tsa_tables.tail()

Unnamed: 0,date,2022,2021,2020,2019
359,1/15/2021,,903039,2347075,1605758
360,1/14/2021,,803688,2242656,1886642
361,1/13/2021,,567401,1876782,1970450
362,1/12/2021,,520117,1691205,1604862
363,1/11/2021,,708177,1992453,1959788


In [13]:
tsa_tables = tsa_tables.iloc[1:].copy()

### Include previously collected data

In [14]:
archive = pd.read_csv("data/raw/tsa_tables_before_pandemic.csv", parse_dates=["date"])

In [15]:
archive.drop(["Unnamed: 0"], axis=1, inplace=True)

In [16]:
df = pd.concat([tsa_tables, archive])

---

In [17]:
df["date"] = pd.to_datetime(df["date"])

In [18]:
df["month_day"] = df["date"].dt.strftime("%m-%d")

In [19]:
df.head(10)

Unnamed: 0,date,2022,2021,2020,2019,month_day
1,2022-01-08,1449698.0,709444.0,1687974,1739642,01-08
2,2022-01-07,1502714.0,772471.0,2072543,1733739,01-07
3,2022-01-06,1533544.0,771734.0,2034472,2044043,01-06
4,2022-01-05,1493235.0,665855.0,1815040,2229391,01-05
5,2022-01-04,1666715.0,766594.0,1806480,1975947,01-04
6,2022-01-03,1916499.0,1080346.0,2210542,2150571,01-03
7,2022-01-02,2023309.0,1327289.0,2422272,2202111,01-02
8,2022-01-01,1616316.0,1192881.0,2178656,2345103,01-01
9,2021-12-31,,1650795.0,805990,2311732,12-31
10,2021-12-30,,2049604.0,874406,2392331,12-30


In [20]:
tsa_tables_melt = pd.melt(
    df,
    id_vars=["month_day"],
    value_vars=["2021", "2020", "2019"],
    var_name="year",
    value_name="travelers",
)

In [21]:
tsa_tables_melt.head()

Unnamed: 0,month_day,year,travelers
0,01-08,2021,709444.0
1,01-07,2021,772471.0
2,01-06,2021,771734.0
3,01-05,2021,665855.0
4,01-04,2021,766594.0


In [22]:
tsa_tables_melt.tail()

Unnamed: 0,month_day,year,travelers
1258,03-05,2019,2402692.0
1259,03-04,2019,2143619.0
1260,03-03,2019,1979558.0
1261,03-02,2019,2257920.0
1262,03-01,2019,2301439.0


In [23]:
tsa_tables_melt

Unnamed: 0,month_day,year,travelers
0,01-08,2021,709444.0
1,01-07,2021,772471.0
2,01-06,2021,771734.0
3,01-05,2021,665855.0
4,01-04,2021,766594.0
...,...,...,...
1258,03-05,2019,2402692.0
1259,03-04,2019,2143619.0
1260,03-03,2019,1979558.0
1261,03-02,2019,2257920.0


In [24]:
df.to_csv("data/processed/tsa_passenger_throughput.csv", index=False)