# Use Pandas to convert live HTML tables to dataframes

In [1]:
import pandas as pd

### L.A. County COVID cases, deaths by community

In [2]:
url_dph = 'http://publichealth.lacounty.gov/media/Coronavirus/locations.htm'

In [3]:
page_dph = pd.read_html(url_dph, attrs={'class':'table'})

In [4]:
df_dph = pd.DataFrame(page_dph[1])

In [5]:
df_dph.columns = ['place', 'cases', 'case_rate', 'deaths', 'death_rate']

In [6]:
df_dph.head(10)

Unnamed: 0,place,cases,case_rate,deaths,death_rate
0,City of Agoura Hills,1446,6924.0,19,91.0
1,City of Alhambra,8213,9470.0,217,250.0
2,City of Arcadia,3370,5835.0,147,255.0
3,City of Artesia,2436,14504.0,80,476.0
4,City of Avalon,32,827.0,0,0.0
5,City of Azusa,7441,14870.0,130,260.0
6,City of Baldwin Park,14655,19090.0,334,435.0
7,City of Bell,7569,20833.0,126,347.0
8,City of Bell Gardens,9092,21109.0,120,279.0
9,City of Bellflower,13004,16729.0,214,275.0


In [7]:
df_dph[df_dph['cases'] > 100 ].sort_values('case_rate', ascending=False).head(10)

Unnamed: 0,place,cases,case_rate,deaths,death_rate
311,Unincorporated - Saugus,166,107097.0,0,0.0
309,Unincorporated - Santa Catalina Island,252,94382.0,3,1124.0
79,City of Vernon,141,67464.0,0,0.0
35,City of Industry,191,43707.0,5,1144.0
294,Unincorporated - Pellissier Village,191,30856.0,1,162.0
166,Los Angeles - Pacoima,19208,24952.0,244,317.0
66,City of San Fernando,6002,24386.0,71,288.0
232,Unincorporated - Athens Village,1191,24321.0,31,633.0
208,Los Angeles - Vernon Central,12304,23662.0,172,331.0
220,Los Angeles - Wholesale District*,8497,23519.0,116,321.0


---

### DOJ Jan. 6 insurrection defendants

In [8]:
url_doj = 'https://www.justice.gov/usao-dc/capitol-breach-cases'

In [9]:
page_doj = pd.read_html(url_doj, attrs={'class':'tablesaw'})

In [10]:
df_doj = pd.DataFrame(page_doj[0])

In [11]:
df_doj.columns = (
    df_doj.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
    .str.replace("/", "_", regex=False)
    .str.replace(",", "_", regex=False)
    .str.replace("*", "", regex=False)
    .str.replace("(s)", "s", regex=False)
)

In [12]:
df_doj["location_of_arrest"] = (
    df_doj["location_of_arrest"]
    .str.strip()
    .str.title()
    .str.replace(", Middle District", "", regex=False)
    .str.replace(", Southern District", "", regex=False)
    .str.replace(", Central District", "", regex=False)
    .str.replace(", Western District", "", regex=False)
    .str.replace(", Eastern District", "", regex=False)
    .str.replace(", Northern District", "", regex=False)
)

In [13]:
df_doj["location_of_arrest"] = df_doj["location_of_arrest"].fillna("Not listed")

In [14]:
df_doj[["location_of_arrest_state", "location_of_arrest_other"]] = df_doj["location_of_arrest"].str.split(", ", n=1, expand=True)

In [15]:
df_doj.drop(columns=["location_of_arrest"], inplace=True)

In [16]:
df_doj.location_of_arrest_state.value_counts().head()

Florida         62
Texas           58
Pennsylvania    57
New York        43
California      32
Name: location_of_arrest_state, dtype: int64

In [17]:
states = df_doj.groupby(['location_of_arrest_state']).agg({'case_number':'count'}).reset_index()

In [18]:
states.rename(columns={'case_number':'cases'}, inplace=True)

In [19]:
states.sort_values('cases', ascending=False).head()

Unnamed: 0,location_of_arrest_state,cases
10,Florida,62
43,Texas,58
40,Pennsylvania,56
33,New York,43
4,California,32


---

### Export both dataframes

In [20]:
df_dph.to_csv('data/la-county-covid-rates-by-community.csv', index=False)

In [21]:
df_doj.to_csv('data/doj-capitol-breach-arrests.csv', index=False)

In [22]:
states.to_csv('data/doj-capitol-breach-arrests-by-state.csv', index=False)