# Data from HTML tables: U.S. births

In [1]:
import pandas as pd
import altair as alt

#### Get url

In [2]:
# Source page on ssa.gov for the births-by-year table.
url = (
    "https://www.ssa.gov/oact/babynames/"
    "numberUSbirths.html"
)

#### Read it into a dataframe

In [3]:
# read_html returns a list with one DataFrame per parsed <table>; keep the first.
births = pd.read_html(io=url)[0]

#### Clean up columns

In [4]:
# Swap the page's column headers for short lowercase names, modifying births in place.
births.rename(
    mapper={"Year of birth": "year", "Male": "male", "Female": "female", "Total": "total"},
    axis="columns",
    inplace=True,
)

In [5]:
# Preview the first five rows to confirm the renamed columns.
births.head(n=5)

Unnamed: 0,year,male,female,total
0,1880,118399,97606,216005
1,1881,108280,98855,207135
2,1882,122031,115694,237725
3,1883,112475,120059,232534
4,1884,122738,137585,260323


#### Convert year

In [6]:
# Store year as text so the chart below can parse it as a temporal ("year:T") field.
births["year"] = births["year"].astype(str)

#### Chart it!

In [7]:
# Red area chart of total births over time; the ":T" suffix marks year as temporal.
alt.Chart(births).mark_area(color='red').encode(
    alt.X('year:T'),
    alt.Y('total'),
).properties(title='Births by year', width=650)

In [8]:
# Write the cleaned table to the processed-data folder, leaving out the row index.
births.to_csv(
    path_or_buf='../../data/processed/ssa_births_1880_2020.csv',
    index=False,
)

---

## Get data from Wikipedia

In [9]:
# Used in this story: https://www.grid.news/story/politics/2022/01/13/what-economic-and-political-indicators-tell-us-about-the-2022-midterms/

In [10]:
# Parse every table on the article, using each table's first row as its header,
# and keep the second table (list index 1).
wiki_df = pd.read_html(
    io='https://en.wikipedia.org/wiki/United_States_midterm_election',
    header=0,
)[1]

#### Clean up the column names

In [11]:
# Discard the row whose index label is 0.
wiki_df = wiki_df.drop(index=0)

#### Make them lower case

In [12]:
# Replace the column labels with their lower-cased versions.
wiki_df = wiki_df.set_axis(wiki_df.columns.str.lower(), axis="columns")

#### Or just rename them entirely

In [13]:
# Assign all five column names positionally, left to right.
wiki_df.columns = [
    'year',
    'president',
    'president_party',
    'gain_loss_house',
    'gain_loss_senate',
]

#### First five rows

In [14]:
# Preview the first five rows with the new column names.
wiki_df.head(n=5)

Unnamed: 0,year,president,president_party,gain_loss_house,gain_loss_senate
1,1790,George Washington,None[a],+3: (37 ► 40),0: (18 ► 18)
2,1794,George Washington,None[a],-4: (51 ► 47),+3: (16 ► 19)
3,1798,John Adams,Federalist,+3: (57 ► 60),0: (22 ► 22)
4,1802,Thomas Jefferson,Democratic-Republican,+1: (38 ► 39),-6: (15 ► 9)
5,1806,Thomas Jefferson,Democratic-Republican,+2: (114 ► 116),+1: (27 ► 28)


#### Filter to midterm elections after 1900

In [16]:
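# Left commented out: `year` has not been converted to int yet, so this
# comparison against 1900 is presumably expected to fail at this point.
# wiki_df[wiki_df['year'] > 1900]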

#### First, convert data type on year column

In [None]:
# Cast only the year column to int so it can be compared with numbers.
wiki_df = wiki_df.astype({'year': int})

#### Now try the filter

In [None]:
# Show only the rows whose year is strictly greater than 1900.
wiki_df.loc[wiki_df['year'].gt(1900)]

#### Export

In [17]:
# Write the cleaned midterm table to the processed-data folder, leaving out the row index.
wiki_df.to_csv(
    path_or_buf='../../data/processed/wiki_midterm_gains_losses_party.csv',
    index=False,
)