# Data from HTML tables: U.S. births

In [10]:
%load_ext lab_black
import pandas as pd
import altair as alt

#### Get url

In [11]:
# Page on ssa.gov whose HTML table of U.S. births per year is parsed below
url = "https://www.ssa.gov/oact/babynames/numberUSbirths.html"

#### Read it into a dataframe

In [12]:
# read_html returns one DataFrame per <table> found on the page; keep the first
ssa_tables = pd.read_html(url)
births = ssa_tables[0]
births

Unnamed: 0,Year of birth,Male,Female,Total
0,1880,118399,97606,216005
1,1881,108280,98855,207135
2,1882,122031,115694,237725
3,1883,112475,120059,232534
4,1884,122738,137585,260323
...,...,...,...,...
136,2016,2022858,1933195,3956053
137,2017,1975558,1885970,3861528
138,2018,1939284,1855454,3794738
139,2019,1914661,1828242,3742903


#### Clean up columns

In [15]:
# Shorten the headers to lower-case, single-word names.
# Assign the returned frame instead of mutating with inplace=True, so the
# step reads as an explicit transformation and can be chained.
births = births.rename(
    columns={
        "Year of birth": "year",
        "Male": "male",
        "Female": "female",
        "Total": "total",
    }
)

In [16]:
# Peek at the first five rows to confirm the new column names
births.head(5)

Unnamed: 0,year,male,female,total
0,1880,118399,97606,216005
1,1881,108280,98855,207135
2,1882,122031,115694,237725
3,1883,112475,120059,232534
4,1884,122738,137585,260323


#### Convert year

In [18]:
# Inspect column dtypes and non-null counts before converting `year`
births.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   year    141 non-null    int64
 1   male    141 non-null    int64
 2   female  141 non-null    int64
 3   total   141 non-null    int64
dtypes: int64(4)
memory usage: 4.5 KB


In [21]:
# Cast `year` to string; the chart below encodes it as temporal ("year:T").
# NOTE(review): presumably the string form lets the year be parsed as a date
# rather than read as a numeric timestamp — confirm.
births = births.astype({"year": str})
births.dtypes

year      object
male       int64
female     int64
total      int64
dtype: object

#### Chart it!

In [25]:
# Orange bar chart of total births per year. "year:T" puts the year on a
# temporal axis; chart and axis titles are set explicitly so the figure can
# be understood without reading the surrounding cells.
alt.Chart(births).mark_bar(color="orange").encode(
    x=alt.X("year:T", title="Year of birth"),
    y=alt.Y("total", title="Total births"),
).properties(width=650, title="U.S. births by year")

In [3]:
# Export is disabled; uncomment the line below to write the births table to CSV.
# NOTE(review): this path climbs two directories (../../data) while the wiki
# export at the end of the notebook uses ../data — confirm which is intended.
# births.to_csv('../../data/processed/ssa_births_1880_2020.csv', index=False)

---

## Get data from Wikipedia

In [51]:
# Wikipedia article on U.S. midterm elections; its tables are parsed below.
# Used in this story:
# "https://www.grid.news/story/politics/2022/01/13/what-economic-and-political-indicators-tell-us-about-the-2022-midterms/"
url2 = "https://en.wikipedia.org/wiki/United_States_midterm_election"

In [52]:
# Parse every table on the page (first row as header) and keep the second one.
# NOTE(review): the positional index [1] is brittle if the page layout
# changes — consider selecting the table with read_html(match=...).
# The row labelled 0 is then dropped (presumably a repeated header row —
# TODO confirm). The drop is chained rather than done with inplace=True so
# the cell yields the finished frame in one expression.
wiki_df = pd.read_html(url2, header=0)[1].drop(index=0)

In [59]:
# Inspect column dtypes and non-null counts of the scraped table
wiki_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58 entries, 1 to 58
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   year             58 non-null     object
 1   president        58 non-null     object
 2   president_party  58 non-null     object
 3   house_loss       58 non-null     object
 4   senate_loss      58 non-null     object
dtypes: object(5)
memory usage: 2.7+ KB


#### Clean up the column names

#### Make them lower case

In [62]:
# Normalize headers: lower-case them, then turn spaces and slashes into
# underscores (literal replacement, not regex)
normalized_labels = wiki_df.columns.str.lower()
for separator in (" ", "/"):
    normalized_labels = normalized_labels.str.replace(separator, "_", regex=False)
wiki_df.columns = normalized_labels
wiki_df

Unnamed: 0,year,president,president_party,house_loss,senate_loss
1,1790,George Washington,None[a],+3: (37 ► 40),0: (18 ► 18)
2,1794,George Washington,None[a],-4: (51 ► 47),+3: (16 ► 19)
3,1798,John Adams,Federalist,+3: (57 ► 60),0: (22 ► 22)
4,1802,Thomas Jefferson,Democratic-Republican,+1: (38 ► 39),-6: (15 ► 9)
5,1806,Thomas Jefferson,Democratic-Republican,+2: (114 ► 116),+1: (27 ► 28)
6,1810,James Madison,Democratic-Republican,+13: (94 ► 107),0: (26 ► 26)
7,1814,James Madison,Democratic-Republican,+5: (114 ► 119),-3: (26 ► 22)
8,1818,James Monroe,Democratic-Republican,+13: (145 ► 158),+2: (28 ► 30)
9,1822,James Monroe,Democratic-Republican,+34: (155 ► 189),0: (44 ► 44)
10,1826,John Quincy Adams,Democratic-Republican[b],-9: (109 ► 100),-2: (21 ► 19)


#### Or just rename them entirely

In [57]:
# Alternative: overwrite all five headers positionally with explicit names
replacement_labels = [
    "year",
    "president",
    "president_party",
    "house_loss",
    "senate_loss",
]
wiki_df.columns = replacement_labels
wiki_df.head()

Unnamed: 0,year,president,president_party,house_loss,senate_loss
1,1790,George Washington,None[a],+3: (37 ► 40),0: (18 ► 18)
2,1794,George Washington,None[a],-4: (51 ► 47),+3: (16 ► 19)
3,1798,John Adams,Federalist,+3: (57 ► 60),0: (22 ► 22)
4,1802,Thomas Jefferson,Democratic-Republican,+1: (38 ► 39),-6: (15 ► 9)
5,1806,Thomas Jefferson,Democratic-Republican,+2: (114 ► 116),+1: (27 ► 28)


#### First five rows

In [58]:
# Show the first five rows
wiki_df.head(5)

Unnamed: 0,year,president,president_party,house_loss,senate_loss
1,1790,George Washington,None[a],+3: (37 ► 40),0: (18 ► 18)
2,1794,George Washington,None[a],-4: (51 ► 47),+3: (16 ► 19)
3,1798,John Adams,Federalist,+3: (57 ► 60),0: (22 ► 22)
4,1802,Thomas Jefferson,Democratic-Republican,+1: (38 ► 39),-6: (15 ► 9)
5,1806,Thomas Jefferson,Democratic-Republican,+2: (114 ► 116),+1: (27 ► 28)


#### Filter to midterms after 1900

In [None]:
# Left commented out: `year` is still object dtype at this point, so comparing
# it with the integer 1900 presumably fails — the next cell converts it first.
#wiki_df[wiki_df['year']>1900]

#### First, convert data type on year column

In [60]:
# Cast `year` to integer so it can be compared numerically in the filter below
wiki_df = wiki_df.astype({"year": int})

#### Now try the filter

In [61]:
# Keep only the rows whose year is strictly greater than 1900
wiki_df.loc[wiki_df["year"].gt(1900)]

Unnamed: 0,year,president,president_party,house_loss,senate_loss
29,1902,Theodore Roosevelt,Republican,+9: (201 ► 210),0: (55 ► 55)
30,1906,Theodore Roosevelt,Republican,-27: (251 ► 224),+2: (58 ► 60)
31,1910,William Howard Taft,Republican,-56: (219 ► 163),-9: (59 ► 50)
32,1914,Woodrow Wilson,Democratic,-61: (291 ► 230),+3: (50 ► 53)
33,1918,Woodrow Wilson,Democratic,-22: (214 ► 192),-4: (52 ► 48)
34,1922,Warren G. Harding,Republican,-77: (302 ► 225),-7: (60 ► 53)
35,1926,Calvin Coolidge,Republican,-9: (247 ► 238),-6: (56 ► 50)
36,1930,Herbert Hoover,Republican,-52: (270 ► 218),-6: (56 ► 50)
37,1934,Franklin D. Roosevelt,Democratic,+9: (313 ► 322),+9: (60 ► 69)
38,1938,Franklin D. Roosevelt,Democratic,-72: (334 ► 262),-7: (75 ► 68)


#### Export

In [63]:
# Write the cleaned midterm table to the processed-data folder, without the index
export_path = "../data/processed/wiki_midterm_gains_losses_party.csv"
wiki_df.to_csv(export_path, index=False)