# State of the Union analytics

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import glob
import altair as alt
import numpy as np

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1500

----

## List of presidents

In [4]:
pres_src = pd.read_html(
    "https://history.house.gov/Institution/Presidents-Coinciding/Presidents-Coinciding/"
)[0]

#### Slim down the number of columns and clean up strings

In [5]:
pres_df = pres_src[~pres_src["No."].isnull()][["No.", "President"]]

In [6]:
pres_df.columns = pres_df.columns.str.lower().str.replace(".", "", regex=False)
pres_df["president"] = pres_df["president"].str.strip()
pres_df["president"] = (
    pres_df["president"]
    .str.replace("\d+", "", regex=True)
    .str.replace("Van Buren", "van Buren", regex=False)
    .str.replace("James K. Polk", "James Polk", regex=False)
    .str.replace("Richard M. Nixon", "Richard Nixon", regex=False)
    .str.replace("George Bush", "George H.W. Bush", regex=False)
    .str.replace("Barack H. Obama", "Barack Obama", regex=False)
    .str.replace("Joseph R. Biden, Jr.", "Joseph R. Biden", regex=False)
    .str.replace("Herbert C. Hoover", "Herbert Hoover", regex=False)
    .str.replace("Warren G. Harding", "Warren Harding", regex=False)
    .str.replace("James Earl Carter", "Jimmy Carter", regex=False)
)
pres_df["no"] = pres_df["no"].astype(str).str.replace(".0", "", regex=False).astype(int)

#### What's left?

In [7]:
pres_df.head()

Unnamed: 0,no,president
0,1,George Washington
1,2,John Adams
2,3,Thomas Jefferson
4,4,James Madison
6,5,James Monroe


---

## SOTU analytics

In [8]:
#### http://stateoftheunion.onetwothree.net/data/documentsData.json

#### Read data

In [9]:
src = pd.read_json("data/raw/documentsData.json").T.reset_index()

#### Dates

In [10]:
src["date"] = pd.to_datetime(src["index"]).copy()
src["year"] = src["date"].dt.year

#### Prepare president column for join

In [11]:
src["author"] = src["author"].str.strip()
src.rename(columns={"author": "president"}, inplace=True)

#### Top words

In [12]:
words = []

for r in range(0, 234):
    words.append(
        pd.json_normalize(src["wordList"][r])
        .sort_values("freq", ascending=False)
        .reset_index(drop=True)
        .iloc[0, 0]
    )

#### Slim down the columns we need

In [13]:
df = src[
    [
        "date",
        "year",
        "president",
        "maxFreq",
        "numberOfSentences",
        "numberOfWords",
        "numberOfSyllables",
        "fleschKincaid",
        "wordList",
    ]
]

#### Merge the presidents list and the analytics

In [14]:
merge = pd.merge(df, pres_df, on="president", how="inner").drop_duplicates(
    subset=["date", "president"]
)

In [15]:
merge.head()

Unnamed: 0,date,year,president,maxFreq,numberOfSentences,numberOfWords,numberOfSyllables,fleschKincaid,wordList,no
0,1790-01-08,1790,George Washington,3,24,1085,1869,22.367702,"[{'word': 'aggressors', 'position': 0.36129034...",1
1,1790-12-08,1790,George Washington,3,40,1403,2288,17.332584,"[{'word': '3,000,000', 'position': 0.13114753,...",1
2,1791-10-25,1791,George Washington,7,60,2304,3937,19.549458,"[{'word': '2,500,000', 'position': 0.59071183,...",1
3,1792-11-06,1792,George Washington,8,61,2099,3492,17.460896,"[{'word': '4.5%', 'position': 0.8027632, 'freq...",1
4,1793-12-03,1793,George Washington,24,56,1965,3233,17.509274,"[{'word': '3%', 'position': 0.8122136999999999...",1


#### Top words used by all presidents

In [16]:
words = []

for r, p, d in zip(merge.index, merge.president, merge.date):
    words.append(pd.json_normalize(merge["wordList"][r]).assign(name=p, date=d))

In [17]:
words_df = pd.concat(words)

In [18]:
top_words = words_df.loc[words_df.groupby("date")["freq"].idxmax()]

In [19]:
len(top_words)

55225

In [20]:
top_words_freq = top_words.groupby(["date", "name"])["freq"].max().reset_index()

In [21]:
top_words_years = (
    pd.merge(top_words, top_words_freq, on=["name", "date", "freq"])
    .drop_duplicates(subset=["name", "date", "freq"])
    .sort_values("date")
)

In [22]:
final = pd.merge(merge, top_words_years, on=["date"])

In [23]:
final.columns

Index(['date', 'year', 'president', 'maxFreq', 'numberOfSentences',
       'numberOfWords', 'numberOfSyllables', 'fleschKincaid', 'wordList', 'no',
       'word', 'position', 'freq', 'corpFreqAsPercent', 'freqAsPercent',
       'freqAsPercentDiff', 'LLS', 'L1LLS', 'S', 'alpha', 'df', 'tfIDF',
       'name'],
      dtype='object')

In [24]:
final_df = final[
    [
        "date",
        "year",
        "president",
        "no",
        "maxFreq",
        "numberOfSentences",
        "numberOfWords",
        "numberOfSyllables",
        "fleschKincaid",
        "word",
    ]
].copy()

In [25]:
final_df["word"] = final_df["word"].str.lower()

---

## Aggregate

#### Words, sentences and syllables — by president

In [26]:
by_pres = (
    final_df.groupby(["no", "president"])
    .agg(
        {
            "numberOfSentences": "mean",
            "numberOfWords": "mean",
            "numberOfSyllables": "mean",
            "fleschKincaid": "mean",
        }
    )
    .round()
    .reset_index()
    .sort_values("no")
)

In [27]:
by_pres.head()

Unnamed: 0,no,president,numberOfSentences,numberOfWords,numberOfSyllables,fleschKincaid
0,1,George Washington,57.0,2078.0,3486.0,19.0
1,2,John Adams,48.0,1789.0,2998.0,19.0
2,3,Thomas Jefferson,67.0,2582.0,4200.0,19.0
3,4,James Madison,60.0,2706.0,4567.0,22.0
4,5,James Monroe,146.0,5279.0,8560.0,18.0


In [28]:
by_pres.tail()

Unnamed: 0,no,president,numberOfSentences,numberOfWords,numberOfSyllables,fleschKincaid
38,42,William J. Clinton,376.0,7362.0,11047.0,10.0
39,43,George W. Bush,262.0,4824.0,7563.0,10.0
40,44,Barack Obama,360.0,6518.0,9811.0,9.0
41,45,Donald J. Trump,312.0,5306.0,8335.0,10.0
42,46,Joseph R. Biden,460.0,6036.0,9095.0,7.0


---

## Chart sketches

In [30]:
# import seaborn as sns

# sns.set_theme(style="whitegrid")
# tips = sns.load_dataset("tips")
# ax = sns.swarmplot(x=tips["total_bill"], size=8)

In [31]:
alt.Chart(by_pres).mark_bar().encode(
    y=alt.Y("president", sort=alt.EncodingSortField(field="no", order="ascending")),
    x="fleschKincaid",
)

  for col_name, dtype in df.dtypes.iteritems():


#### Syllables, by year

In [32]:
alt.Chart(final_df).mark_area().encode(x="date", y="numberOfSyllables").properties(
    width=600
)

In [33]:
alt.Chart(final_df).mark_line().encode(
    x="year:O",
    y="fleschKincaid",
).properties(width=650)

In [34]:
words_counts = pd.read_csv("data/raw/words_counts.txt", sep="	")

In [35]:
selected_words = [
    "America",
    "government",
    "states",
    "strong",
    "terrorism",
    "people",
    "war",
    "jobs",
    "slavery",
    "space",
    "wages",
    "fight",
    "peace",
    "bipartisan",
    "together",
    "division",
    "united",
    "citizens",
    "military",
    "constitution",
]

In [36]:
selected_words_df = words_counts[words_counts["word"].isin(selected_words)]

In [42]:
selected_words_df

Unnamed: 0,word,corpus,01/08/1790,12/08/1790,10/25/1791,11/06/1792,12/03/1793,11/19/1794,12/08/1795,12/07/1796,11/22/1797,12/08/1798,12/03/1799,11/11/1800,12/08/1801,12/15/1802,10/17/1803,11/08/1804,12/03/1805,12/02/1806,10/27/1807,11/08/1808,11/29/1809,12/05/1810,11/05/1811,11/04/1812,12/07/1813,09/20/1814,12/05/1815,12/03/1816,12/12/1817,11/16/1818,12/07/1819,11/14/1820,12/03/1821,12/03/1822,12/02/1823,12/07/1824,12/06/1825,12/05/1826,12/04/1827,12/02/1828,12/08/1829,12/06/1830,12/06/1831,12/04/1832,12/03/1833,12/01/1834,12/07/1835,12/05/1836,12/05/1837,12/03/1838,12/02/1839,12/05/1840,12/07/1841,12/06/1842,12/06/1843,12/03/1844,12/02/1845,12/08/1846,12/07/1847,12/05/1848,12/04/1849,12/02/1850,12/02/1851,12/06/1852,12/05/1853,12/04/1854,12/31/1855,12/02/1856,12/08/1857,12/06/1858,12/19/1859,12/03/1860,12/03/1861,12/01/1862,12/08/1863,12/06/1864,12/04/1865,12/03/1866,12/03/1867,12/09/1868,12/06/1869,12/05/1870,12/04/1871,12/02/1872,12/01/1873,12/07/1874,12/07/1875,12/05/1876,12/03/1877,12/02/1878,12/01/1879,12/06/1880,12/06/1881,12/04/1882,12/04/1883,12/01/1884,12/08/1885,12/06/1886,12/06/1887,12/03/1888,12/03/1889,12/01/1890,12/09/1891,12/06/1892,12/03/1893,12/02/1894,12/07/1895,12/04/1896,12/06/1897,12/05/1898,12/05/1899,12/03/1900,12/03/1901,12/02/1902,12/07/1903,12/06/1904,12/05/1905,12/03/1906,12/03/1907,12/08/1908,12/07/1909,12/06/1910,12/05/1911,12/03/1912,12/02/1913,12/08/1914,12/07/1915,12/05/1916,12/04/1917,12/02/1918,12/02/1919,12/07/1920,12/06/1921,12/08/1922,12/06/1923,12/03/1924,12/08/1925,12/07/1926,12/06/1927,12/04/1928,12/03/1929,12/02/1930,12/08/1931,12/06/1932,01/03/1934,01/04/1935,01/03/1936,01/06/1937,01/03/1938,01/04/1939,01/03/1940,01/06/1941,01/06/1942,01/07/1943,01/11/1944,01/06/1945,01/21/1946,01/06/1947,01/07/1948,01/05/1949,01/04/1950,01/08/1951,01/09/1952,01/07/1953,02/02/1953,01/07/1954,01/06/1955,01/05/1956,01/10/1957,01/09/1958,01/09/1959,01/07/1960,01/12/1961,01/30/1961,01/11/1962,01/14/1963,01/08/1964,01/04/1965,01/12/1966,01/10/1967,01/17/1968,01/14/1969,01/22/1970,01/22/1971,01/20/1972,02/02/1973,01/30/1974,01/15/1975,01/19/1976,01/12/1977,01/19/1978,01/25/1979,01/21/1980,01/16/1981,01/26/1982,01/25/1983,01/25/1984,02/06/1985,02/04/1986,01/27/1987,01/25/1988,02/09/1989,01/31/1990,01/29/1991,01/28/1992,02/17/1993,01/25/1994,01/24/1995,01/23/1996,02/04/1997,01/27/1998,01/19/1999,01/27/2000,02/27/2001,09/20/2001,01/29/2002,01/28/2003,01/20/2004,02/02/2005,01/31/2006,01/23/2007,01/28/2008,02/24/2009,01/27/2010,01/25/2011,01/24/2012,02/12/2013,01/28/2014,01/20/2015,01/12/2016,02/28/2017,01/30/2018,02/05/2019,02/04/2020,04/28/2021,Unnamed: 237
3183,America,1600,,,,,,,,,,,6.0,,,,,,,,,,,,,,,,,,,4.0,2.0,2.0,1.0,,3.0,,2.0,7.0,1.0,5.0,,,5.0,2.0,1.0,0.0,1.0,,,2.0,2.0,1.0,,,,,4.0,0.0,0.0,3.0,3.0,,2.0,1.0,2.0,8.0,14.0,8.0,8.0,1.0,0.0,0.0,,2.0,1.0,3.0,3.0,1.0,0.0,,,1.0,1.0,2.0,,2.0,,,1.0,2.0,1.0,4.0,2.0,3.0,13.0,3.0,3.0,0.0,,1.0,3.0,3.0,1.0,1.0,0.0,,,,1.0,3.0,3.0,0.0,1.0,3.0,2.0,1.0,1.0,3.0,2.0,1.0,5.0,13.0,1.0,5.0,5.0,11.0,18.0,,2.0,1.0,29.0,7.0,3.0,,7.0,7.0,5.0,3.0,6.0,3.0,2.0,2.0,3.0,2.0,,5.0,2.0,7.0,4.0,,15.0,9.0,2.0,17.0,7.0,4.0,0.0,1.0,,2.0,,,1.0,2.0,4.0,10.0,2.0,10.0,9.0,8.0,20.0,10.0,4.0,5.0,18.0,7.0,12.0,31.0,16.0,11.0,22.0,19.0,62.0,48.0,47.0,23.0,52.0,7.0,38.0,17.0,8.0,24.0,55.0,4.0,21.0,30.0,48.0,23.0,62.0,49.0,46.0,50.0,55.0,36.0,21.0,14.0,13.0,19.0,36.0,63.0,47.0,44.0,43.0,32.0,53.0,86.0,42.0,59.0,29.0,71.0,42.0,52.0,28.0,25.0,26.0,41.0,35.0,41.0,52.0,38.0,53.0,35.0,28.0,39.0,72.0,
4700,bipartisan,114,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,1.0,,1.0,,,,1.0,1.0,2.0,,,,,,,1.0,,,,,,,,,7.0,,,2.0,,,,,,1.0,1.0,12.0,14.0,4.0,2.0,2.0,6.0,,2.0,,,,1.0,3.0,12.0,8.0,12.0,9.0,8.0,,,,,1.0,3.0,3.0,,3.0,1.0,2.0,2.0,4.0,6.0,4.0,5.0,3.0,,1.0,1.0,3.0,3.0,
6066,citizens,1741,18.0,28.0,8.0,28.0,15.0,27.0,15.0,17.0,38.0,9.0,19.0,7.0,21.0,27.0,30.0,14.0,27.0,20.0,20.0,26.0,16.0,12.0,17.0,9.0,12.0,9.0,9.0,14.0,13.0,18.0,12.0,8.0,10.0,2.0,11.0,10.0,10.0,9.0,17.0,12.0,16.0,9.0,29.0,21.0,17.0,17.0,10.0,7.0,10.0,20.0,11.0,12.0,10.0,8.0,7.0,7.0,14.0,20.0,18.0,2.0,13.0,8.0,9.0,12.0,10.0,15.0,12.0,24.0,8.0,22.0,18.0,17.0,12.0,7.0,23.0,5.0,3.0,9.0,5.0,9.0,10.0,16.0,26.0,25.0,16.0,18.0,18.0,23.0,12.0,14.0,16.0,16.0,15.0,23.0,35.0,13.0,11.0,13.0,3.0,17.0,6.0,7.0,6.0,8.0,10.0,6.0,12.0,6.0,10.0,4.0,9.0,5.0,5.0,,8.0,6.0,3.0,7.0,4.0,3.0,5.0,7.0,3.0,5.0,11.0,,7.0,,,,,,1.0,3.0,11.0,2.0,3.0,5.0,8.0,7.0,3.0,6.0,7.0,2.0,13.0,2.0,5.0,7.0,4.0,,6.0,6.0,2.0,,7.0,1.0,1.0,6.0,17.0,11.0,13.0,9.0,3.0,6.0,15.0,11.0,22.0,8.0,14.0,,10.0,3.0,1.0,3.0,7.0,9.0,12.0,2.0,3.0,5.0,2.0,4.0,,,,,3.0,,8.0,8.0,6.0,6.0,5.0,6.0,7.0,5.0,4.0,9.0,5.0,7.0,,4.0,5.0,5.0,2.0,1.0,2.0,19.0,12.0,7.0,6.0,1.0,5.0,11.0,23.0,23.0,16.0,9.0,11.0,16.0,16.0,17.0,3.0,8.0,4.0,7.0,12.0,1.0,1.0,5.0,21.0,13.0,11.0,10.0,,
7016,constitution,934,18.0,,8.0,14.0,,27.0,5.0,,9.0,,,14.0,9.0,4.0,4.0,,,10.0,4.0,3.0,,,,6.0,3.0,,6.0,20.0,4.0,6.0,,2.0,1.0,2.0,3.0,2.0,11.0,,2.0,8.0,6.0,11.0,6.0,13.0,1.0,8.0,5.0,14.0,3.0,6.0,5.0,11.0,1.0,1.0,6.0,5.0,9.0,6.0,1.0,30.0,17.0,14.0,9.0,5.0,14.0,,21.0,18.0,33.0,23.0,18.0,48.0,,4.0,6.0,5.0,34.0,26.0,38.0,18.0,6.0,2.0,3.0,,2.0,13.0,,4.0,10.0,2.0,2.0,8.0,,3.0,,5.0,2.0,0.0,1.0,6.0,4.0,0.0,2.0,2.0,,1.0,0.0,0.0,,0.0,1.0,6.0,2.0,2.0,,1.0,1.0,2.0,1.0,2.0,5.0,,0.0,0.0,2.0,,1.0,9.0,,,,3.0,3.0,3.0,2.0,2.0,4.0,7.0,4.0,3.0,0.0,2.0,1.0,,4.0,5.0,,10.0,,,3.0,3.0,,,7.0,,,4.0,,2.0,,,,,,1.0,,,2.0,,6.0,1.0,,1.0,3.0,,,,1.0,,2.0,2.0,,6.0,,,,4.0,8.0,6.0,2.0,,2.0,,,,,,2.0,23.0,2.0,,,,,,,,,2.0,1.0,,1.0,2.0,,,,1.0,1.0,3.0,3.0,3.0,,2.0,1.0,,1.0,,,1.0,1.0,1.0,,1.0,1.0,
9142,division,103,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,1.0,,,,,,,1.0,,,,,,1.0,,0.0,,,0.0,,,,,1.0,0.0,,0.0,1.0,,,,,1.0,,,,,,,,1.0,,,,,,,,1.0,,,,,,,,,,1.0,,,,,,,0.0,,,,,1.0,0.0,3.0,3.0,,0.0,2.0,1.0,,1.0,2.0,,,1.0,1.0,0.0,0.0,1.0,0.0,,2.0,1.0,,,,,,,,,,8.0,1.0,2.0,1.0,,,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,1.0,,,2.0,2.0,2.0,,,,1.0,,,1.0,,,,,,,,2.0,,,,,,,,,,,,,,,,,,2.0,,,2.0,,,,1.0,,,,1.0,,,,1.0,,,1.0,,,,5.0,,,,,,,3.0,,,,1.0,
11137,fight,208,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,0.0,,,,,,0.0,,1.0,,1.0,,,0.0,,,,,,,,0.0,,,,,1.0,,,,,,,,,,,,,,,,,,0.0,,,2.0,,,0.0,1.0,0.0,0.0,0.0,,,0.0,,,,1.0,,2.0,,,,,,,,,,,,,,,2.0,,,,,,2.0,3.0,3.0,14.0,8.0,5.0,6.0,1.0,3.0,1.0,,,4.0,16.0,2.0,,1.0,,1.0,,,,1.0,,3.0,3.0,,6.0,2.0,9.0,5.0,,,6.0,,2.0,,7.0,,,,2.0,12.0,2.0,0.0,,,,,11.0,5.0,,,,13.0,2.0,,1.0,3.0,6.0,4.0,2.0,2.0,6.0,6.0,20.0,10.0,9.0,1.0,5.0,16.0,16.0,12.0,,7.0,1.0,5.0,3.0,1.0,1.0,3.0,3.0,3.0,13.0,3.0,3.0,
12221,government,6894,36.0,21.0,30.0,14.0,15.0,48.0,20.0,31.0,9.0,27.0,53.0,21.0,18.0,4.0,22.0,33.0,13.0,6.0,20.0,7.0,32.0,45.0,30.0,18.0,27.0,14.0,19.0,41.0,33.0,54.0,63.0,46.0,36.0,21.0,39.0,29.0,14.0,25.0,25.0,13.0,55.0,45.0,36.0,45.0,54.0,55.0,63.0,46.0,62.0,55.0,38.0,74.0,69.0,59.0,71.0,48.0,53.0,50.0,38.0,38.0,44.0,55.0,35.0,38.0,35.0,36.0,49.0,43.0,38.0,57.0,41.0,57.0,37.0,9.0,34.0,33.0,62.0,56.0,49.0,51.0,37.0,38.0,49.0,62.0,41.0,41.0,45.0,45.0,37.0,37.0,37.0,76.0,84.0,62.0,86.0,51.0,57.0,57.0,34.0,64.0,37.0,25.0,56.0,28.0,47.0,47.0,52.0,44.0,89.0,54.0,46.0,57.0,26.0,15.0,39.0,32.0,34.0,34.0,33.0,28.0,57.0,80.0,37.0,47.0,39.0,28.0,28.0,4.0,15.0,21.0,33.0,62.0,35.0,29.0,70.0,46.0,43.0,47.0,52.0,57.0,42.0,42.0,40.0,64.0,31.0,48.0,31.0,73.0,87.0,66.0,24.0,12.0,5.0,8.0,26.0,9.0,43.0,28.0,21.0,61.0,31.0,4.0,13.0,14.0,40.0,60.0,49.0,31.0,33.0,12.0,26.0,7.0,32.0,11.0,4.0,9.0,,20.0,24.0,16.0,14.0,4.0,26.0,110.0,12.0,77.0,17.0,12.0,44.0,32.0,48.0,52.0,2.0,27.0,51.0,30.0,28.0,28.0,25.0,26.0,38.0,12.0,10.0,23.0,21.0,35.0,9.0,38.0,28.0,4.0,15.0,17.0,4.0,38.0,6.0,10.0,22.0,9.0,27.0,18.0,26.0,19.0,11.0,18.0,26.0,11.0,14.0,4.0,7.0,7.0,15.0,7.0,5.0,10.0,6.0,
14928,jobs,569,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.0,3.0,,,2.0,17.0,1.0,,5.0,2.0,,4.0,5.0,,,,2.0,1.0,2.0,,2.0,,,5.0,12.0,5.0,15.0,,3.0,2.0,10.0,7.0,,4.0,7.0,,7.0,12.0,22.0,17.0,21.0,12.0,8.0,5.0,13.0,14.0,8.0,18.0,2.0,5.0,8.0,4.0,5.0,13.0,16.0,34.0,26.0,13.0,12.0,10.0,13.0,5.0,6.0,4.0,,28.0,3.0,11.0,7.0,11.0,3.0,8.0,23.0,32.0,36.0,47.0,48.0,33.0,26.0,14.0,17.0,11.0,19.0,18.0,71.0,
16764,military,1088,9.0,,,,30.0,6.0,15.0,17.0,4.0,4.0,6.0,,6.0,4.0,,9.0,,3.0,8.0,3.0,5.0,16.0,8.0,9.0,12.0,4.0,22.0,5.0,4.0,,,8.0,3.0,16.0,4.0,11.0,12.0,6.0,2.0,4.0,3.0,,,1.0,2.0,2.0,3.0,3.0,3.0,6.0,,1.0,1.0,4.0,3.0,1.0,3.0,9.0,18.0,6.0,6.0,4.0,9.0,3.0,3.0,8.0,6.0,4.0,7.0,4.0,5.0,4.0,8.0,3.0,14.0,8.0,8.0,11.0,12.0,5.0,5.0,3.0,3.0,2.0,3.0,2.0,1.0,,6.0,2.0,6.0,2.0,2.0,3.0,5.0,2.0,5.0,0.0,,2.0,3.0,1.0,0.0,0.0,9.0,5.0,4.0,5.0,6.0,11.0,3.0,16.0,5.0,2.0,2.0,0.0,3.0,1.0,1.0,4.0,7.0,1.0,3.0,3.0,2.0,6.0,3.0,23.0,7.0,3.0,,3.0,,,,4.0,6.0,3.0,4.0,1.0,,2.0,3.0,,,,,,2.0,13.0,15.0,3.0,14.0,15.0,7.0,12.0,5.0,11.0,1.0,,5.0,39.0,20.0,18.0,17.0,23.0,38.0,19.0,19.0,54.0,20.0,7.0,6.0,9.0,13.0,7.0,9.0,2.0,5.0,1.0,6.0,2.0,4.0,,5.0,,13.0,14.0,10.0,4.0,6.0,9.0,37.0,8.0,15.0,7.0,6.0,4.0,,7.0,,8.0,15.0,2.0,2.0,2.0,5.0,3.0,3.0,1.0,2.0,1.0,1.0,16.0,13.0,10.0,5.0,3.0,7.0,15.0,7.0,10.0,,2.0,2.0,5.0,7.0,7.0,7.0,9.0,9.0,7.0,5.0,10.0,1.0,
18901,peace,1884,18.0,7.0,17.0,23.0,30.0,10.0,20.0,17.0,19.0,22.0,,,27.0,22.0,30.0,33.0,23.0,24.0,50.0,7.0,10.0,4.0,4.0,15.0,12.0,18.0,38.0,20.0,,2.0,4.0,14.0,5.0,10.0,6.0,5.0,12.0,11.0,11.0,5.0,10.0,1.0,9.0,8.0,6.0,2.0,1.0,2.0,5.0,9.0,4.0,8.0,6.0,11.0,6.0,19.0,7.0,14.0,29.0,11.0,2.0,7.0,7.0,6.0,4.0,9.0,7.0,7.0,2.0,5.0,7.0,9.0,7.0,4.0,3.0,6.0,13.0,8.0,9.0,11.0,11.0,9.0,9.0,15.0,3.0,4.0,4.0,2.0,11.0,5.0,5.0,13.0,7.0,19.0,2.0,5.0,3.0,2.0,,2.0,6.0,4.0,3.0,5.0,,5.0,2.0,2.0,9.0,11.0,11.0,6.0,9.0,3.0,7.0,9.0,12.0,9.0,7.0,3.0,2.0,12.0,2.0,6.0,19.0,24.0,16.0,,48.0,18.0,25.0,,8.0,6.0,8.0,15.0,7.0,8.0,12.0,12.0,6.0,4.0,3.0,4.0,4.0,2.0,31.0,25.0,23.0,21.0,40.0,24.0,8.0,26.0,23.0,37.0,11.0,19.0,21.0,14.0,25.0,19.0,42.0,9.0,7.0,18.0,23.0,20.0,24.0,52.0,16.0,30.0,25.0,15.0,13.0,27.0,12.0,13.0,28.0,12.0,20.0,26.0,35.0,4.0,20.0,29.0,52.0,4.0,4.0,34.0,15.0,52.0,64.0,7.0,15.0,19.0,48.0,21.0,22.0,7.0,20.0,12.0,7.0,21.0,4.0,4.0,6.0,8.0,15.0,13.0,9.0,10.0,10.0,13.0,,10.0,16.0,7.0,21.0,11.0,5.0,13.0,1.0,2.0,1.0,,1.0,4.0,1.0,1.0,5.0,,5.0,5.0,1.0,


In [37]:
selected_words_export = (
    selected_words_df[["word", "corpus"]]
    .sort_values("corpus", ascending=False)
    .reset_index(drop=True)
)

In [38]:
selected_words_export["word"] = selected_words_export["word"].str.title()

#### Exports

In [39]:
selected_words_export.to_csv("data/processed/selected_words_counts.csv", index=False)

In [40]:
final_df.to_csv("data/processed/sotu_analytics.csv", index=False)

In [41]:
final_df[["date", "president", "word"]].to_csv(
    "data/processed/sotu_analytics_top_word_each_year.csv", index=False
)