### Ruben Abbou

## Analysis of State of the Union (SOU) similarities using TF-IDF and K-means

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import json, re, string, math
from math import log
from numpy.linalg import norm
# Load the speech records. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so decoding does not depend on the platform's default locale.
with open('speeches.json', 'r', encoding='utf-8') as f:
    speeches = json.load(f)

### (a)

In [2]:
def clean_and_split(s):
    """Normalize a speech and split it into a list of lowercase words.

    Hyphens become spaces (so hyphenated words split into their parts), all
    other ASCII punctuation is removed, and the text is split on any run of
    whitespace (spaces, tabs, CR, LF, ...).

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    list of str
        The tokens in order; an empty list when `s` contains no words.
    """
    s = s.lower().replace('-', ' ')
    s = s.translate(str.maketrans('', '', string.punctuation))
    # str.split() with no argument treats every whitespace run as one
    # separator and never yields empty tokens, so bare "\n" / "\t" cannot
    # end up glued inside a word and "" maps to [] rather than [''].
    return s.split()
# Tokenize every speech; D is the number of speeches (documents).
SOU = [clean_and_split(entry['text']) for entry in speeches]
D = len(SOU)

# Template dict: every distinct word seen in any speech, mapped to a count of 0.
sample_vec = dict.fromkeys((word for words in SOU for word in words), 0)

In [3]:
# For each speech, build a dictionary mapping every word to its count in that
# speech. `vocab` accumulates, per word, the number of speeches containing it.
vocab = dict(sample_vec)
counts = []
for words in SOU:
    per_speech = dict(sample_vec)
    for word in words:
        per_speech[word] += 1
    # each distinct word of this speech contributes exactly once to vocab
    for word in set(words):
        vocab[word] += 1
    counts.append(per_speech)

In [4]:
# Drop the 20 words found in the most speeches, and keep only the words that
# occur in at least 50 speeches (vocab maps word -> number of speeches).
top20 = sorted(vocab, key=vocab.get, reverse=True)[:20]
keys = list(vocab)
vocab = {k: vocab[k] for k in keys if vocab[k] >= 50 and k not in top20}
V = len(vocab)

In [5]:
# Build the tf-idf matrix: one row per speech, one column per vocabulary word.
# Entry (i, j) = (count of word j in speech i) * log(D / number of speeches
# containing word j).
# The ordered word list and the idf factors do not depend on the speech, so
# they are computed once up front instead of inside the double loop.
terms = list(vocab)
idf = [log(D / vocab[term]) for term in terms]
tf_idf = np.zeros((D, V))
for i in range(D):
    d = counts[i]
    for j, term in enumerate(terms):
        tf_idf[i, j] = d[term] * idf[j]

### (b)

In [6]:
# Per-speech metadata, in the same order as `speeches`.
presidents = [speech['president'] for speech in speeches]
years = [speech['year'] for speech in speeches]

In [7]:
# Containers for the pairwise similarities: S is the D x D matrix, and l is a
# flat list that collects one (score, i, j) triple per matrix entry.
S = np.zeros(shape=(D, D))
l = list()
def sim(d1, d2):
    """Cosine similarity between two vectors.

    Parameters
    ----------
    d1, d2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(d1, d2) / (||d1|| * ||d2||), or 0.0 when either vector has zero
        norm (the ratio would otherwise be 0/0, i.e. NaN with a warning).
    """
    denom = norm(d1) * norm(d2)
    if denom == 0:
        return 0.0
    return np.dot(d1, d2) / denom
# Fill S row by row and record every entry, with its coordinates, in l.
for i in range(D):
    row_vec = tf_idf[i]
    for j in range(D):
        S[i, j] = sim(row_vec, tf_idf[j])
        l.append((S[i, j], i, j))

In [8]:
# Walk the strict upper triangle (i < j) so every unordered pair of speeches
# is visited exactly once, and sort its coordinates into one of two lists
# depending on whether both speeches were given by the same president.
same_pres, diff_pres = [], []
for i in range(D - 1):
    for j in range(i + 1, D):
        bucket = same_pres if presidents[i] == presidents[j] else diff_pres
        bucket.append((i, j))

In [9]:
# Keep the (score, i, j) triples whose coordinates belong to each pair list,
# sorted by decreasing similarity score.
# The pair lists are turned into sets first: testing membership in a list for
# each of the D*D triples is a linear scan every time.
same_pairs = set(same_pres)
diff_pairs = set(diff_pres)
same_pres_sort = sorted((t for t in l if (t[1], t[2]) in same_pairs),
                        key=lambda t: t[0], reverse=True)
diff_pres_sort = sorted((t for t in l if (t[1], t[2]) in diff_pairs),
                        key=lambda t: t[0], reverse=True)

In [10]:
# Print the highest-scoring pairs of speeches given by different presidents.
# Slicing copes with fewer than 50 available pairs, and the score gets its own
# name so the module-level function `sim` is not overwritten by a float.
print("Top 50 most similar pairs of SOUs given by different Presidents:")
for rank, (score, i, j) in enumerate(diff_pres_sort[:50], start=1):
    print("%d:" % rank, presidents[i], "(%d) and" % int(years[i])
          , presidents[j], "(%d); " % int(years[j]), "Similarity score: %f" % score)

Top 50 most similar pairs of SOUs given by different Presidents:
1: Dwight D. Eisenhower (1961) and Jimmy Carter (1981);  Similarity score: 0.694960
2: Grover Cleveland (1885) and Benjamin Harrison (1889);  Similarity score: 0.676519
3: John Tyler (1844) and James K. Polk (1846);  Similarity score: 0.673768
4: Dwight D. Eisenhower (1956) and Jimmy Carter (1981);  Similarity score: 0.664240
5: William J. Clinton (1994) and Barack Obama (2010);  Similarity score: 0.661743
6: Rutherford B. Hayes (1877) and Grover Cleveland (1885);  Similarity score: 0.653579
7: Dwight D. Eisenhower (1955) and Jimmy Carter (1981);  Similarity score: 0.653078
8: John Tyler (1844) and James K. Polk (1845);  Similarity score: 0.650417
9: Andrew Jackson (1836) and Martin Van Buren (1839);  Similarity score: 0.649303
10: Theodore Roosevelt (1907) and William Howard Taft (1912);  Similarity score: 0.644658
11: William J. Clinton (1998) and George W. Bush (2004);  Similarity score: 0.642952
12: George Bush (1992)

In [11]:
# Print the highest-scoring pairs of speeches given by the same president.
# Slicing copes with fewer than 50 available pairs, and the score gets its own
# name so the module-level function `sim` is not overwritten by a float.
print("Top 50 most similar pairs of SOUs given by the same President:")
for rank, (score, i, j) in enumerate(same_pres_sort[:50], start=1):
    print("%d:" % rank, "%s:" % presidents[i], "%d and" % int(years[i])
          , "%d; " % int(years[j]), "Similarity score: %f" % score)

Top 50 most similar pairs of SOUs given by the same President:
1: Barack Obama: 2012 and 2013;  Similarity score: 0.985640
2: William McKinley: 1899 and 1900;  Similarity score: 0.754864
3: William J. Clinton: 1997 and 1998;  Similarity score: 0.751195
4: William J. Clinton: 1998 and 2000;  Similarity score: 0.750868
5: William J. Clinton: 1998 and 1999;  Similarity score: 0.748687
6: William J. Clinton: 1994 and 1995;  Similarity score: 0.742226
7: William Howard Taft: 1910 and 1912;  Similarity score: 0.741289
8: Barack Obama: 2010 and 2012;  Similarity score: 0.739157
9: Barack Obama: 2011 and 2012;  Similarity score: 0.737905
10: William J. Clinton: 1999 and 2000;  Similarity score: 0.736413
11: Barack Obama: 2010 and 2013;  Similarity score: 0.730633
12: Dwight D. Eisenhower: 1955 and 1956;  Similarity score: 0.726676
13: Barack Obama: 2011 and 2013;  Similarity score: 0.722149
14: Barack Obama: 2009 and 2010;  Similarity score: 0.720039
15: Theodore Roosevelt: 1905 and 1907;  Sim

In [12]:
# Group the similarity scores of different-president speech pairs by the
# (alphabetically ordered) pair of president names, then average each group
# and rank the pairs by decreasing mean score.
named_diff_pres = [(tuple(sorted((presidents[i], presidents[j]))), score)
                   for score, i, j in diff_pres_sort]
scores_by_pair = {}
for pair, score in named_diff_pres:
    scores_by_pair.setdefault(pair, []).append(score)
means = sorted(((pair, np.mean(scores)) for pair, scores in scores_by_pair.items()),
               key=lambda item: item[1], reverse=True)

In [13]:
# Print the president pairs with the highest mean similarity.
# Slicing copes with fewer than 25 available pairs, and the score gets its own
# name so the module-level function `sim` is not overwritten by a float.
print("Top 25 most similar pairs of Presidents:")
for rank, (pair, score) in enumerate(means[:25], start=1):
    print("%d:" % rank, "%s:" % pair[0], "and %s;" % pair[1]
          , "Similarity score: %f" % score)

Top 25 most similar pairs of Presidents:
1: Millard Fillmore: and Zachary Taylor; Similarity score: 0.560071
2: Barack Obama: and William J. Clinton; Similarity score: 0.558363
3: Benjamin Harrison: and Rutherford B. Hayes; Similarity score: 0.544537
4: Chester A. Arthur: and Rutherford B. Hayes; Similarity score: 0.540669
5: Benjamin Harrison: and Chester A. Arthur; Similarity score: 0.539790
6: Theodore Roosevelt: and William Howard Taft; Similarity score: 0.526337
7: Benjamin Harrison: and Grover Cleveland; Similarity score: 0.518406
8: Benjamin Harrison: and William Howard Taft; Similarity score: 0.516884
9: Chester A. Arthur: and William Howard Taft; Similarity score: 0.516318
10: George Bush: and William J. Clinton; Similarity score: 0.515797
11: Grover Cleveland: and Rutherford B. Hayes; Similarity score: 0.510615
12: William Howard Taft: and William McKinley; Similarity score: 0.504355
13: Andrew Jackson: and Martin Van Buren; Similarity score: 0.502953
14: Gerald R. Ford: and 

### (c)

In [16]:
# Cluster the speeches with K-means on their tf-idf vectors and list the
# members of each cluster.
nc = 12
# `n_jobs` is no longer passed: recent scikit-learn releases removed it from
# KMeans and raise a TypeError. `random_state` makes the clusters reproducible
# across runs, and `n_init` is set explicitly so the number of restarts does
# not depend on the library version's default.
model = KMeans(n_clusters=nc, max_iter=50, n_init=10, random_state=0)
sou_clust = model.fit(tf_idf)
labels = model.predict(tf_idf)
cl_len = np.zeros(nc)  # number of speeches assigned to each cluster
for i in range(nc):
    print("Cluster %d" % (i+1))
    members = np.where(labels == i)[0]
    cl_len[i] = len(members)
    for c in members:
        print(presidents[c], years[c])
print(cl_len)
print("Average cluster length = %f" % np.mean(cl_len))

Cluster 1
Grover Cleveland 1885
Grover Cleveland 1894
Grover Cleveland 1896
William McKinley 1898
William McKinley 1899
William McKinley 1900
Cluster 2
George Washington 1790
George Washington 1791
George Washington 1792
George Washington 1793
George Washington 1794
George Washington 1795
George Washington 1796
John Adams 1797
John Adams 1798
John Adams 1799
John Adams 1800
Thomas Jefferson 1801
Thomas Jefferson 1802
Thomas Jefferson 1803
Thomas Jefferson 1804
Thomas Jefferson 1805
Thomas Jefferson 1806
Thomas Jefferson 1807
Thomas Jefferson 1808
James Madison 1809
James Madison 1810
James Madison 1811
James Madison 1812
James Madison 1813
James Madison 1814
James Madison 1815
James Madison 1816
James Monroe 1817
James Monroe 1818
James Monroe 1819
James Monroe 1820
James Monroe 1821
James Monroe 1822
James Monroe 1823
James Monroe 1824
John Quincy Adams 1825
John Quincy Adams 1826
John Quincy Adams 1827
John Quincy Adams 1828
Andrew Jackson 1831
Andrew Jackson 1832
Andrew Jackson 1833

After trying several numbers of clusters, I settled on 12: although it produces 4 clusters of length 1, it still lets us see interesting trends. For instance, the last cluster contains only Barack Obama and William J. Clinton speeches, and these two are the second most similar pair of presidents found in part (b)! K-means groups the speeches by the distance between their tf-idf vectors, so each cluster collects speeches that are similar to one another, which makes them easy to compare.