In [1]:
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import string
import pandas as pd


from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, FloatType
from __future__ import print_function, unicode_literals, with_statement, absolute_import, generators, nested_scopes, division
from operator import add
from pyspark.mllib.clustering import KMeans

%matplotlib inline 

In [2]:
# Local Spark
# spark session is now the entry point of spark program
# for line 8, can use local[n] for run spark locally with n cores
spark = SparkSession.builder \
    .master('local') \
    .appName('Spark Try') \
    .getOrCreate()

In [3]:
data = spark.read.json('/project/cmsc25025/sou/speeches.json')
#data.show()

# Part a
Compute the TF-IDF vectors for each SOU address. You should lower case all of the text, and remove punctuation.

For example, you could use something like this:
       s = s.lower().translate(string.maketrans("",""), string.punctuation)
You will have to make choices about the size of the term vocabulary to use—for example throwing out the 20 most common words, and words that appear fewer than, say, 50 times.

In [4]:
def trans(t):
    return t.lower().encode('utf-8').translate(string.maketrans("",""), string.punctuation).strip().split()

transRDD = data.rdd.map(lambda x:(int(x['year']), trans(x['text']), x['president'])).cache()
num_doc = len(transRDD.collect())

In [5]:
allwords = transRDD.flatMap(lambda x: [(w,1) for w in x[1]]).reduceByKey(add).map(lambda x: (x[1], x[0]))\
            .sortByKey().filter(lambda x: int(x[0])>50)
vocab = allwords.map(lambda x: x[1]).collect()[:-20]

In [6]:
def df(l):
    res = []
    for word in vocab:
        if word in l:
            res.append((word, 1))
        else:
            res.append((word, 0))
    return res

def tf_and_df(x):
    #y = x[0]
    l = x[1]
    res = []
    for word in vocab:
        c = l.count(word)
        doc_f = dfdic[word]
        tfidf = c*np.log(num_doc/doc_f)
        #res.append([y,word, tfidf])
        res.append(tfidf)
    return res
    

In [7]:
#get df
dflist = transRDD.flatMap(lambda x: df(x[1])).reduceByKey(add).collect()
dfdic = dict(dflist)

In [8]:
tfidfRDD = transRDD.map(lambda x: (x[0],x[2],tf_and_df(x))).cache()
#tfidfRDD.take(4)

In [9]:
year_col = tfidfRDD.map(lambda x: x[0]).collect()
president_col = tfidfRDD.map(lambda x: x[1]).collect()
vector_col = tfidfRDD.map(lambda x: x[2]).collect()

In [10]:
d = {"year": year_col, 'president': president_col, "vector": vector_col}
resDF = pd.DataFrame(d)
resDF[:5]

Unnamed: 0,president,vector,year
0,James Monroe,"[0.0, 0.0, 10.6234906663, 0.0, 0.0, 0.0, 0.0, ...",1821
1,William McKinley,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.682...",1897
2,Dwight D. Eisenhower,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.89417447466, ...",1960
3,Calvin Coolidge,"[0.0, 1.6138725095, 0.0, 0.0, 0.0, 2.329492545...",1923
4,James Madison,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1816


# Part b
Based on cosine similarity, find
- 50 most similar pairs of SOUs given by different Presidents.
- 50 most similar pairs of SOUs given by the same President.
- 25 most similar pairs of Presidents, averaging the cosine similarity over all pairsof their SOUs.

When you read the above speeches, do they indeed seem similar to you? (You can read the speeches in a more reader-friendly format here: http://www.presidency. ucsb.edu/sou.php) Comment on what you find, and describe what is needed to construct a better similarity measure between documents.

In [11]:
def cos_sim(a, b):
    a = np.array(a)
    b = np.array(b)
    res = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return res

def cal(resDF):
    pres_1 =[]
    y_1 = []
    pres_2 =[]
    y_2 = []
    dis = []
    
    for i in range(len(resDF)):
        p1 = resDF.iloc[i,0]
        y1 = resDF.iloc[i,2]
        a = resDF.iloc[i,1]
        for j in range(i+1, len(resDF)):
            p2 = resDF.iloc[j,0]
            y2 = resDF.iloc[j,2]
            b = resDF.iloc[j,1]
            diff = cos_sim(a,b)
            pres_1.append(p1)
            pres_2.append(p2)
            y_1.append(y1)
            y_2.append(y2)
            dis.append(diff)

    d = {"year_1": y_1, 'president_1': pres_1, "year_2": y_2, "president_2": pres_2, 'distance': dis}
    res = pd.DataFrame(d)
    return res
    

In [12]:
df_b = cal(resDF)

50 most similar pairs of SOUs given by different Presidents

In [13]:
df_b_diff = df_b.loc[df_b['president_1'] != df_b['president_2']].sort_values(by='distance')
df_b_diff[:50]

Unnamed: 0,distance,president_1,president_2,year_1,year_2
4805,0.017885,Barack Obama,John Adams,2010,1800
18667,0.01911,John Adams,Barack Obama,1800,2012
18723,0.019945,John Adams,George Bush,1800,1990
7917,0.021342,Barack Obama,John Adams,2009,1800
18651,0.022578,John Adams,Richard M. Nixon,1800,1973
4778,0.023292,Barack Obama,James Madison,2010,1809
11778,0.023416,Barack Obama,John Adams,2013,1800
5331,0.024215,George W. Bush,George Washington,2002,1791
18735,0.024422,John Adams,Jimmy Carter,1800,1981
18701,0.024645,John Adams,George W. Bush,1800,2001


50 most similar pairs of SOUs given by the same President

In [14]:
df_b_same = df_b.loc[df_b['president_1'] == df_b['president_2']].sort_values(by='distance')
df_b_same[:50]

Unnamed: 0,distance,president_1,president_2,year_1,year_2
23013,0.0937,Franklin D. Roosevelt,Franklin D. Roosevelt,1937,1942
23324,0.120969,Franklin D. Roosevelt,Franklin D. Roosevelt,1942,1934
15962,0.141709,Woodrow Wilson,Woodrow Wilson,1916,1917
8580,0.14365,Harry S Truman,Harry S Truman,1946,1951
16314,0.145113,Franklin D. Roosevelt,Franklin D. Roosevelt,1935,1942
17403,0.146047,Franklin D. Roosevelt,Franklin D. Roosevelt,1943,1934
15142,0.14943,James Madison,James Madison,1809,1814
23193,0.149505,Franklin D. Roosevelt,Franklin D. Roosevelt,1945,1934
23011,0.1508,Franklin D. Roosevelt,Franklin D. Roosevelt,1937,1945
3200,0.151224,Franklin D. Roosevelt,Franklin D. Roosevelt,1944,1937


25 most similar pairs of Presidents, averaging the cosine similarity over all pairsof their SOUs

In [15]:
df_b_p = df_b_diff.iloc[:, 0:3]
df_b_p['presidents'] = df_b_p[['president_1','president_2']].values.tolist()
df_b_p=df_b_p.groupby(df_b_p['presidents'].apply(lambda x: tuple(sorted(x))))\
                      .mean().sort_values(by = 'distance')
df_b_p[:25]

Unnamed: 0_level_0,distance
presidents,Unnamed: 1_level_1
"(Barack Obama, John Adams)",0.033015
"(George Bush, John Adams)",0.03787
"(Jimmy Carter, John Adams)",0.038405
"(John Adams, William J. Clinton)",0.042801
"(Barack Obama, James Madison)",0.042882
"(John Adams, Ronald Reagan)",0.043461
"(James Madison, Jimmy Carter)",0.045615
"(Barack Obama, George Washington)",0.045728
"(Barack Obama, James Monroe)",0.047224
"(George Washington, Jimmy Carter)",0.048001


Interpretation:
I actually find it hard to directly interpret these results by reading the addresses. Although, Obama's address in 2010 and Adams' address in 1800 are paired as the most similar speeches, I do not find them particularly similar apart from the fact that they both mention the importance of judiciary and uniting people together against other forces. (In 1800, they are agianst the UK and in 2010, they are against the financial crisis.) 

To improve the similarity measurement, I think it's important to do some preprocessing to the "words" that we've obtained. This requires better tokenization techniques. For example, plural and singular forms of nouns should be considered as the same.

# Part c
Using this vector representation, cluster the speeches using k-means.

Experiment with different number of clusters, and display the clusters obtained (in some manner that you choose). Comment on the clustering results, and whether or not the results are interpretable.

Although the SOU dataset is not very large, you should try to exploit parallelism whenever possible in order to become familiar with this paradigm.

In [16]:
train_data = tfidfRDD.map(lambda x: x[2])

In [17]:
def compare_k(k, train_data):
    clusters = KMeans.train(train_data, k, maxIterations=50, initializationMode="random")
    #centers = clusters.clusterCenters
    #avg_dis = tfidfRDD.map(lambda x: ((x[2]-centers[clusters.predict(x[2])])**2)).values().mean()
    #print("Average Eulidean distance from center is", avg_dis)
    
    res = tfidfRDD.map(lambda x: (x[0], x[1],clusters.predict(x[2]))).toDF()
    for i in range(k):
        print("\nSample speeches in Cluster", i)
        res.filter(res._3==i).show(10)

In [18]:
for i in [5,10,15]:
    compare_k(i, train_data)


Sample speeches in Cluster 0
+----+------------------+---+
|  _1|                _2| _3|
+----+------------------+---+
|1897|  William McKinley|  0|
|1886|  Grover Cleveland|  0|
|1905|Theodore Roosevelt|  0|
|1848|     James K. Polk|  0|
|1856|   Franklin Pierce|  0|
|1867|    Andrew Johnson|  0|
|1860|    James Buchanan|  0|
|1873|  Ulysses S. Grant|  0|
|1908|Theodore Roosevelt|  0|
|1907|Theodore Roosevelt|  0|
+----+------------------+---+
only showing top 10 rows


Sample speeches in Cluster 1
+----+-------------+---+
|  _1|           _2| _3|
+----+-------------+---+
|1846|James K. Polk|  1|
+----+-------------+---+


Sample speeches in Cluster 2
+----+------------+---+
|  _1|          _2| _3|
+----+------------+---+
|1981|Jimmy Carter|  2|
+----+------------+---+


Sample speeches in Cluster 3
+----+------------------+---+
|  _1|                _2| _3|
+----+------------------+---+
|2010|      Barack Obama|  3|
|2009|      Barack Obama|  3|
|1998|William J. Clinton|  3|
|1995|W

Interpreting these clusterings is much easier than interpreting similarities betweeen individual speeches! It's apparent that generally presidents in the same cluster have every similar party affilation. Though, the party system from a long time ago is very different from now, we can still tell the similary from their positions in the ideology spectrum.

Comparing between having 5, 10 and 15 clusters, I would say that 5 seems to be better than the other two in this case. Though this is a randomized process, we do observe that many presidents have their own cluster when the number of clusters is as high as 10 or 15. This indicates that 10 or 15 clusters might be too many for this dataset.