In [1]:
import os, sys, math
from text_util import *
from collections import Counter, defaultdict

import nltk
stemmer = nltk.stem.porter.PorterStemmer()

## To calculate the normalized Term Frequency – Inverse Document Frequency for every president

In [2]:
import re
refs_pat = '^[a-z][a-z\'-]+[a-z]$'
refs_prog = re.compile(refs_pat)
NAMEWORDS = set()

In [3]:
# l2norm() function to divide a person's TF value by the l2norm of the term vector 
def l2norm(vec):
    return float(math.sqrt(sum(map(lambda (term, c): c**2, vec))))

In [4]:
# get_terms() to perform all of the data cleaning
def get_terms(s):
    # Casing and short words
    # We can deal with these by lower-casing all of the terms and filtering out the short terms.
    # We also want to remove lines that begins with '>' which is typical in emails that were 
    # forwarded or replied to
    s = s.lower()
    lines = filter(lambda line: not line.strip().startswith(">"), s.split('\n'))
    arr = '\n'.join(lines).split()
    terms = []
    for term in arr:
        if re.match(refs_pat, term) != None:
            terms.append(term)
    terms = map(lambda term: term.replace("'s",'').replace("'", '').replace(".", "").replace(",", ""), terms)
    terms = filter(lambda term: len(term) > 3, terms)
    # Stop Words
    # The email_util module defines a variable STOPWORDS that contains a list of common english 
    # stop words in lower case. We can filter out terms that are found in in this list.
    from text_util import STOPWORDS
    terms = filter(lambda term: term not in STOPWORDS, terms)
    # Remove names from the terms
    terms = filter(lambda term: term not in NAMEWORDS, terms)
    terms = filter(stemmer.stem, terms)
    return terms

In [5]:
# To calculate term frequency
folder_tf = defaultdict(Counter)

for e in TextWalker('sotu'):
    terms_in_text = get_terms(e['text'])
    folder_tf[e['folder']].update(terms_in_text)

In [6]:
# To calculate inverse document frequency
terms_per_folder = defaultdict(set)
ntexts = 0
for e in TextWalker('sotu'):
    terms_in_text = get_terms(e['text'])
    # this collects all of the terms in each folder
    terms_per_folder[e['folder']].update(terms_in_text)

In [7]:
# Each iteration retrieves the terms for a given folder, and adds them all to the counter.
allterms = Counter()
for folder, terms in terms_per_folder.iteritems():
    # this will increment the counter value for each term in `terms`
    allterms.update(terms)

In [8]:
# To normalize weights
for key in folder_tf.keys():
    tfs = folder_tf[key]
    normfactor = float(l2norm(tfs.iteritems()))
    for term in tfs.keys():
        tfs[term] /= normfactor

In [9]:
idfs = {}
# The number of keys should be the number of folders 
nfolders = len(terms_per_folder)   
for term, count in allterms.iteritems():
    idfs[term] = math.log( nfolders / (1.0 + count) )

In [10]:
# Calculate tf-idf for each folder
# key is folder name, value is a list of (term, tfidf score) pairs
tfidfs = {}
for folder, tfs in folder_tf.items():
    tfidfs[folder] = map(lambda (k, v): (k, v*idfs[k]), tfs.items())
    pass

In [11]:
# Print the top terms
f = open(r"TFIDF.txt", "w")
for folder, terms in tfidfs.items():
    print folder
    f.write(folder + '\n')
    sorted_by_count_top20 = sorted(terms, key=lambda (k, v):v, reverse=True)[:20]
    for pair in sorted_by_count_top20:
        print '\t', pair
        f.write('\t' + str(pair) + '\n')
f.close()

James Madison 4
	('british', 0.07416591794975048)
	('madison', 0.05625201403667222)
	('militia', 0.05510277214520098)
	('edicts', 0.04614189914505176)
	('savages', 0.04148516050423836)
	('enemy', 0.039558749797707744)
	('captain', 0.03813838022298889)
	('proofs', 0.03813838022298889)
	('danish', 0.035157508772920135)
	('revocation', 0.035157508772920135)
	('milan', 0.03300393629137833)
	('seminary', 0.03300393629137833)
	('orders', 0.032925600486767626)
	('regulars', 0.0323724090184379)
	('major-general', 0.0323724090184379)
	('detroit', 0.030761266096701174)
	('james', 0.02976404945306569)
	('lake', 0.028663207463346975)
	('detachments', 0.02812600701833611)
	('tribes', 0.026245092861105712)
George H.W. Bush 41
	('budget', 0.11140573155794081)
	('kids', 0.08677762873729751)
	('help', 0.08226387754137214)
	('tell', 0.07432912157693332)
	('billion', 0.06828093224518954)
	('saddam', 0.06391746679567961)
	('economic', 0.060019455684755044)
	('environmental', 0.05682299342475569)
	('soviet

## To calculate the Cosine Similarity between every president by their texts

In [12]:
from math import *

# To calculate folder cosine similarity
def cal_similarity(folder1_tfidfs, folder2_tfidfs):
    # compute the similarity between the two arguments
    folder1_score = dict(folder1_tfidfs)
    folder2_score = dict(folder2_tfidfs)
    
    numerator = 0.0
    for key, valus in folder1_score.items():
        dotscore = folder1_score[key]*folder2_score.get(key, 0.0)
        numerator += dotscore
    # compute the l2 norm of each vector
    folder1_norm = math.sqrt(sum([score**2 for score in folder1_score.values()]))
    folder2_norm = math.sqrt(sum([score**2 for score in folder2_score.values()]))
    denominator = folder1_norm * folder2_norm + 1.0
    
    similarity = numerator/denominator
    return (similarity)   

In [13]:
def sort_by_count(key_value, top_n):
    sorted_by_count_topn = sorted(key_value, key=lambda (k, v):v, reverse=True)[:top_n]
    return (sorted_by_count_topn)

In [14]:
num_of_folders = len(tfidfs.keys())
folder_similarity = dict()

for i in range(0, num_of_folders-1):
    for j in range(i+1, num_of_folders):
        folder1 = tfidfs.keys()[i]
        folder2 = tfidfs.keys()[j]
        similarity = cal_similarity(sort_by_count(tfidfs[folder1], 100), sort_by_count(tfidfs[folder2], 100))
        key = '%s and %s' % (folder1, folder2)
        folder_similarity[key] = similarity        
        #print('The similarity between President [%s] and [%s] is %.2f' % (folder1, folder2, similarity))

sorted_similarity = sort_by_count(folder_similarity.items(), len(folder_similarity))
f = open(r"Cosine.txt", "w")
f.write('The sorted President cosine similarities are:' + '\n')
print('The sorted President cosine similarities are:')
for k,v in sorted_similarity:
    print('%s: %s' % (k, v))
    f.write('%s: %s' % (k, v) + '\n')
f.close()

The sorted President cosine similarities are:
Barack Obama 44 and William J. Clinton 42: 0.0676334806187
Ronald Reagan 40 and Jimmy Carter 39: 0.0623734943335
Harry S. Truman 33 and Dwight D. Eisenhower 34: 0.0595218469118
Jimmy Carter 39 and Gerald R. Ford 38: 0.0584774329405
George H.W. Bush 41 and Ronald Reagan 40: 0.0558422689536
Barack Obama 44 and George W. Bush 43: 0.0546894180587
Ronald Reagan 40 and Gerald R. Ford 38: 0.0530576877938
Dwight D. Eisenhower 34 and John F. Kennedy 35: 0.0518104579733
John F. Kennedy 35 and Jimmy Carter 39: 0.0516111916764
Ronald Reagan 40 and William J. Clinton 42: 0.0514051229734
George H.W. Bush 41 and William J. Clinton 42: 0.0508569724243
Lyndon B. Johnson 36 and John F. Kennedy 35: 0.0483871769983
Dwight D. Eisenhower 34 and Jimmy Carter 39: 0.0477242185966
William J. Clinton 42 and George W. Bush 43: 0.0476947761951
George H.W. Bush 41 and George W. Bush 43: 0.047345218519
George H.W. Bush 41 and Jimmy Carter 39: 0.0470845378234
Harry S. Tru