In [17]:
import re

path_to_speeches = '../hein-daily/speeches_114.txt'

def parse_speech_file(lowercase=True):
    speech_ids = []
    speeches = []
    with open(path_to_speeches, 'rb') as f:
        lines = f.readlines()[1:]  # skip header
        for line in lines:
            speech_id, speech = str(line).split('|', 1)
            speech_id = re.findall(r'\d+', speech_id)[0]
            speech_ids.append(speech_id)
            
            if lowercase:
                speech = speech.lower()
            speech = speech[:-2]  # remove trailing \n
            toks = re.findall(r'[a-zA-Z]+', speech)  # only keep alpha tokens
            speeches.append(toks)
    return speech_ids, speeches

speech_ids, speeches = parse_speech_file()
print(speeches[0])

['the', 'representativeselect', 'and', 'their', 'guests', 'will', 'please', 'remain', 'standing', 'and', 'join', 'in', 'the', 'pledge', 'of', 'allegiance']


In [18]:
from collections import Counter

unique_words_per_speech = []
for toks in speeches:
    unique_words_per_speech.extend(list(set(toks)))
doc_counts = Counter(unique_words_per_speech)  # each word mapped to number of docs that it's in
print(doc_counts.most_common(10))

[('the', 100537), ('to', 76190), ('i', 74979), ('of', 67849), ('mr', 63393), ('and', 61276), ('is', 60953), ('a', 54561), ('from', 53888), ('for', 53785)]


In [19]:
import numpy as np 

idf = {}  # inverse document frequency
N = len(speeches)
for tok, doc_count in doc_counts.most_common():
    idf[tok] = np.log((N - doc_count + 0.5) / (doc_count + 0.5))

print(idf['the'])
print(idf['immigrant'])
print(idf['america'])

-2.1463797236547517
5.637586369694843
2.5311101387817185


In [20]:
from doc_retrieval import matching_score

query = ['immigrant', 'immigrants', 'immigration']
avg_len = np.mean([len(s) for s in speeches])
print('Average speech len: %.3f' % (avg_len))

scored_speeches = []
for i, (sid, speech) in enumerate(zip(speech_ids, speeches)):
    score = matching_score(query, speech, idf, avg_len)
    scored_speeches.append((sid, speech, score))

Average speech len: 193.444


In [21]:
scored_speeches = sorted(scored_speeches, key=lambda x:x[2], reverse=True)

for i in np.arange(0, 2000, 500):
    s_id, s, score = scored_speeches[i]
    print('Rank %d: id = %s, num_words = %d, matching score = %.3f' % (i, s_id, len(s), score))
    excerpt = s[:500]
    if len(s) > 500:
        excerpt.append('...')
    print(' '.join(excerpt))
    print()

Rank 0: id = 1140034933, num_words = 606, matching score = 22.959
mr speaker i rise to commemorate immigrant heritage month and salute the contributions of immigrants to the never ending glory and story of america i stand in solidarity with our immigrant soldiers who have fought to defend and extend our freedoms from the shores of the atlantic to the pacific in the deserts of africa and the jungles of asia to the seas of the persian gulf i stand in solidarity with our immigrant innovators and entrepreneurs who make up over percent of the ceo of the fortune companies which employ over million of our fellow citizens and i stand in solidarity with the children of our immigrants who will be an integral part of our nations future achievements this is why i have consistently introduced legislation such as h r save america comprehensive immigration act of which sets forth a comprehensive and humane solution to immigrant reform this legislation works to secure our borders and brings close to m

In [22]:
# taking the top 1000
immigrant_speech_ids = [t[0] for t in scored_speeches[:1000]]
immigrant_speeches = [' '.join(t[1]) for t in scored_speeches[:1000]]

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(immigrant_speeches)
print('Vocab size:', len(cv.get_feature_names()))

Vocab size: 18084


In [59]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

from sklearn.decomposition import LatentDirichletAllocation as LDA
        
lda = LDA(n_components=10, n_jobs=-1)
topics = lda.fit_transform(X)

In [71]:
def print_topics(model, count_vectorizer, top_n):
    words = count_vectorizer.get_feature_names()
    topic_probs = np.sum(topics, axis=0)
    topic_probs = topic_probs / np.sum(topic_probs)
    sorted_topic_idx = np.argsort(-1 * topic_probs)  # sort topics by highest to lowest prob
    for idx in sorted_topic_idx:
        print('Topic %d: p=%.3f' % (idx, topic_probs[idx]))
        word_probs = model.components_[idx]
        sorted_word_idx = np.argsort(-1 * word_probs)  # sort words by highest to lowest prob
        print(' '.join([words[i] for i in sorted_word_idx[:top_n]]))
        print()

print_topics(lda, cv, top_n=50)

Topic 8: p=0.617
immigration security people homeland president law department country house mr time funding senate federal just going enforcement republicans want said congress states american border speaker dont republican work united make colleagues america vote think need way come say like know debate amendment immigrants year floor senator today government executive illegal

Topic 0: p=0.208
mr community years immigrants american school states united today speaker young people family daca new america country work service children immigrant nation life program rise dream honor world students state year education americans president th families university high like great women history immigration medical communities better dreamers district college asian

Topic 6: p=0.155
president congress immigration executive states united action program american presidents workers laws mr law constitution speaker people authority jobs amnesty immigrants actions court power house support american

In [62]:
path_to_metadata = '../hein-daily/114_SpeakerMap.txt'

def parse_metadata_file():
    speech_id_to_record = {}
    with open(path_to_metadata, 'rb') as f:
        lines = f.readlines()
        header = str(lines[0])[2:-3]
        fields = header.split('|')
        print('Fields:', fields)
        for line in lines[1:]:
            line = str(line)[2:-3]
            values = line.split('|')
            if len(values) == len(fields):
                record = {field:val for field, val in zip(fields, values)}
                speech_id_to_record[record['speech_id']] = record
    return speech_id_to_record

records = parse_metadata_file()
print(len(records))

Fields: ['speakerid', 'speech_id', 'lastname', 'firstname', 'chamber', 'state', 'gender', 'party', 'district', 'nonvoting']
67971


In [63]:
parties = [records[s_id]['party'] if s_id in records else 'UNK' for s_id in immigrant_speech_ids]
print(Counter(parties))

Counter({'D': 509, 'R': 336, 'UNK': 151, 'I': 4})


In [70]:
party2probs = {}
for party in ['D', 'R']:
    indices = [i for i, p in enumerate(parties) if p == party]
    topic_subset = topics[indices]
    probs = np.sum(topic_subset, axis=0)
    probs = probs / np.sum(probs)
    print('%s topic probs:' % party, probs)
    party2probs[party] = probs

print('Dem - Rep topic probs:', [round(v, 4) for v in party2probs['D'] - party2probs['R']])

D topic probs: [0.18170887 0.00253023 0.00126387 0.00190935 0.00184224 0.0010977
 0.10132316 0.00202821 0.70500941 0.00128697]
R topic probs: [0.06141181 0.00172214 0.00278657 0.00245914 0.00656747 0.00142288
 0.2606237  0.00260425 0.65921058 0.00119145]
Dem - Rep topic probs: [0.1203, 0.0008, -0.0015, -0.0005, -0.0047, -0.0003, -0.1593, -0.0006, 0.0458, 0.0001]
