In [30]:
"""Convert Senate speech data from 114th Congress to bag of words format.

The data is provided by [1]. Specifically, we use the `hein-daily` data. To 
run this script, make sure the relevant files are in 
`data/senate-speeches-114/raw/`. The files needed for this script are 
`speeches_114.txt`, `descr_114.txt`, and `114_SpeakerMap.txt`.

#### References
[1]: Gentzkow, Matthew, Jesse M. Shapiro, and Matt Taddy. Congressional Record 
     for the 43rd-114th Congresses: Parsed Speeches and Phrase Counts. Palo 
     Alto, CA: Stanford Libraries [distributor], 2018-01-16. 
     https://data.stanford.edu/congress_text
"""

import os
import setup_utils as utils

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
import re

In [31]:
# The project root is the parent of the current working directory
# (`os.path.dirname('.')` is the empty string, so the original expression
# reduced to `abspath(os.pardir)`).
project_dir = os.path.abspath(os.pardir)

# Build the paths component by component so the platform's separator is used;
# a literal "data\\paradigms\\raw" is a single (wrong) path component on
# non-Windows systems.
data_dir = os.path.join(project_dir, "data", "paradigms", "raw")
save_dir = os.path.join(project_dir, "data", "paradigms", "clean")



In [32]:
def convert_markdown_to_plain_text(markdown_string):
    """Flatten a markdown snippet into a single line of plain text.

    Args:
        markdown_string: The text to clean. Non-string values are converted
            with `str()`; `None` and float NaN are treated as missing.

    Returns:
        The text with newlines and tabs replaced by spaces, `**bold**`
        markers stripped (their content is kept), and backslashes removed.
        Missing input yields an empty string.
    """
    # Missing values would otherwise become the literal text "None" / "nan"
    # and leak into the corpus as a token.
    if markdown_string is None or (
            isinstance(markdown_string, float)
            and markdown_string != markdown_string):
        return ''

    plain_text = str(markdown_string)

    # Newlines become spaces so words on adjacent lines stay separate.
    plain_text = plain_text.replace('\n', ' ')

    # Remove bold formatting (e.g., **text**), keeping the inner text.
    plain_text = re.sub(r'\*\*(.*?)\*\*', r'\1', plain_text)

    # Remove backslashes (markdown escapes such as "\_").
    plain_text = plain_text.replace('\\', '')

    # Tabs are whitespace: replace them with a space rather than deleting
    # them, otherwise "word1\tword2" is fused into "word1word2".
    plain_text = plain_text.replace('\t', ' ')

    return plain_text

def fix_spaces(string):
    """Collapse every run of whitespace in `string` into a single space.

    Leading and trailing whitespace is dropped, because `str.split()` with no
    arguments discards empty fields at both ends.
    """
    words = string.split()
    return ' '.join(words)

# Load the raw paradigms; the first CSV column becomes the index and only the
# two columns used below are kept.
df = pd.read_csv(os.path.join(data_dir, 'paradigms.csv'), index_col=0)
df = df[['Judge Name', 'Paradigm']]

# Flatten the markdown, normalise whitespace, and order rows by index.
df = df.assign(
    Paradigm=df['Paradigm']
    .apply(convert_markdown_to_plain_text)
    .apply(fix_spaces)
).sort_index()

# Keep rows that have a judge name and more than 5 characters of text.
has_name = df['Judge Name'].notna()
has_text = df['Paradigm'].str.len() > 5
df = df[has_name & has_text]
df.head()

Unnamed: 0_level_0,Judge Name,Paradigm
Judge ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Chris Palmer,Tabroom.com is mostly my fault. Therefore I'm ...
3,Aaron Hardy,It's been a number of years since I've been an...
319,Paul Wexler,"Debate Paradigm Paul Wexler Coach since 1993, ..."
1057,Shunta Jordan,Just a brief update for the high school commun...
1088,Bill Smelko,Please email me your speech documents. I have ...


In [33]:
# Display the (rows, columns) shape of `df` after the filtering above.
df.shape

(1817, 2)

In [34]:

# One entry per row of `df`: the index value identifies the author and the
# 'Paradigm' column holds the document text.
speaker = np.array(df.index.values)
speeches = np.array(df['Paradigm'])

# Map each distinct index value to an integer position (a repeated value would
# keep the position of its last occurrence).
speaker_to_speaker_id = {
    judge_id: position for position, judge_id in enumerate(speaker)
}
# Author position for every document, in document order.
author_indices = np.array(
    [speaker_to_speaker_id[judge_id] for judge_id in speaker])
# Author identifiers in the order their positions were assigned.
author_map = np.array(list(speaker_to_speaker_id))

# Sanity check: the three arrays should have the same length, and the number
# of distinct speakers is displayed as the cell's result.
print(author_map.shape)
print(author_indices.shape)
print(speaker.shape)
len(set(speaker))


(1817,)
(1817,)
(1817,)


1817

In [35]:


# Read the comma-delimited `stops.txt` from the working directory and keep the
# first column of its first 100 rows.
# NOTE(review): `stopwords` is not passed to any CountVectorizer visible in
# this notebook (they use stop_words='english' or none) — confirm whether it
# is still needed.
stop_table = np.loadtxt('stops.txt', dtype=str, delimiter=",")
stopwords = list(stop_table[0:100, 0])

stopwords

['the',
 'i',
 'to',
 'a',
 'and',
 'of',
 'you',
 'is',
 'that',
 'in',
 'for',
 'it',
 'debate',
 'if',
 'not',
 'be',
 'are',
 'on',
 'your',
 'have',
 'me',
 'or',
 'as',
 'arguments',
 'will',
 'but',
 'my',
 's',
 'with',
 'this',
 'do',
 'of the',
 'should',
 'if you',
 'don',
 'don t',
 'what',
 'an',
 'can',
 'think',
 'in the',
 'more',
 'am',
 'argument',
 'at',
 'about',
 'i am',
 'i will',
 'debates',
 'so',
 'round',
 'make',
 'm',
 'i m',
 'they',
 'just',
 'vote',
 'how',
 'than',
 'other',
 'to the',
 'the debate',
 'why',
 'by',
 'when',
 'to be',
 'read',
 'i think',
 'all',
 'team',
 'judge',
 'most',
 'some',
 'i have',
 'on the',
 'there',
 'their',
 'them',
 'also',
 'is a',
 'from',
 'out',
 'these',
 'you are',
 'any',
 'please',
 'well',
 'because',
 'that i',
 're',
 'it is',
 'need',
 'i don',
 'i don t',
 'one',
 'it s',
 'no',
 'very',
 'important',
 'for the']

In [36]:


# First pass: learn a candidate vocabulary of 1- to 3-grams over purely
# alphabetic tokens, with English stop words removed and document-frequency
# bounds applied.
count_vectorizer = CountVectorizer(
    token_pattern="[a-zA-Z]+",
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.3,
    min_df=0.001,
)

# This matrix is only provisional: it is used to decide which terms to drop
# based on how many authors use them.
counts = count_vectorizer.fit_transform(speeches)

# Terms ordered by their column index in `counts`.
term_to_column = count_vectorizer.vocabulary_
vocabulary = np.array(sorted(term_to_column, key=term_to_column.get))


In [37]:

# Keep only the terms used by at least `min_authors_per_word` distinct authors.
min_authors_per_word = 8

# presumably aggregates the rows of `counts` per author index — confirm in
# setup_utils.bincount_2d
counts_per_author = utils.bincount_2d(author_indices, counts.toarray())

# For each term, the number of authors with a non-zero count.
author_uses_word = counts_per_author > 0
author_counts_per_word = np.sum(author_uses_word, axis=0)

# Column indices of the terms that meet the threshold.
acceptable_words = np.nonzero(
    author_counts_per_word >= min_authors_per_word)[0]

# Column indices ordered from the least to the most widely used term.
ranking = np.argsort(author_counts_per_word)
len(vocabulary)
# vocabulary[ranking][:1000]


array(['aa', 'jargon topic', 'jargon understand', 't begin',
       'jargon used', 't based', 'jargon want', 't ballot unless',
       't ballot clear', 't bad thing', 'jason regnier', 'jazmine',
       't awesome', 'jccc climate', 'jeanette', 'jeanette rodriguez',
       'jeanette rodriguez toni', 't automatically presume',
       'jerk format evidence', 'jerk format', 't authors', 'jerk debate',
       't automatically assume', 'jeremy hammond', 'jargon think',
       'jeopardize', 'jefferson high school', 'jefferson high',
       'jefferey yan', 'jefferey', 'jeff buntin northwestern',
       't automatically mean', 'jenks', 'jargon theory',
       't believe judge', 'jargon team', 'jargon concepts',
       'jargon clearly impacted', 'jargon believe', 'jargon b',
       't believe m', 'jargon argument', 'jargon don', 't believe need',
       'jared anderson', 't believe teams', 't belittle opponents',
       'jan feb topic', 'jan feb', 'james madison years', 't believe s',
       'je

In [38]:

# Fit final document-term matrix with modified vocabulary.
# The vocabulary was learned with token_pattern="[a-zA-Z]+" and English
# stop-word removal, so the same analyzer settings must be repeated here.
# With CountVectorizer's default token pattern (tokens of 2+ word characters)
# and no stop-word removal, terms such as "t begin" or n-grams that span a
# removed stop word could never be matched and would get all-zero columns.
count_vectorizer = CountVectorizer(ngram_range=(1, 3),
                                   stop_words='english',
                                   token_pattern="[a-zA-Z]+",
                                   vocabulary=vocabulary[acceptable_words])
counts = count_vectorizer.fit_transform(speeches)

# Re-derive the vocabulary in the column order of the new `counts`.
vocabulary = np.array(
    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), 
                            key=lambda kv: kv[1])])

# Adjust counts by removing unigram/n-gram pairs which co-occur.
counts_dense = utils.remove_cooccurring_ngrams(counts, vocabulary)

# Remove documents with not enough words (total count must exceed 1).
existing_speeches = np.where(np.sum(counts_dense, axis=1) > 1)[0]
counts_dense = counts_dense[existing_speeches]
author_indices = author_indices[existing_speeches]


In [43]:
# "Full" variant: no stop-word removal and no max_df cap; only rare terms are
# dropped. All names in this variant carry an `_f` suffix.

# Map each speaker (the `df` index value) to an integer position.
speaker_to_speaker_id_f = dict(
    [(y, x) for x, y in enumerate(speaker)])
author_indices_f = np.array(
    [speaker_to_speaker_id_f[s] for s in speaker])
author_map_f = np.array(list(speaker_to_speaker_id_f.keys()))

count_vectorizer_f = CountVectorizer(min_df=0.001,
                                     ngram_range=(1, 3),
                                     token_pattern="[a-zA-Z]+")


# Learn initial document term matrix. This is only initial because we use it to
# identify words to exclude based on author counts.
counts_f = count_vectorizer_f.fit_transform(speeches)

# Terms ordered by their column index in `counts_f`.
vocabulary_f = np.array(
    [k for (k, v) in sorted(count_vectorizer_f.vocabulary_.items(), 
                            key=lambda kv: kv[1])])

# Remove phrases used by fewer than `min_authors_per_word_f` authors.
# NOTE(review): `counts_f.toarray()` densifies the whole matrix, which is
# large for this vocabulary (see the shape printed below).
counts_per_author_f = utils.bincount_2d(author_indices_f, counts_f.toarray())
min_authors_per_word_f = 8
author_counts_per_word_f = np.sum(counts_per_author_f > 0, axis=0)
acceptable_words_f = np.where(
    author_counts_per_word_f >= min_authors_per_word_f)[0]

# NOTE(review): this rebinds `ranking` from the stop-word-filtered variant.
ranking = author_counts_per_word_f.argsort()
print(len(vocabulary_f))
print(len(acceptable_words_f))
print(counts_f.shape)

# Re-count with the reduced vocabulary. The token pattern must match the one
# used to learn the vocabulary: with CountVectorizer's default pattern (tokens
# of 2+ word characters), n-grams containing one-letter tokens such as
# "don t" could never be matched.
count_vectorizer_f = CountVectorizer(ngram_range=(1, 3),
                                     token_pattern="[a-zA-Z]+",
                                     vocabulary=vocabulary_f[acceptable_words_f]
                                   )
counts_f = count_vectorizer_f.fit_transform(speeches)


239266
40653
(1817, 239266)


In [44]:

# Re-derive the "full" vocabulary in the column order of the refitted
# `counts_f`.
vocabulary_f = np.array(
    [k for (k, v) in sorted(count_vectorizer_f.vocabulary_.items(), 
                            key=lambda kv: kv[1])])

print(len(vocabulary_f))

# Adjust counts by removing unigram/n-gram pairs which co-occur.
counts_dense_f = utils.remove_cooccurring_ngrams(counts_f, vocabulary_f)

# Remove documents with not enough words (total count must exceed 1).
# Both arrays are filtered with the `_f` row selection and written back to the
# `_f` names. Previously the filtered matrix was assigned to `counts_dense`
# (clobbering the stop-word-filtered variant and leaving `counts_dense_f`
# unfiltered), and `author_indices_f` was indexed with the other variant's
# `existing_speeches`.
existing_speeches_f = np.where(np.sum(counts_dense_f, axis=1) > 1)[0]
counts_dense_f = counts_dense_f[existing_speeches_f]
author_indices_f = author_indices_f[existing_speeches_f]

40653


9752

In [None]:
# Write the stop-word-filtered corpus to `save_dir`, creating it if needed.
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# counts.npz: sparse [num_documents, num_words] matrix of per-document term
# counts, stored as float32.
counts_sparse = sparse.csr_matrix(counts_dense).astype(np.float32)
sparse.save_npz(os.path.join(save_dir, "counts.npz"), counts_sparse)

# author_indices.npy: [num_documents] integer vector giving the author
# position of each kept document.
np.save(os.path.join(save_dir, "author_indices.npy"), author_indices)

# vocabulary.txt: one vocabulary term per line, in column order.
np.savetxt(os.path.join(save_dir, "vocabulary.txt"), vocabulary, fmt="%s")

# author_map.txt: one author identifier per line (the values taken from the
# `df` index), in author-position order.
np.savetxt(os.path.join(save_dir, "author_map.txt"), author_map, fmt="%s")

# raw_documents.txt: the text of every kept document, one per line, with any
# line breaks replaced by spaces.
raw_documents = [
    text.replace("\n", ' ').replace("\r", ' ')
    for text in speeches[existing_speeches]
]
np.savetxt(os.path.join(save_dir, "raw_documents.txt"),
           raw_documents,
           fmt="%s")


In [None]:
# Write the "full" (`_f`) variant of the corpus to `save_dir`, creating it if
# needed.
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# counts_f.npz: sparse matrix built from `counts_dense_f`, stored as float32.
counts_sparse_f = sparse.csr_matrix(counts_dense_f).astype(np.float32)
sparse.save_npz(os.path.join(save_dir, "counts_f.npz"), counts_sparse_f)

# author_indices_f.npy: integer vector of author positions for this variant.
np.save(os.path.join(save_dir, "author_indices_f.npy"), author_indices_f)

# vocabulary_f.txt: one vocabulary term per line, in column order.
np.savetxt(os.path.join(save_dir, "vocabulary_f.txt"), vocabulary_f, fmt="%s")

# author_map_f.txt: one author identifier per line (the values taken from the
# `df` index), in author-position order.
np.savetxt(os.path.join(save_dir, "author_map_f.txt"), author_map_f, fmt="%s")

# raw_documents_f.txt: the text of the documents selected by
# `existing_speeches_f`, one per line, with line breaks replaced by spaces.
raw_documents_f = [
    text.replace("\n", ' ').replace("\r", ' ')
    for text in speeches[existing_speeches_f]
]
np.savetxt(os.path.join(save_dir, "raw_documents_f.txt"),
           raw_documents_f,
           fmt="%s")


In [None]:

# Also save the DataFrame `df` itself (its index plus the 'Judge Name' and
# 'Paradigm' columns) as CSV next to the other outputs.
df.to_csv(os.path.join(save_dir, 'id_name_para.csv'))
