In [185]:
"""Convert Senate speech data from 114th Congress to bag of words format.

The data is provided by [1]. Specifically, we use the `hein-daily` data. To 
run this script, make sure the relevant files are in 
`data/senate-speeches-114/raw/`. The files needed for this script are 
`speeches_114.txt`, `descr_114.txt`, and `114_SpeakerMap.txt`.

#### References
[1]: Gentzkow, Matthew, Jesse M. Shapiro, and Matt Taddy. Congressional Record 
     for the 43rd-114th Congresses: Parsed Speeches and Phrase Counts. Palo 
     Alto, CA: Stanford Libraries [distributor], 2018-01-16. 
     https://data.stanford.edu/congress_text
"""

import os
import setup_utils as utils

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk

In [186]:
nltkstops = nltk.corpus.stopwords.words('english')

In [216]:
## Set params

stops = 'custom'

In [217]:
project_dir = os.path.abspath(
    os.path.join(os.path.dirname('.'), os.pardir)) 
data_dir = os.path.join(project_dir, "data\\paradigms\\raw")
save_dir = os.path.join(project_dir, "data\\paradigms\\clean\\"+stops)



In [189]:
def convert_markdown_to_plain_text(markdown_string):

    markdown_string = str(markdown_string)
    # Remove newlines
    plain_text = markdown_string.replace('\n', ' ')
    
    # Remove bold formatting (e.g., **text**)
    plain_text = re.sub(r'\*\*(.*?)\*\*', r'\1', plain_text)
    
    # Remove backslashes
    plain_text = plain_text.replace('\\', '')
    
    # Remove other markdown formatting if needed
    plain_text = plain_text.replace('\t', '')
    
    return plain_text

def fix_spaces(string):
    new_string = ' '.join(string.strip().split())
    return new_string

df = pd.read_csv(os.path.join(data_dir, 'paradigms.csv'), index_col=0)[['Judge Name', 'Paradigm',"Judge's CEDA rounds"]]
df = df[df["Judge's CEDA rounds"] > 5] #scare away tabroom ghosts
df.drop("Judge's CEDA rounds", axis=1, inplace=True)
df['Paradigm'] = df['Paradigm'].apply(lambda x: x.encode("utf-8").decode("utf-8").lower())
df['Paradigm'] = df['Paradigm'].apply(convert_markdown_to_plain_text)
df['Paradigm'] = df['Paradigm'].apply(fix_spaces)
df.sort_index(inplace=True)
df = df[~df['Judge Name'].isna()]
df = df[df['Paradigm'].str.len() > 5]
df.head()

Unnamed: 0_level_0,Judge Name,Paradigm
Judge ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3,Aaron Hardy,it's been a number of years since i've been an...
1057,Shunta Jordan,just a brief update for the high school commun...
1088,Bill Smelko,please email me your speech documents. i have ...
1265,Maggie Berthiaume,maggie berthiaume woodward academy current coa...
1464,Bill Russell,bill russell judge philosophy overview- i love...


In [190]:
from nltk.tokenize import sent_tokenize, word_tokenize
df['Paradigm'] = df['Paradigm'].apply(lambda x: [word_tokenize(t) for t in sent_tokenize(x)])
df.head()


Unnamed: 0_level_0,Judge Name,Paradigm
Judge ID,Unnamed: 1_level_1,Unnamed: 2_level_1
3,Aaron Hardy,"[[it, 's, been, a, number, of, years, since, i..."
1057,Shunta Jordan,"[[just, a, brief, update, for, the, high, scho..."
1088,Bill Smelko,"[[please, email, me, your, speech, documents, ..."
1265,Maggie Berthiaume,"[[maggie, berthiaume, woodward, academy, curre..."
1464,Bill Russell,"[[bill, russell, judge, philosophy, overview-,..."


In [191]:
clone = df['Paradigm'].apply(lambda x: [word for sentence in x for word in sentence])
length_raw = len(set(item for sublist in clone for item in sublist))
length_raw

26987

In [192]:
# stupid phraseizer
from collections import defaultdict
from tqdm import tqdm

def count_word_occurrences(documents, T):
    delta = 2
    word_counts = defaultdict(int)
    word_following_counts = defaultdict(int)

    print('iterating through documents!')
    for document in tqdm(documents):
        for sentence in document:
            for i in range(len(sentence) - 1):
                word_i = sentence[i]
                word_j = sentence[i + 1]

                word_counts[word_i] += 1
                word_counts[word_j] += 1

                if word_i != word_j:
                    word_following_counts[(word_i, word_j)] += 1
    vocab_size = len(word_counts)
    result_dict = {}
    print('scoring terms')
    for (word_i, word_j), count_ij in tqdm(word_following_counts.items()):
        count_i = word_counts[word_i]
        count_j = word_counts[word_j]
        if count_i not in nltk.corpus.stopwords.words('english'):
            score = (count_ij - delta)*vocab_size / (count_i * count_j)
            if score > T:
                result_dict[(word_i, word_j)] = word_i + '_' + word_j

    return result_dict

In [193]:
def replacer(document, replacement_dict):
    new_doc = []
    for sentence in document:
        new_sentence = []
        i = 0
        while i < len(sentence) - 2:
            if (sentence[i], sentence[i+1]) in replacement_dict:
                new_sentence.append(replacement_dict[(sentence[i], sentence[i+1])])
                i += 2
            else:
                new_sentence.append(sentence[i])
                i += 1
        new_doc.append(new_sentence)
    return new_doc


In [194]:
two_grams = count_word_occurrences(df['Paradigm'], 5)


iterating through documents!


100%|██████████| 1262/1262 [00:00<00:00, 1586.22it/s]


scoring terms


100%|██████████| 241894/241894 [00:30<00:00, 8049.41it/s]


In [195]:
df['Paradigm'] = df['Paradigm'].apply(replacer, replacement_dict=two_grams)

In [196]:
clone = df['Paradigm'].apply(lambda x: [word for sentence in x for word in sentence])
length_2grams = len(set(item for sublist in clone for item in sublist))
length_2grams

26393

In [197]:
three_four_grams = count_word_occurrences(df['Paradigm'], 2.5)

iterating through documents!


100%|██████████| 1262/1262 [00:00<00:00, 1697.84it/s]


scoring terms


100%|██████████| 226788/226788 [00:29<00:00, 7761.65it/s]


In [198]:
df['Paradigm'] = df['Paradigm'].apply(replacer, replacement_dict=three_four_grams)

In [199]:
clone = df['Paradigm'].apply(lambda x: [word for sentence in x for word in sentence])
length_4grams = len(set(item for sublist in clone for item in sublist))
length_4grams

25356

In [200]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
def pos_tag_and_lemmatize(sentences):
    lemmatizer = WordNetLemmatizer()
    tagged_sentences = []
    for sentence in sentences: 
        tagged_words = pos_tag(sentence)    # Perform POS tagging
        lemmatized_words = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(pos)) for word, pos in tagged_words]
        tagged_sentences.append(lemmatized_words)
    return tagged_sentences

df['Paradigm'] = df['Paradigm'].apply(pos_tag_and_lemmatize)

In [201]:
clone = df['Paradigm'].apply(lambda x: [word for sentence in x for word in sentence])
length_lemma = len(set(item for sublist in clone for item in sublist))
length_lemma

22331

In [202]:

speaker = np.array(df.index.values)
clone = df['Paradigm'].apply(lambda x: [word for sentence in x for word in sentence])
speeches = list(clone)

# Create mapping between names and IDs.
speaker_to_speaker_id = dict(
    [(y, x) for x, y in enumerate(speaker)])
author_indices = np.array(
    [speaker_to_speaker_id[s] for s in speaker])
author_map = np.array(list(speaker_to_speaker_id.keys()))

print(author_map.shape)
print(author_indices.shape)
print(speaker.shape)
len(set(speaker))

speeches = [' '.join(doc) for doc in speeches]


(1262,)
(1262,)
(1262,)


In [203]:
find_stops = True

In [204]:
len(nltkstops)

179

In [205]:
if find_stops:
    count_vectorizer = CountVectorizer(min_df=1,
                                   max_df=1., 
                                   ngram_range=(1, 1),
                                   stop_words=nltkstops)


    # Learn initial document term matrix. This is only initial because we use it to
    # identify words to exclude based on author counts.
    counts = count_vectorizer.fit_transform(speeches)

    vocabulary = np.array(
        [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), 
                                key=lambda kv: kv[1])])

In [206]:
counts_per_author = utils.bincount_2d(author_indices, counts.toarray())
min_authors_per_word = 8
author_counts_per_word = np.sum(counts_per_author > 0, axis=0)
ranking = np.flip(np.argsort(author_counts_per_word))
TF = counts.toarray().sum(axis=0)/counts.toarray().sum(axis=0).sum()

IDF = np.log((author_counts_per_word * (1/counts.shape[0]))**-1)


terms_per_author = counts.toarray().sum(axis=1)
terms_per_author = np.where(terms_per_author==0, 1, terms_per_author)
tfidfpart1 = counts.toarray()/terms_per_author[:, np.newaxis]


TFIDF = (tfidfpart1 * np.exp(IDF)).sum(axis=0) / np.where(author_counts_per_word==0, 1, author_counts_per_word).astype(float)

ppt = counts.toarray() / counts.toarray().sum(axis=0)
entropy = (np.multiply(ppt, -np.log(np.where(ppt==0, 1, ppt)))).sum(axis=0)


print(len(vocabulary))
print(counts.shape)
print(counts_per_author.shape)
print(author_counts_per_word.shape)
print(TF.shape)
print(IDF.shape)
#vocabulary[ranking][:20]
print(TFIDF.shape)
print(entropy.T.shape)


15777
(1262, 15777)
(1262, 15777)
(15777,)
(15777,)
(15777,)
(15777,)
(15777,)


In [207]:
vocab_df = pd.DataFrame({'words':vocabulary, 'TF':TF, 'IDF':IDF, 'TFIDF':TFIDF, 'entropy':entropy})
vocab_df.sort_values('TF', ascending=False)

Unnamed: 0,words,TF,IDF,TFIDF,entropy
3732,debate,0.027423,0.055389,0.034059,6.757688
1162,argument,0.019571,0.113139,0.026055,6.669567
14082,think,0.011204,0.324813,0.019664,6.401307
8602,make,0.010075,0.229702,0.016818,6.592842
694,aff,0.009867,0.472225,0.022133,6.283151
...,...,...,...,...,...
5369,familiarty,0.000003,7.140453,2.253571,0.000000
5372,famously,0.000003,7.140453,1.628387,0.000000
5374,fanaticism,0.000003,7.140453,0.632899,0.000000
5377,fantasize,0.000003,7.140453,2.640167,0.000000


In [208]:
top_TF = vocab_df.sort_values('TF', ascending=False)[['words','TF']].iloc[:1500].set_index('words')
bot_IDF = vocab_df.sort_values('IDF', ascending=True)[['words','IDF']].iloc[:1500].set_index('words')
bot_TFIDF = vocab_df.sort_values('TFIDF', ascending=True)[['words','TFIDF']].iloc[:1500].set_index('words')
top_entropy = vocab_df.sort_values('entropy', ascending=False)[['words','entropy']].iloc[:1500].set_index('words')

full = top_TF.join([bot_IDF,bot_TFIDF,top_entropy], how='outer')


In [209]:
full.shape

(1650, 4)

In [210]:
# full.to_excel('check_for_stops.xlsx')

PermissionError: [Errno 13] Permission denied: 'check_for_stops.xlsx'

In [211]:
new_stops = pd.read_excel('check_for_stops.xlsx')
new_stops = list(new_stops[new_stops['include']=='bad']['words'])

In [212]:
new_stops = list(set(nltkstops) | set(new_stops))

In [213]:

count_vectorizer = CountVectorizer(min_df=1,
                                max_df=1., 
                                ngram_range=(1, 1),
                                stop_words=new_stops)

counts = count_vectorizer.fit_transform(speeches)

vocabulary = np.array(
    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), 
                            key=lambda kv: kv[1])])

In [214]:

# Remove speeches with not enough words.
existing_speeches = np.where(np.sum(counts, axis=1) > 1)[0]
counts_dense = counts[existing_speeches]
author_indices = author_indices[existing_speeches]

len(vocabulary)

15748

In [218]:
save_dir

'c:\\Users\\austin\\Documents\\python-projs\\uchicago\\research\\Thesis\\TBIP_testing\\data\\paradigms\\clean\\custom'

In [225]:
# Save data.
if not os.path.exists(save_dir):
  os.makedirs(save_dir)

# `counts.npz` is a [num_documents, num_words] sparse matrix containing the
# word counts for each document.
sparse.save_npz(os.path.join(save_dir, "counts.npz"),
                sparse.csr_matrix(counts_dense).astype(np.float32))
# `author_indices.npy` is a [num_documents] vector where each entry is an
# integer indicating the author of the corresponding document.
np.save(os.path.join(save_dir, "author_indices.npy"), author_indices)
# `vocabulary.txt` is a [num_words] vector where each entry is a string
# denoting the corresponding word in the vocabulary.
np.savetxt(os.path.join(save_dir, "vocabulary.txt"), vocabulary, fmt="%s")
# `author_map.txt` is a [num_authors] vector of strings providing the name of
# each author in the corpus.
np.savetxt(os.path.join(save_dir, "author_map.txt"), author_map, fmt="%s")
# `raw_documents.txt` contains all the documents we ended up using.
raw_documents = [document.replace("\n", ' ').replace("\r", ' ') 
                 for document in np.array(speeches)[existing_speeches]]
np.savetxt(os.path.join(save_dir, "raw_documents.txt"), 
           raw_documents, 
           fmt="%s")


In [224]:
len(speeches)

1262

In [223]:
len(existing_speeches)


1255

In [220]:

#og dataframe
df.to_csv(os.path.join(save_dir, 'id_name_para.csv'))
