In [None]:
import xmltodict
from urllib.request import urlopen
import xml.etree.ElementTree as ET
import pandas as pd

from dateutil.parser import parse
from tqdm import tqdm

import matplotlib.pyplot as plt
from IPython.display import display 
from joblib import Parallel, delayed
import multiprocessing

import urllib
from bs4 import BeautifulSoup

import spacy

# Load spaCy's 'en_core_web_sm' pipeline once; `nlp` is the shared object that
# tokenizeText calls later in the notebook.
nlp = spacy.load('en_core_web_sm')

    
# Apply the 'seaborn-paper' matplotlib style sheet to the figures drawn below.
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-paper') — confirm against the installed version.
plt.style.use('seaborn-paper')
%matplotlib inline

In [None]:
# URL template for a single House roll-call vote document.
link = 'http://clerk.house.gov/evs/{session}/roll{roll}.xml'

# Key: the value substituted for {session} (year strings here).
# Value: how many roll calls to enumerate for that key, numbered from 1.
congress_session = {
    '2018': 129,
    '2017': 710,
    '2016': 622,
    '2015': 705,
    '2014': 564,
    '2013': 641,
    '2012': 659,
    '2011': 949,
    '2010': 664,
    '2009': 991,
    '2008': 690,
    '2007': 1186,
    '2006': 543,
    '2005': 671,
    '2004': 544,
    '2003': 677,
    '2002': 484,
    '2001': 512,
    '2000': 603,
}

# One URL per (session, roll number); roll numbers are zero-padded to at least
# three digits, and the dict's insertion order decides the list order.
link_list = [
    link.replace('{session}', year).replace('{roll}', '{:03}'.format(number))
    for year, last_roll in congress_session.items()
    for number in range(1, last_roll + 1)
]

print('link_list', len(link_list))
print(link_list[1])

In [None]:
#         <majority>D</majority>
#         <congress>103</congress>
#         <session>1st</session>
#         <chamber>U.S. House of Representatives</chamber>
#         <rollcall-num>615</rollcall-num>
#         <legis-num>H R 3167</legis-num>
#         <vote-question>On Agreeing to the Conference Report</vote-question>
#         <vote-type>RECORDED VOTE</vote-type>
#         <vote-result>Passed</vote-result>
#         <action-date>22-Nov-1993</action-date>
#         <action-time time-etz="23:59">11:59 PM</action-time>
#         <vote-desc>UNEMPLOYMENT COMPENSATION...</vote-desc>

#         <recorded-vote>
#             <legislator party="D" state="HI" role="legislator">Abercrombie</legislator>
#             <vote>Aye</vote>
#         </recorded-vote>

In [None]:
def _child_text(parent, tag):
    """Return the text of the first ``tag`` child of ``parent``.

    Returns None when ``parent`` is None, when no such child exists, or when
    the child exists but carries no text.
    """
    if parent is None:
        return None
    node = parent.find(tag)
    return None if node is None else node.text


def xml2df(xml_data):
    """Flatten one roll-call XML document into one DataFrame row per Yea/Nay vote.

    Parameters
    ----------
    xml_data : str or bytes
        XML text with a ``vote-metadata`` element and a ``vote-data`` element
        holding ``recorded-vote`` children.

    Returns
    -------
    pandas.DataFrame
        One row per recorded vote whose text is exactly 'Yea' or 'Aye' (stored
        as 'Yea') or exactly 'No' or 'Nay' (stored as 'Nay'), ignoring
        surrounding whitespace. Any other vote value is skipped. Each row
        repeats the roll-call metadata and adds the legislator's ``party``,
        ``state``, ``role`` and ``name``. Metadata elements missing from the
        document are stored as None. The frame is empty when no vote
        qualifies or when the ``vote-data`` section is absent.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``xml_data`` is not well-formed XML.
    KeyError
        If a kept vote's legislator lacks a ``party``, ``state`` or ``role``
        attribute.
    """
    root = ET.XML(xml_data)
    vote_metadata = root.find('vote-metadata')
    vote_data = root.find('vote-data')

    # (output column, XML tag) pairs in output column order. The metadata is
    # identical for every vote in the document, so it is read once up front.
    metadata_fields = (
        ('majority', 'majority'),
        ('congress', 'congress'),
        ('session', 'session'),
        ('chamber', 'chamber'),
        ('rollcall_num', 'rollcall-num'),
        ('legis_num', 'legis-num'),
        ('vote_question', 'vote-question'),
        ('vote_type', 'vote-type'),
        ('vote_result', 'vote-result'),
        ('action_date', 'action-date'),
        ('action_time', 'action-time'),
        ('vote_desc', 'vote-desc'),
    )
    metadata = {column: _child_text(vote_metadata, tag)
                for column, tag in metadata_fields}

    # Exact-match lookup. The earlier substring test (`text in 'Yea'`) also
    # accepted fragments such as '' or 'e' and mislabelled them.
    vote_labels = {'Yea': 'Yea', 'Aye': 'Yea', 'No': 'Nay', 'Nay': 'Nay'}

    records = []
    recorded_votes = [] if vote_data is None else vote_data.iter('recorded-vote')
    for vote in recorded_votes:
        cast = (_child_text(vote, 'vote') or '').strip()
        label = vote_labels.get(cast)
        if label is None:
            # 'Present', 'Not Voting', empty or unknown values are not kept.
            continue

        legislator = vote.find('legislator')
        record = dict(metadata)
        record['party'] = legislator.attrib['party']
        record['state'] = legislator.attrib['state']
        record['role'] = legislator.attrib['role']
        record['name'] = legislator.text
        record['vote'] = label
        records.append(record)

    return pd.DataFrame(records)
    
    
# xml = urlopen(link_list[2]).read()
# df = xml2df(xml)
# df.tail()

In [None]:
%%time


def processLink(link):
    """Download one roll-call XML document and convert it with xml2df.

    Parameters
    ----------
    link : str
        URL of the roll-call XML document.

    Returns
    -------
    pandas.DataFrame
        Whatever xml2df returns for the downloaded document.

    Raises
    ------
    urllib.error.URLError
        If the request fails. Errors raised by xml2df propagate unchanged.
    """
    # The context manager closes the HTTP response even if read() raises,
    # instead of leaving the connection open until garbage collection.
    with urlopen(link) as response:
        xml = response.read()
    return xml2df(xml)

# One worker per CPU reported by the OS.
num_cores = multiprocessing.cpu_count()
print('num_cores', num_cores)

# Fetch and parse every roll-call URL in parallel; tqdm wraps the URL list to
# show progress as the jobs are handed out.
roll_call_jobs = (delayed(processLink)(url) for url in tqdm(link_list))
results = Parallel(n_jobs=num_cores)(roll_call_jobs)

# Stack the per-roll-call frames and renumber the rows 0..n-1.
df_votes = pd.concat(results).reset_index(drop=True)

df_votes.to_csv('../data/votes_all.csv')
print(len(df_votes))

df_votes.tail()

In [None]:
# Bar chart of how many rows carry each vote label, followed by summary statistics.
vote_counts = df_votes['vote'].value_counts()
vote_counts.plot(kind='bar', alpha=.5, figsize=(12, 6), fontsize=14)
df_votes.describe()

# House bills

In [None]:
# Reload the saved votes and keep only the first row seen for each legis_num.
df_votes = pd.read_csv('../data/votes_all.csv')
df_filtered = (
    df_votes
    .drop_duplicates('legis_num', keep='first')
    .reset_index()  # the previous row labels are kept in a new 'index' column
)
df_filtered.tail()

In [None]:

import re
# URL template for the plain-text view of a bill on congress.gov.
# NOTE(review): the path is always "house-bill" and the ordinal suffix is always
# "th"; legis_num values that are resolutions or Senate measures, or congress
# numbers such as 101-103, would presumably need different paths — confirm
# against the data in use.
link = 'https://www.congress.gov/bill/{congress}th-congress/house-bill/{billNum}/text?format=txt&r=1'


def creat_list(row):
    """Build the congress.gov text URL for the bill referenced by one row.

    Parameters
    ----------
    row : mapping
        Must provide ``'congress'`` and ``'legis_num'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``link`` with ``{congress}`` replaced by ``str(row['congress'])`` and
        ``{billNum}`` replaced by every run of digits in ``legis_num``
        concatenated. When ``legis_num`` has no digits, or is not a string
        at all, the bill number is empty.
    """
    legis_num = row['legis_num']
    # A missing legis_num comes back from the CSV as NaN (a float), which made
    # re.findall raise TypeError; treat any non-string as "no bill number".
    if isinstance(legis_num, str):
        bill_num = ''.join(re.findall(r'\d+', legis_num))
    else:
        bill_num = ''
    link_temp = link.replace('{congress}', str(row['congress']))
    return link_temp.replace('{billNum}', bill_num)


# axis=1 hands each row to creat_list; the resulting URLs are stored in a new
# 'link' column on df_filtered.
df_filtered['link'] = df_filtered.apply(creat_list, axis=1)
df_filtered.tail()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
import string
import re
import spacy
# NOTE(review): the object returned by this load is discarded, and `nlp` is
# already built from 'en_core_web_sm' in the first cell. The 'en' shortcut is
# reportedly not accepted by newer spaCy releases — confirm whether this call
# is still needed.
spacy.load('en')
from spacy.lang.en import English
# `parser` is an English language object created without loading a model
# package; a later cell calls it on one bill's text.
parser = English()

# Union of NLTK's English stop words and scikit-learn's ENGLISH_STOP_WORDS.
STOPLIST = set(stopwords.words('english')) | set(ENGLISH_STOP_WORDS)
# Tokens to drop outright: every character of string.punctuation, followed by a
# few multi-character, whitespace and empty leftovers.
# NOTE(review): the closing curly quote is listed twice and the final entry is
# just another empty string — an opening curly quote and a two-apostrophe
# token may have been intended; confirm before changing.
SYMBOLS = list(string.punctuation) + ["-", "...", "”", "”", ' ', '', '``', '--', '''''',]

def tokenizeText(text):
    """Tokenize ``text`` with the shared ``nlp`` pipeline and re-join the kept tokens.

    A token is dropped when its whitespace-stripped text is in STOPLIST, is in
    SYMBOLS, or consists only of digits. The surviving token texts are
    returned as one string separated by single spaces.
    """
    kept = []
    for token in nlp(text):
        word = token.text.strip()
        if word in STOPLIST or word in SYMBOLS or word.isdigit():
            continue
        kept.append(word)
    return u' '.join(kept)



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from multiprocessing import Pool

# NOTE(review): num_partitions is not read anywhere in the visible cells.
num_partitions = 10 #number of partitions to split dataframe
# NOTE(review): this value is replaced by multiprocessing.cpu_count() later in
# this cell, before the Parallel call uses it.
num_cores = 7 #number of cores on your machine


def processBillLink(link):
    """Download a bill's text page and return its tokenized text.

    Parameters
    ----------
    link : str
        URL of a page expected to contain a ``<pre id="billTextContainer">``
        element.

    Returns
    -------
    str
        The element's text, truncated to 999999 characters, lower-cased and
        passed through tokenizeText. Returns '' when the download, the lookup
        of the element, or the tokenization fails; the failure is reported on
        stdout and never raised.
    """
    billText = ''
    try:
        req = urllib.request.Request(link, headers={'User-Agent' : "Magic Browser"})
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(req) as response:
            htmlContent = response.read()

        soup = BeautifulSoup(htmlContent, 'lxml')
        container = soup.find("pre", {"id": "billTextContainer"})
        if container is None:
            print('didnt find', link)
            return billText

        # Truncate before tokenizing so very long bills stay within what the
        # spaCy pipeline accepts.
        billText = tokenizeText(container.text[:999999].lower())
    except Exception as exc:
        # Best effort: one bad page must not abort the whole parallel run.
        # Reset the result so a failure during tokenization cannot leak the
        # raw, untokenized page text, and report why the page was skipped.
        print('didnt find', link, repr(exc))
        billText = ''

    return billText

# processBillLink(df_filtered['link'][1])

# Bill-text URLs in df_filtered's row order.
link_list = df_filtered['link'].tolist()
print(link_list[1])

# One worker per CPU reported by the OS.
num_cores = multiprocessing.cpu_count()
print('num_cores', num_cores)

# NOTE(review): only the first 10 links are submitted, so `results` holds at
# most 10 entries regardless of how many rows df_filtered has.
bill_jobs = (delayed(processBillLink)(url) for url in tqdm(link_list[:10]))
results = Parallel(n_jobs=num_cores)(bill_jobs)


In [None]:
# Spot-check: show the processed text returned for the second link.
print(results[1])

In [None]:
print(results[1])
# `results` can be shorter than df_filtered because the download cell submits
# only a leading slice of the links. Assigning a shorter list to a column
# raises ValueError, so pad the unprocessed rows with '' — the same value
# processBillLink returns when it could not fetch a bill.
missing_rows = len(df_filtered) - len(results)
df_filtered['billText'] = list(results) + [''] * max(missing_rows, 0)
df_filtered.head()

In [None]:
import nltk
# nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize
 
# NOTE(review): `data` is not read by any visible cell.
data = "All work and no play makes jack a dull boy, all work and no play"
# Run the bare English parser over the stored text of the row labelled 1 and print it.
second_bill_doc = parser(df_filtered['billText'][1])
print(second_bill_doc)


# print(tokenizeText(u' '.join(df_filtered['billText'][1])))

In [None]:
# print(results[1])

# Pass every stored bill text through tokenizeText and write the result back.
bill_text_tokenized = df_filtered['billText'].apply(tokenizeText)
df_filtered['billText'] = bill_text_tokenized

In [None]:
%%time

# records = []
link_list = []
# congress = '115'
print('bills',  len(df_votes['legis_num'].unique()))



# FIXME(review): this loop has no body, so the cell cannot be parsed as written,
# and `records` (used below) only appears as a commented-out assignment at the
# top of the cell. The per-bill logic that filled `records` is not visible here
# and has to be restored before this cell can run.
for row in tqdm(df_votes['legis_num'].unique()):

        
# Build the bill table from `records`, report how many bills were collected
# out of the distinct legis_num values, and save the table as CSV.
df_bill = pd.DataFrame(records)
print('found ', len(df_bill), ' out of ', len(df_votes['legis_num'].unique()))
df_bill.to_csv('../data/bill_all.csv')
df_bill.tail()

In [None]:
# Write the bill table to the same CSV path again and show its last rows.
# NOTE(review): the preceding cell already ends with these two statements.
df_bill.to_csv('../data/bill_all.csv')
df_bill.tail()