In [1]:
# NOTE(review): the wildcard import hides where names come from. walk_dirs,
# get_recent_bills and xmltodict are used below with no other visible import,
# so they presumably come from utils -- confirm and import them explicitly.
from utils import *
import pandas as pd
import re
import warnings

# Silences every warning for the rest of the notebook (e.g. any pandas
# chained-assignment warnings raised by the column assignments further down).
warnings.filterwarnings('ignore')

#### Extract Bills & Summaries

In [3]:
# Every input folder lives under the same root; one sub-folder per measure subtype.
BILLS_ROOT_115 = './data/115/bills'

# Bills
INDIR_115_HR = BILLS_ROOT_115 + '/hr'
INDIR_115_S = BILLS_ROOT_115 + '/s'

# Concurrent Resolutions
INDIR_115_HCONRES = BILLS_ROOT_115 + '/hconres'
INDIR_115_SCONRES = BILLS_ROOT_115 + '/sconres'

# Joint Resolutions
INDIR_115_HJRES = BILLS_ROOT_115 + '/hjres'
INDIR_115_SJRES = BILLS_ROOT_115 + '/sjres'

# Simple Resolutions
INDIR_115_HRES = BILLS_ROOT_115 + '/hres'
INDIR_115_SRES = BILLS_ROOT_115 + '/sres'

# Destination folder for the per-bill files written at the end of the notebook.
OUT_DIR = './out3'

# Processing order: bills, then concurrent, joint and simple resolutions
# (House folder before Senate folder in each pair).
INDIR_ALL = [
    INDIR_115_HR, INDIR_115_S,
    INDIR_115_HCONRES, INDIR_115_SCONRES,
    INDIR_115_HJRES, INDIR_115_SJRES,
    INDIR_115_HRES, INDIR_115_SRES,
]

In [4]:
# Gather one record per item yielded by walk_dirs for every input folder,
# then build a single DataFrame from the accumulated records.
data = []

for bill_dir in INDIR_ALL:
    print('Processing {}'.format(bill_dir))
    data.extend(walk_dirs(bill_dir))

df_115 = pd.DataFrame(data)

Processing ./data/115/bills/hr
Processing ./data/115/bills/s
Processing ./data/115/bills/hconres
Processing ./data/115/bills/sconres
Processing ./data/115/bills/hjres
Processing ./data/115/bills/sjres
Processing ./data/115/bills/hres
Processing ./data/115/bills/sres


In [5]:
# Preview the first rows of the collected records.
df_115.head()

Unnamed: 0,Directory,Number,Subtype,Summary,Type,Version
0,./data/115/bills/hr/hr2615/text-versions/ih,HR2615,HR,1,Bills,IH
1,./data/115/bills/hr/hr2615/text-versions/rh,HR2615,HR,1,Bills,RH
2,./data/115/bills/hr/hr3741/text-versions/ih,HR3741,HR,0,Bills,IH
3,./data/115/bills/hr/hr1914/text-versions/ih,HR1914,HR,1,Bills,IH
4,./data/115/bills/hr/hr1586/text-versions/ih,HR1586,HR,1,Bills,IH


#### Summary Stats, Deduplicate, Filter for Bills with Summaries

In [6]:
# Total records versus distinct bill numbers (a bill can appear once per text version).
n_rows = len(df_115)
n_unique_bills = len(df_115['Number'].unique())
print('Number of rows: {}'.format(n_rows))
print('Number of unique bills: {}'.format(n_unique_bills))

Number of rows: 9390
Number of unique bills: 7626


A `Version` of `N/A` means that no bill text is available, so those rows are not usable.

In [7]:
# Keep only rows whose Version is something other than the 'N/A' placeholder.
has_text = df_115['Version'] != 'N/A'
with_bill_text = df_115[has_text]
n_with_text = len(with_bill_text['Number'].unique())
print('Number of unique bills with bill text: {}'.format(n_with_text))

Number of unique bills with bill text: 7548


We are only interested in bills that have at least one summary.

In [8]:
# Keep only rows whose Summary count is positive.
has_summary = with_bill_text['Summary'] > 0
with_summary = with_bill_text[has_summary]
n_with_summary = len(with_summary['Number'].unique())
print('Number of unique bills with bill text and at least one summary: {}'.format(n_with_summary))

Number of unique bills with bill text and at least one summary: 5192


How many of each type are there?

In [9]:
# Row counts per (Type, Subtype); rows are text versions, so a bill with
# several versions is counted more than once here.
with_summary.groupby(['Type', 'Subtype']).size()

Type                    Subtype
Bills                   HR         3872
                        S          1560
Concurrent Resolutions  HCONRES     118
                        SCONRES      45
Joint Resolutions       HJRES       173
                        SJRES        68
Simple Resolutions      HRES        631
                        SRES        332
dtype: int64

Since there are multiple versions of each bill, choose the most recent one.

In [10]:
# Add the selection flag without writing into the filtered slice: assign()
# returns a new frame, so no chained-assignment warning is raised and
# re-running this cell is idempotent.
with_summary = with_summary.assign(to_use=0)
# get_recent_bills (from utils) is expected to set to_use = 1 on the row to
# keep for each bill -- TODO confirm against its implementation.
recents_marked = get_recent_bills(with_summary)
# Explicit copy so the column assignments made later in the notebook operate
# on an independent frame rather than on a slice of recents_marked.
unique_bills = recents_marked[recents_marked.to_use == 1].copy()

('str date', u'May 2, 2017')
('str date', u'July 18, 2017')
('str date', u'January 3, 2017')
('here dict', './data/115/bills/hr/hr26/text-versions/ih', OrderedDict([(u'bill', OrderedDict([(u'@bill-stage', u'Introduced-in-House'), (u'@bill-type', u'olc'), (u'@dms-id', u'H62D8AD6A768A4B238E77D360D7B58E51'), (u'@key', u'H'), (u'@public-private', u'public'), (u'metadata', OrderedDict([(u'@xmlns:dc', u'http://purl.org/dc/elements/1.1/'), (u'dublinCore', OrderedDict([(u'dc:title', u'115 HR 26 IH: Regulations from the Executive in Need of Scrutiny Act of 2017'), (u'dc:publisher', u'U.S. House of Representatives'), (u'dc:date', None), (u'dc:format', u'text/xml'), (u'dc:language', u'EN'), (u'dc:rights', u'Pursuant to Title 17 Section 105 of the United States Code, this file is not subject to copyright protection and is in the public domain.')]))])), (u'form', OrderedDict([(u'distribution-code', OrderedDict([(u'@display', u'yes'), ('#text', u'I')])), (u'congress', u'115th CONGRESS'), (u'session'

In [11]:
# Preview the rows that were flagged to_use == 1.
unique_bills.head()

Unnamed: 0,Directory,Number,Subtype,Summary,Type,Version,to_use
0,./data/115/bills/hr/hr2615/text-versions/ih,HR2615,HR,1,Bills,IH,1
3,./data/115/bills/hr/hr1914/text-versions/ih,HR1914,HR,1,Bills,IH,1
4,./data/115/bills/hr/hr1586/text-versions/ih,HR1586,HR,1,Bills,IH,1
5,./data/115/bills/hr/hr125/text-versions/ih,HR125,HR,1,Bills,IH,1
8,./data/115/bills/hr/hr2927/text-versions/ih,HR2927,HR,1,Bills,IH,1


In [12]:
# Distribution of text-version codes among the selected rows.
unique_bills.Version.value_counts()

IH     3371
IS     1277
RS      165
ATS     142
EH      108
ENR      96
PCS      15
RH       12
RFS       3
RDS       2
RFH       1
Name: Version, dtype: int64

#### Get & Clean Bill & Summary Texts

In [13]:
def get_clean_bill(row):
    """Read a bill's plain text and return it as one cleaned, single-line string.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Directory', a folder that contains ``document.txt``.

    Returns
    -------
    str
        The text after the last underscore rule, with layout noise removed
        and the remaining non-empty lines joined by single spaces.

    Notes
    -----
    The header markers ("A BILL", "AN ACT", "A ... RESOLUTION") are dropped
    only when they make up a whole line. They used to be removed anywhere in a
    line, case-insensitively, which also deleted ordinary text such as the
    "an act" inside "human activity" or the "a bill" inside "a billion".
    """
    path = row['Directory']
    with open(path+'/document.txt') as f:
        bill_text = f.read()

    # Keep only what follows the last long underscore rule (the whole text if
    # the rule is absent).
    body = bill_text.split('_______________________________________________________________________')[-1]

    # Fragments removed wherever they occur inside a line (case-insensitive so
    # both "Sec. 1. " and "SEC. 1. " are caught).
    # NOTE(review): the '.' in "''." is an unescaped regex wildcard, so the two
    # quotes are removed together with whatever character follows them --
    # confirm whether a literal period was intended.
    inline_noise = re.compile(
        '|'.join(['`', "''.", '<all>', r'Sec\. [0-9]+\. ', 'Subtitle [A-Za-z0-9] ']),
        flags=re.I)
    # Header markers: only recognised when they are the entire line.
    header_line = re.compile(r'(?:A BILL|A [A-Z]+ RESOLUTION|AN ACT)$', flags=re.I)

    kept = []
    for raw_line in body.split('\n'):
        line = raw_line.strip().replace('--', ' ')
        line = inline_noise.sub('', line)
        if line == '' or header_line.match(line.strip()):
            continue
        kept.append(line)

    # Drop enumerators such as "(a)" or "(1)" that directly follow a sentence end.
    bill = re.sub(r'\. \([A-Za-z0-9]\)', '.', ' '.join(kept))
    return bill

In [14]:
def get_title(row):
    """Return the official title of a bill or resolution as a single line.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Directory', a folder that contains ``document.xml``.

    Returns
    -------
    text
        The ``official-title`` text with each line stripped, "(Sec. N)"
        markers removed, empty lines dropped and the rest joined by spaces.
        Returns "" (after printing the parsed value and the path) when the
        title element carries no text.

    Written to run unchanged on Python 2 and Python 3: uses the ``print(...)``
    form already used elsewhere in this notebook and avoids the Python-2-only
    ``unicode`` builtin.
    """
    path = row['Directory']

    with open(path+'/document.xml') as f: # for bills
        data = xmltodict.parse(f.read())

    # The parsed document is keyed by 'bill' or, failing that, 'resolution'.
    root = 'bill' if 'bill' in data else 'resolution'
    title = data[root]['form']['official-title']

    # When the element is parsed into a dict, the text (if any) sits under '#text'.
    if isinstance(title, dict) and "#text" in title:
        title = title['#text']

    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3
    if not isinstance(title, text_type):
        # No usable text (e.g. a dict holding only attributes): report and skip.
        print('{} {}'.format(title, path))
        return ""

    stripped = [line.strip() for line in title.split('\n')]
    without_markers = [re.sub(r'\(Sec\. [0-9]+\)', '', line) for line in stripped]
    title = ' '.join(line for line in without_markers if line != '')
    return title

    

In [15]:
def get_clean_summary(row):
    """Return the bill's summary text flattened to a single line.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'Directory'; ``data.xml`` is read from the folder formed
        by the first six '/'-separated components of that path.

    Returns
    -------
    text
        The summary's '#text' with each line stripped, "(Sec. N)" markers
        removed, empty lines dropped and the rest joined by single spaces.
    """
    # NOTE(review): the fixed depth of six assumes paths shaped like
    # './data/115/bills/hr/hr2615/text-versions/ih' -- verify if the data
    # root ever moves.
    bill_dir = '/'.join(row['Directory'].split('/')[0:6])
    with open(bill_dir+'/data.xml') as f:
        parsed = xmltodict.parse(f.read())
    raw_summary = parsed['bill']['summary']['#text']

    pieces = []
    for line in raw_summary.split('\n'):
        cleaned = re.sub(r'\(Sec\. [0-9]+\)', '', line.strip())
        if cleaned != '':
            pieces.append(cleaned)
    return ' '.join(pieces)

In [16]:
# Cleaned bill text for every selected row (reads document.txt from each Directory).
unique_bills['bill'] = unique_bills.apply(get_clean_bill, axis=1)

In [17]:
# Official title for every selected row; rows without title text get "" and are printed.
unique_bills['title'] = unique_bills.apply(get_title, axis=1)

OrderedDict([(u'@display', u'no')]) ./data/115/bills/hres/hres9/text-versions/eh
OrderedDict([(u'@display', u'no')]) ./data/115/bills/hres/hres45/text-versions/eh
OrderedDict([(u'@display', u'no')]) ./data/115/bills/hres/hres52/text-versions/eh


In [18]:
# Flattened summary text for every selected row (reads data.xml from the bill folder).
unique_bills['summary'] = unique_bills.apply(get_clean_summary, axis=1)

#### Find Budget-related Bills

In [20]:
# Substrings whose presence marks a bill as budget-related.
budget_words = ['budget', 'fund', 'appropriat']

def is_budget(row):
    """Return 1 if the row's 'bill' text contains any budget word, else 0.

    The words are matched as plain substrings, so 'fund' also hits e.g.
    'refund'.

    NOTE(review): matching is case-sensitive, so 'Budget' or 'FUND' alone do
    not count -- confirm that this is intended.
    """
    pattern = '|'.join(budget_words)
    found = re.search(pattern, row['bill'])
    return int(found is not None)

In [21]:
# 1/0 flag per row: does the cleaned bill text mention any budget word?
unique_bills['is_budget'] = unique_bills.apply(is_budget, axis=1)

In [22]:
# Subset of the selected rows that were flagged as budget-related.
flagged = unique_bills['is_budget'] == 1
budget_only = unique_bills[flagged]
budget_only.head()

Unnamed: 0,Directory,Number,Subtype,Summary,Type,Version,to_use,bill,title,summary,is_budget
0,./data/115/bills/hr/hr2615/text-versions/ih,HR2615,HR,1,Bills,IH,1,To authorize the exchange of certain land loca...,To authorize the exchange of certain land loca...,Gulf Islands National Seashore Land Exchange A...,1
4,./data/115/bills/hr/hr1586/text-versions/ih,HR1586,HR,1,Bills,IH,1,"To amend the Federal Food, Drug, and Cosmetic ...","To amend the Federal Food, Drug, and Cosmetic ...",Protecting Our Kids' Medicine Act of 2017 This...,1
10,./data/115/bills/hr/hr259/text-versions/ih,HR259,HR,1,Bills,IH,1,To prevent the territories of the United State...,To prevent the territories of the United State...,This bill amends the Health Care and Education...,1
13,./data/115/bills/hr/hr2652/text-versions/ih,HR2652,HR,1,Bills,IH,1,To direct the Secretary of Veterans Affairs to...,To direct the Secretary of Veterans Affairs to...,Veteran Overmedication Prevention Act of 2017 ...,1
20,./data/115/bills/hr/hr3094/text-versions/ih,HR3094,HR,1,Bills,IH,1,To authorize a national grant program for on-t...,To authorize a national grant program for on-t...,On-the-Job Training Act of 2017 This bill amen...,1


How many budget bills are there and of what kind?

In [23]:
# Row count of the budget-flagged subset.
print('Number of budget bills: {}'.format(len(budget_only)))

Number of budget bills: 2411


In [24]:
# Break the budget-flagged rows down by Type and Subtype.
budget_only.groupby(['Type', 'Subtype']).size()

Type                    Subtype
Bills                   HR         1396
                        S           603
Concurrent Resolutions  HCONRES      35
                        SCONRES      10
Joint Resolutions       HJRES        35
                        SJRES        10
Simple Resolutions      HRES        211
                        SRES        111
dtype: int64

In [25]:
# Save the budget-flagged rows as a tab-separated, UTF-8 encoded file.
# NOTE(review): this writes under "out/" while the title files written later
# go to OUT_DIR ('./out3') -- confirm which directory is intended. Nothing in
# this notebook creates the "out" folder.
budget_only.to_csv("out/budget_only.tsv", sep='\t', encoding='utf-8')

### budget_only_bills_title is stored in the out directory, which will also hold the bills and summaries and will be used for training and testing

In [52]:
# Write each selected bill's title to its own file:
#   OUT_DIR/TITLE_<path component 2>_<Number>_<Version>.out
# NOTE(review): OUT_DIR must already exist; nothing in this notebook creates it.
total = len(unique_bills)
for count, (index, row) in enumerate(unique_bills.iterrows()):
    # For paths shaped like './data/115/...', component 2 is '115'.
    origPath = row['Directory'].strip().split("/")
    newPath = OUT_DIR+"/TITLE_"+origPath[2]+"_"+row['Number']+"_"+row['Version']+".out"
    # Binary mode with explicitly encoded bytes: same bytes on disk as before,
    # and also valid on Python 3, where text-mode files reject bytes.
    with open(newPath, "wb") as f:
        f.write(row['title'].encode("utf-8"))
    # Report progress (percentage of rows handled so far) every 500 rows.
    if count % 500 == 0:
        print('Progress {}'.format(100.0*count/total))
    

Progress 0.0
Progress 9.63020030817
Progress 19.2604006163
Progress 28.8906009245
Progress 38.5208012327
Progress 48.1510015408
Progress 57.781201849
Progress 67.4114021572
Progress 77.0416024653
Progress 86.6718027735
Progress 96.3020030817
