In [None]:
from utils import *
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

#### Extract Bills & Summaries

In [None]:
# Every 115th-Congress input directory, in the order it is processed.
INDIR_ALL = [
    # Bills
    './data/115/bills/hr',
    './data/115/bills/s',
    # Concurrent Resolutions
    './data/115/bills/hconres',
    './data/115/bills/sconres',
    # Joint Resolutions
    './data/115/bills/hjres',
    './data/115/bills/sjres',
    # Simple Resolutions
    './data/115/bills/hres',
    './data/115/bills/sres',
]

# Individual names for each directory, unpacked in the same order as the list above.
(INDIR_115_HR, INDIR_115_S,
 INDIR_115_HCONRES, INDIR_115_SCONRES,
 INDIR_115_HJRES, INDIR_115_SJRES,
 INDIR_115_HRES, INDIR_115_SRES) = INDIR_ALL

In [None]:
# Gather every item walk_dirs produces for each input directory into one
# list, then build a single DataFrame from it.
data = []

for i in INDIR_ALL:
    print('Processing {}'.format(i))
    data.extend(walk_dirs(i))

df_115 = pd.DataFrame(data)

In [None]:
# Preview the first rows of the combined frame.
df_115.head()

#### Summary Stats, Deduplicate, Filter for Bills with Summaries

In [None]:
# Total rows versus distinct values of the Number column.
print('Number of rows: {}'.format(df_115.shape[0]))
print('Number of unique bills: {}'.format(len(df_115['Number'].unique())))

A `Version` of `N/A` means the bill has no text, so those rows are not usable.

In [None]:
# Keep only rows whose Version is something other than the 'N/A' placeholder.
with_bill_text = df_115.loc[df_115['Version'] != 'N/A']
print('Number of unique bills with bill text: {}'.format(len(with_bill_text['Number'].unique())))

We are only interested in bills that have at least one summary.

In [None]:
# Keep only rows whose Summary value is greater than zero.
with_summary = with_bill_text.loc[with_bill_text['Summary'] > 0]
print('Number of unique bills with bill text and at least one summary: {}'.format(len(with_summary['Number'].unique())))

How many of each type are there?

In [None]:
# Row counts for each (Type, Subtype) combination.
with_summary.groupby(['Type', 'Subtype']).size()

Since there are multiple versions of each bill, choose the most recent one.

In [None]:
# with_summary is a filtered slice of an earlier frame; adding a column to a
# slice is chained assignment (SettingWithCopyWarning). assign() returns a new
# frame carrying the extra column instead.
with_summary = with_summary.assign(to_use=0)
recents_marked = get_recent_bills(with_summary)
# Explicit copy: later cells add 'bill', 'title', 'summary' and 'is_budget'
# columns to unique_bills, which must not be a view of recents_marked.
unique_bills = recents_marked[recents_marked.to_use == 1].copy()

In [None]:
# Preview the first rows of the bills selected for use.
unique_bills.head()

In [None]:
# How many selected rows carry each Version value.
unique_bills.Version.value_counts()

#### Get & Clean Bill & Summary Texts

In [None]:
def get_clean_bill(row):
    """Return the cleaned, single-line text of the bill in ``row['Directory']``.

    Reads ``document.txt`` from that directory and:

    * keeps only what follows the last long underscore rule (the whole file
      when there is no rule);
    * strips every line and replaces ``--`` with a space;
    * removes, case-insensitively: backticks, two apostrophes plus the
      character after them, ``<all>``, ``Sec. N. `` and ``Subtitle X ``
      markers, and the "A BILL" / "A <word> RESOLUTION" / "AN ACT" banners;
    * drops empty lines and joins the rest with single spaces;
    * turns enumerators such as ``. (a)`` into ``.``.

    The three banners are matched on word boundaries so that ordinary words
    are left intact ("a billion" must not be reduced to "ion").
    """
    path = row['Directory']
    with open(path + '/document.txt') as f:
        bill_text = f.read()

    body = bill_text.split('_______________________________________________________________________')[-1]

    noise = re.compile('|'.join(['`', "''.", '<all>', r'Sec\. [0-9]+\. ', 'Subtitle [A-Za-z0-9] ',
                                 r'\bA BILL\b', r'\bA [A-Z]+ RESOLUTION\b', r'\bAN ACT\b']),
                       flags=re.I)

    kept = []
    for raw_line in body.split('\n'):
        line = noise.sub('', raw_line.strip().replace('--', ' '))
        if line != '':
            kept.append(line)

    return re.sub(r'\. \([A-Za-z0-9]\)', '.', ' '.join(kept))

In [None]:
def get_clean_bill(row):
    """Return the cleaned, single-line text of the bill in ``row['Directory']``.

    Reads ``document.txt`` from that directory and:

    * keeps only what follows the last long underscore rule (the whole file
      when there is no rule);
    * strips every line and replaces ``--`` with a space;
    * removes, case-insensitively: backticks, two apostrophes plus the
      character after them, ``<all>``, ``Sec. N. `` and ``Subtitle X ``
      markers, and the "A BILL" / "A <word> RESOLUTION" / "AN ACT" banners;
    * drops empty lines and joins the rest with single spaces;
    * turns enumerators such as ``. (a)`` into ``.``.

    The three banners are matched on word boundaries so that ordinary words
    are left intact ("a billion" must not be reduced to "ion").
    """
    path = row['Directory']
    # The cleaning below works on the plain-text rendering of the bill, so
    # read document.txt and bind it to the name the rest of the body uses.
    with open(path + '/document.txt') as f:
        bill_text = f.read()

    body = bill_text.split('_______________________________________________________________________')[-1]

    noise = re.compile('|'.join(['`', "''.", '<all>', r'Sec\. [0-9]+\. ', 'Subtitle [A-Za-z0-9] ',
                                 r'\bA BILL\b', r'\bA [A-Z]+ RESOLUTION\b', r'\bAN ACT\b']),
                       flags=re.I)

    kept = []
    for raw_line in body.split('\n'):
        line = noise.sub('', raw_line.strip().replace('--', ' '))
        if line != '':
            kept.append(line)

    return re.sub(r'\. \([A-Za-z0-9]\)', '.', ' '.join(kept))



In [None]:
def get_title(row):
    """Return the official title of the measure stored in ``row['Directory']``.

    Parses ``document.xml`` in that directory with ``xmltodict`` and reads
    ``form`` -> ``official-title`` under the ``bill`` root, or under
    ``resolution`` when the document has no ``bill`` root. Each line of the
    title is stripped, ``(Sec. N)`` markers are removed, empty lines are
    dropped and the rest is joined with single spaces.

    If the title is not a text value (after unwrapping a dict's ``#text``
    entry when present), the value and the path are printed and an empty
    string is returned.
    """
    # The text type is `unicode` on Python 2; Python 3 only has `str`.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    path = row['Directory']
    with open(path + '/document.xml') as f:
        data = xmltodict.parse(f.read())

    root = 'bill' if 'bill' in data else 'resolution'
    title = data[root]['form']['official-title']

    # When the title parsed to a dict, use its '#text' entry if it has one.
    if isinstance(title, dict) and '#text' in title:
        title = title['#text']
    if not isinstance(title, text_type):
        # Function-call form prints "<title> <path>" on Python 2 and 3 alike.
        print('{} {}'.format(title, path))
        return ""

    cleaned = [re.sub(r'\(Sec\. [0-9]+\)', '', line.strip()) for line in title.split('\n')]
    return ' '.join(line for line in cleaned if line != '')

    

In [None]:
def get_clean_summary(row):
    """Return the cleaned, single-line summary for the bill in ``row['Directory']``.

    ``data.xml`` is read from the folder formed by the first six
    '/'-separated components of the directory path and parsed with
    ``xmltodict``; the text is taken from ``bill`` -> ``summary`` -> ``#text``.
    Each line is stripped, ``(Sec. N)`` markers are removed, empty lines are
    dropped and the remainder is joined with single spaces.
    """
    bill_root = '/'.join(row['Directory'].split('/')[0:6])
    with open(bill_root + '/data.xml') as f:
        parsed = xmltodict.parse(f.read())

    section_marker = re.compile(r'\(Sec\. [0-9]+\)')
    pieces = []
    for raw_line in parsed['bill']['summary']['#text'].split('\n'):
        cleaned = section_marker.sub('', raw_line.strip())
        if cleaned != '':
            pieces.append(cleaned)
    return ' '.join(pieces)

In [None]:
# Cleaned bill text for every selected row (get_clean_bill is called once per row).
unique_bills['bill'] = unique_bills.apply(get_clean_bill, axis=1)

In [None]:
# Official title for every selected row (get_title is called once per row).
unique_bills['title'] = unique_bills.apply(get_title, axis=1)

In [None]:
# Cleaned summary for every selected row (get_clean_summary is called once per row).
unique_bills['summary'] = unique_bills.apply(get_clean_summary, axis=1)

In [None]:
# Spot-check one extracted title.
# NOTE(review): [3] is resolved against the frame's index, so this raises if
# that entry was filtered out earlier - confirm the intended row.
unique_bills['title'][3]

#### Find Budget-related Bills

In [None]:
# Word stems whose presence marks a bill as budget-related.
budget_words = ['budget', 'fund', 'appropriat']


def is_budget(row):
    """Return 1 if ``row['bill']`` contains any stem in ``budget_words``, else 0.

    The search is case-sensitive and matches anywhere in the text, including
    inside longer words.
    """
    pattern = '|'.join(budget_words)
    return 1 if re.search(pattern, row['bill']) else 0

In [None]:
# 1/0 budget flag for every selected row (is_budget is called once per row).
unique_bills['is_budget'] = unique_bills.apply(is_budget, axis=1)

In [None]:
# Keep only the rows flagged as budget-related, then preview them.
budget_only = unique_bills.loc[unique_bills['is_budget'] == 1]
budget_only.head()

How many budget bills are there and of what kind?

In [None]:
# Number of rows in the budget-only frame.
print('Number of budget bills: {}'.format(budget_only.shape[0]))

In [None]:
# Row counts of budget-flagged bills for each (Type, Subtype) combination.
budget_only.groupby(['Type', 'Subtype']).size()

In [None]:
# Write the budget-only frame as a tab-separated, UTF-8 encoded file.
# NOTE(review): nothing in this notebook creates the out/ directory - confirm it exists before running.
budget_only.to_csv("out/budget_only.tsv", sep='\t', encoding='utf-8')