In [62]:
from utils import *
import pandas as pd
import re

#### Extract Bills & Summaries

In [12]:
# Bills
INDIR_115_HR = './data/115/bills/hr'
INDIR_115_S = './data/115/bills/s'

# Concurrent Resolutions
INDIR_115_HCONRES = './data/115/bills/hconres'
INDIR_115_SCONRES = './data/115/bills/sconres'

# Joint Resolutions
INDIR_115_HJRES = './data/115/bills/hjres'
INDIR_115_SJRES = './data/115/bills/sjres'

# Simple Resolutions
INDIR_115_HRES = './data/115/bills/hres'
INDIR_115_SRES = './data/115/bills/sres'

INDIR_ALL = [INDIR_115_HR, INDIR_115_S, 
              INDIR_115_HCONRES, INDIR_115_SCONRES, 
              INDIR_115_HJRES, INDIR_115_SJRES, 
              INDIR_115_HRES, INDIR_115_SRES
             ]

In [13]:
data = []

for i in INDIR_ALL:
    print('Processing {}'.format(i))
    for d in walk_dirs(i):
        data.append(d)
        
df_115 = pd.DataFrame(data)

Processing ./data/115/bills/hr
Processing ./data/115/bills/s
Processing ./data/115/bills/hconres
Processing ./data/115/bills/sconres
Processing ./data/115/bills/hjres
Processing ./data/115/bills/sjres
Processing ./data/115/bills/hres
Processing ./data/115/bills/sres


In [14]:
df_115.head()

Unnamed: 0,Directory,Number,Subtype,Summary,Type,Version
0,./data/115/bills/hr/hr1/text-versions/ih,HR1,HR,1,Bills,IH
1,./data/115/bills/hr/hr10/text-versions/eh,HR10,HR,1,Bills,EH
2,./data/115/bills/hr/hr10/text-versions/ih,HR10,HR,1,Bills,IH
3,./data/115/bills/hr/hr10/text-versions/rfs,HR10,HR,1,Bills,RFS
4,./data/115/bills/hr/hr10/text-versions/rh,HR10,HR,1,Bills,RH


#### Summary Stats, Deduplicate, Filter for Bills with Summaries

In [15]:
print('Number of rows: {}'.format(len(df_115)))
print('Number of unique bills: {}'.format(len(df_115.Number.unique())))

Number of rows: 9390
Number of unique bills: 7626


When Version = N/A it means there are no bill texts, so these are not usable.

In [16]:
with_bill_text = df_115[df_115.Version != 'N/A']
print('Number of unique bills with bill text: {}'.format(len(with_bill_text.Number.unique())))

Number of unique bills with bill text: 7548


We are only interested in bills that have at least one summary.

In [17]:
with_summary = with_bill_text[with_bill_text.Summary > 0]
print('Number of unique bills with bill text and at least one summary: {}'.format(len(with_summary.Number.unique())))

Number of unique bills with bill text and at least one summary: 5192


How many of each type are there?

In [18]:
with_summary.groupby(['Type', 'Subtype']).size()

Type                    Subtype
Bills                   HR         3872
                        S          1560
Concurrent Resolutions  HCONRES     118
                        SCONRES      45
Joint Resolutions       HJRES       173
                        SJRES        68
Simple Resolutions      HRES        631
                        SRES        332
dtype: int64

Since there are multiple versions of each bill, choose the most recent one.

In [19]:
with_summary['to_use'] = 0
recents_marked = get_recent_bills(with_summary)
unique_bills = recents_marked[recents_marked.to_use == 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
  df = with_summary[with_summary.Number == bill_no][with_summary.Version != 'EAS']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
  idx = with_summary[with_summary.Number == bill_no][with_summary.Version == to_use].index
  idx = with_summary[with_summary.Number == bill_no][with_summary.Version == 'ENR'].index


In [20]:
unique_bills.head()

Unnamed: 0,Directory,Number,Subtype,Summary,Type,Version,to_use
0,./data/115/bills/hr/hr1/text-versions/ih,HR1,HR,1,Bills,IH,1
1,./data/115/bills/hr/hr10/text-versions/eh,HR10,HR,1,Bills,EH,1
5,./data/115/bills/hr/hr100/text-versions/ih,HR100,HR,1,Bills,IH,1
6,./data/115/bills/hr/hr1000/text-versions/ih,HR1000,HR,1,Bills,IH,1
7,./data/115/bills/hr/hr1001/text-versions/ih,HR1001,HR,1,Bills,IH,1


In [21]:
unique_bills.Version.value_counts()

IH     3063
IS     1364
EH      444
ATS     146
ENR      96
ES       54
PCS      12
RH       12
CPS       1
Name: Version, dtype: int64

#### Get & Clean Bill & Summary Texts

In [202]:
def get_clean_bill(row):
    path = row['Directory']
    with open(path+'/document.txt') as f:
        bill_text = f.read()
    text1 = bill_text.split('_______________________________________________________________________')[-1]
    text2 = [i.strip().rstrip() for i in text1.split('\n')]
    text3 = [re.sub('--', ' ', i) for i in text2]
    exclude = '|'.join(['`', "''.", '<all>', 'Sec\. [0-9]+\. ', 'Subtitle [A-Za-z0-9] ', 'A BILL', 
                        'A [A-Z]+ RESOLUTION', 'AN ACT'])
    text4 = [re.sub(exclude, '', i, flags=re.I) for i in text3]
    text5 = [i for i in text4 if i!='']
    bill = re.sub('\. \([A-Za-z0-9]\)', '.', ' '.join(text5))
    return bill

In [190]:
def get_clean_summary(row):
    path = row['Directory']
    with open('/'.join(path.split('/')[0:6])+'/data.xml') as f:
        dict1 = xmltodict.parse(f.read())
    summary_text = dict1['bill']['summary']['#text']
    text6 = [i.strip().rstrip() for i in summary_text.split('\n')]
    text7 = [re.sub('\(Sec\. [0-9]+\)', '', i) for i in text6]
    text8 = [i for i in text7 if i!='']
    summary = ' '.join(text8)
    return summary

In [203]:
unique_bills['bill'] = unique_bills.apply(lambda x: get_clean_bill(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [193]:
unique_bills['summary'] = unique_bills.apply(lambda x: get_clean_summary(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### Find Budget-related Bills

In [198]:
budget_words = ['budget', 'fund', 'appropriat']
def is_budget(row):
    bill = row['bill']
    if re.search('|'.join(budget_words), bill) is None:
        return 0
    else:
        return 1

In [200]:
unique_bills['is_budget'] = unique_bills.apply(lambda x: is_budget(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [205]:
budget_only = unique_bills[unique_bills.is_budget == 1]
budget_only.head()

Unnamed: 0,Directory,Number,Subtype,Summary,Type,Version,to_use,bill,summary,is_budget
0,./data/115/bills/hr/hr1/text-versions/ih,HR1,HR,1,Bills,IH,1,To provide for reconciliation pursuant to titl...,Tax Cuts and Jobs Act This bill amends the Int...,1
5,./data/115/bills/hr/hr100/text-versions/ih,HR100,HR,1,Bills,IH,1,"To amend title 23, United States Code, to modi...",Support Local Transportation Act This bill rev...,1
6,./data/115/bills/hr/hr1000/text-versions/ih,HR1000,HR,1,Bills,IH,1,To establish the National Full Employment Trus...,Humphrey-Hawkins 21st Century Full Employment ...,1
8,./data/115/bills/hr/hr1002/text-versions/ih,HR1002,HR,1,Bills,IH,1,"To authorize a National Heritage Area Program,...",National Heritage Area Act of 2017 This bill e...,1
19,./data/115/bills/hr/hr1006/text-versions/ih,HR1006,HR,1,Bills,IH,1,To clarify the rights of all persons who are h...,This bill amends the Immigration and Nationali...,1


How many budget bills are there and of what kind?

In [206]:
print('Number of budget bills: {}'.format(len(budget_only)))

Number of budget bills: 2323


In [207]:
budget_only.groupby(['Type', 'Subtype']).size()

Type                    Subtype
Bills                   HR         1258
                        S           654
Concurrent Resolutions  HCONRES      33
                        SCONRES      10
Joint Resolutions       HJRES        35
                        SJRES         9
Simple Resolutions      HRES        210
                        SRES        114
dtype: int64