# Bill Parsing

Contains code to parse the raw XML files containing congressional bills. Functions defined here are also found in utils.py for later use.

In [25]:
import xmltodict
import pandas as pd
import numpy as np
import os
import re
from nltk.tokenize import sent_tokenize, word_tokenize

### Cleaning

In [2]:
def remove_tags(read_file):
    read_file = re.sub(b'</?external-xref[^<>]*>',b'', read_file)
    read_file = re.sub(b'<quote>',b'"', read_file)
    read_file = re.sub(b'</quote>',b'"', read_file)
    read_file = re.sub(b'</?term[^<>]*>',b' ', read_file)
    read_file = re.sub(b'</?pagebreak[^<>]*>',b'', read_file)
    return read_file

In [3]:
def flatten(d):
    output = ''
    for k, v in d.items():
        if not k.startswith("@"):
            if isinstance(v, dict):
                output += flatten(v)
            elif isinstance(v, list):
                for l in v:
                    if isinstance(l, dict):
                        output += flatten(l)
                    else:
                        if l:
                            output += l + ' '
            else:
                if v:
                    output += v + ' '
    return output

In [4]:
def section_bill(d):
    global section
    if isinstance(d, dict):
        if 'section' in d.keys():
            section += 1
        if 'subsection' in d.keys():
            section += 1
        if 'paragraph' in d.keys():
            section += len(d['paragraph'])
        for k, v in d.items():
            if not k.startswith("@"):
                section_bill(v)
    elif isinstance(d, list):
        for l in d:
            section_bill(l)
    else:
        if d:
            section = section
    return section

In [5]:
def length_bill(text):
    word = word_tokenize(text)
    word = len([s for s in word if re.match(r'.*[A-Za-z0-9].*',s)])
    # Use period to detect sentence or semicolon?
    sent_tokenize_list = sent_tokenize(text)
    sentence = len(sent_tokenize_list)
    return sentence, word

In [11]:
def bill_to_dict(filename, doc):
    def _clean_body(bodydict):
        clean_bodydict = {}
        for k, v in bodydict.items():
            if not k.startswith("@"):
                if isinstance(v, list):
                    sec_str = ''
                    for sub_dict in v:
                        if sub_dict:
                            sec_str += flatten(sub_dict)
                    clean_bodydict[k] = sec_str
                else:
                    clean_bodydict[k] = flatten(v)
        return clean_bodydict
    
    bill_type = list(doc)[0]
    data_dict = {}
    
    data_dict["file-name"] = filename
    data_dict["bill-type"] = bill_type
    data_dict['official-title'] = None
    data_dict['legis-type'] = None
    data_dict['dc:title'] = None
    data_dict['legis-num'] = None
    
    metadata = {}
    for n in list(doc[bill_type]):
        # group meta data
        if n.startswith("@"): 
            metadata[n] = doc[bill_type][n]
        
        # unify name for different types
        elif n in ['legis-body', 'resolution-body', 'engrossed-amendment-body']:
            if isinstance(doc[bill_type][n], dict):
                data_dict['body'] = _clean_body(doc[bill_type][n])  
                data_dict['whole_body'] = flatten(doc[bill_type][n])
            elif isinstance(doc[bill_type][n], list):
                # just take last one
                data_dict['body'] = _clean_body(doc[bill_type][n][-1])
                data_dict['whole_body'] = flatten(doc[bill_type][n][-1])
            else:
                print('NOT dict nor list')
                data_dict['body'] = doc[bill_type][n]
            
            ## calculate the length of bills in paragraphs, sentences and words
            global section
            section = 0
            if isinstance(doc[bill_type][n], dict):
                section = section_bill(doc[bill_type][n])
            elif isinstance(doc[bill_type][n], list):
                section = section_bill(doc[bill_type][n][-1])
            else:
                section = 0
            data_dict['section'] = section
            try:
                sentence, word = length_bill(data_dict['whole_body'])
            except Exception as e:         
                print("While counting length, the error occurs: {}".format(e))
            finally:
                data_dict['sentence'] = sentence
                data_dict['word'] = word
                
        elif n == 'engrossed-amendment-form':
            data_dict['form'] = doc[bill_type][n]
            
        ## add fields legis-type, official-title from 'form'
        elif n == 'form':
            try:
                data_dict['legis-type'] = doc[bill_type][n]['legis-type']
                data_dict['legis-num'] = doc[bill_type][n]['legis-num']
                data_dict['official-title'] = doc[bill_type][n]['official-title']
                if isinstance(data_dict['official-title'], dict):
                    data_dict['official-title'] = data_dict['official-title']['#text']
                data_dict['official-title'] = re.compile(r'[\n\r\t]').sub("", data_dict['official-title'])
                if isinstance(data_dict['legis-type'], dict):
                    data_dict['legis-type'] = data_dict['legis-type']['#text']
                if isinstance(data_dict['legis-num'], dict):
                    data_dict['legis-num'] = data_dict['legis-num']['#text']
            except Exception as e:
                print("Do not exist %s"%e)
                if e == '#text':
                    data_dict['official-title'] = None
                    
        ## add field dc:title from 'metadata'
        elif n == 'metadata':
            try:
                data_dict['dc:title'] = doc[bill_type][n]['dublinCore']['dc:title']
            except Exception as e:
                print("Do not exist %s"%e)
                data_dict['dc:title'] = None
        else:
            data_dict[n] = doc[bill_type][n]

    data_dict["metadata"] = metadata
    return data_dict

### Create Bills DataFrame

In [12]:
indir = './data/bills' 

data = []

count=0
for root, dirs, filenames in os.walk(indir):
    for filename in filenames:
        if count%1000==0:
            print(count)
        with open(os.path.join(indir, filename), 'rb') as f:
            doc = xmltodict.parse(remove_tags(f.read()))
            data.append(bill_to_dict(filename, doc))
        count+=1

0
1000
2000
3000
4000
5000
Do not exist '#text'
Do not exist '#text'
Do not exist '#text'
6000
7000
8000


In [13]:
# choose relevant columns
select_keys = ['metadata', 'bill-type', 'body', 'file-name','dc:title', 'official-title', 'legis-type', 'legis-num',
               'section', 'sentence', 'word']
select_data = []
for d in data:
    select_data.append({ select_key: d[select_key] for select_key in select_keys })
df = pd.io.json.json_normalize(select_data)

In [14]:
df.head()

Unnamed: 0,bill-type,body.amendment,body.division,body.section,body.title,dc:title,file-name,legis-num,legis-type,legis-type.#text,...,metadata.@public-private,metadata.@resolution-stage,metadata.@resolution-type,metadata.@stage-count,metadata.@star-print,official-title,official-title.@display,section,sentence,word
0,bill,,,1. Permanent resident status for Joseph\n\t\t\...,,115 S556 IS: For the relief of Joseph Gabra an...,92510.140382952224512.47152.xml,S. 556,A BILL,,...,private,,,,,For the relief of Joseph Gabra and Sharon Kamel.,,4,11,390
1,resolution,,,That it is the sense of Congress that— (1) the...,,115 HCON 10 IH: Expressing the sense of the Co...,US_Bill_Text_115_HCONRES10_IH.xml,H. CON. RES. 10,CONCURRENT RESOLUTION,,...,public,Introduced-in-House,house-concurrent,,no-star-print,Expressing the sense of the Congress that tax-...,,5,1,126
2,resolution,,,That it is the sense of Congress that— (1) the...,,115 HCON 11 IH: Expressing the sense of Congre...,US_Bill_Text_115_HCONRES11_IH.xml,H. CON. RES. 11,CONCURRENT RESOLUTION,,...,public,Introduced-in-House,house-concurrent,,no-star-print,Expressing the sense of Congress that Jerusale...,,3,1,38
3,resolution,,,That Congress— (1) supports the designation of...,,115 HCON 12 IH: Supporting the designation of ...,US_Bill_Text_115_HCONRES12_IH.xml,H. CON. RES. 12,CONCURRENT RESOLUTION,,...,public,Introduced-in-House,house-concurrent,,no-star-print,Supporting the designation of the week of Sept...,,4,1,80
4,resolution,,,That Congress should not impose any new perfor...,,115 HCON 13 IH: Supporting the Local Radio Fre...,US_Bill_Text_115_HCONRES13_IH.xml,H. CON. RES. 13,CONCURRENT RESOLUTION,,...,public,Introduced-in-House,house-concurrent,,no-star-print,Supporting the Local Radio Freedom Act.,,1,1,43


In [15]:
df.to_pickle("data/bills.pkl")

# Simple stats

In [16]:
df['bill-type'].value_counts()

bill             6712
resolution       1312
amendment-doc      15
Name: bill-type, dtype: int64