In [4]:
import xmltodict
import pandas as pd
import os
import re

# Cleaning

In [5]:
def remove_tags(read_file):
    read_file = re.sub(b'</?external-xref[^<>]*>',b'', read_file)
    read_file = re.sub(b'<quote>',b'"', read_file)
    read_file = re.sub(b'</quote>',b'"', read_file)
    read_file = re.sub(b'</?term[^<>]*>',b' ', read_file)
    read_file = re.sub(b'</?pagebreak[^<>]*>',b'', read_file)
    return read_file

In [56]:
def flatten(d):
    output = ''
    for k, v in d.items():
        if not k.startswith("@"):
            if isinstance(v, dict):
                output += flatten(v)
            elif isinstance(v, list):
                for l in v:
                    if isinstance(l, dict):
                        output += flatten(l)
                    else:
                        if l:
                            output += l + ' '
            else:
                if v:
                    output += v + ' '
    return output

In [57]:
def bill_to_dict(filename, doc):
    def _clean_body(bodydict):
        clean_bodydict = {}
        for k, v in bodydict.items():
            if not k.startswith("@"):
                if isinstance(v, list):
                    sec_str = ''
                    for sub_dict in v:
                        if sub_dict:
                            sec_str += flatten(sub_dict)
                    clean_bodydict[k] = sec_str
#                 elif isinstance(v, dict):
                else:
                    clean_bodydict[k] = flatten(v)
#                 else:
#                     print('xxxxxxxxx')
#         print(1)
#         print(clean_bodydict)
        return clean_bodydict
    
    bill_type = list(doc)[0]
    data_dict = {}
    
    data_dict["file-name"] = filename
    data_dict["bill-type"] = bill_type
    
    metadata = {}
    for n in list(doc[bill_type]):
        # group meta data
        if n.startswith("@"): 
            metadata[n] = doc[bill_type][n]
        # unify name for different types
        elif n in ['legis-body', 'resolution-body', 'engrossed-amendment-body']:
            if isinstance(doc[bill_type][n], dict):
                data_dict['body'] = _clean_body(doc[bill_type][n])    
            elif isinstance(doc[bill_type][n], list):
                # just take last one
                data_dict['body'] = _clean_body(doc[bill_type][n][-1])
            else:
                print('NOT dict nor list')
                data_dict['body'] = doc[bill_type][n]
                
            
        elif n == 'engrossed-amendment-form':
            data_dict['form'] = doc[bill_type][n]
        else:
            data_dict[n] = doc[bill_type][n]
    data_dict["metadata"] = metadata
    return data_dict

In [58]:
# test
indir = 'bill_text_115'
tmpdata=[]

with open(os.path.join(indir, 'US_Bill_Text_115_HCONRES1_EH.xml'), 'rb') as f:
    doc = xmltodict.parse(remove_tags(f.read()))
    tmpdata.append(bill_to_dict(filename, doc))
print(tmpdata[0]['body'])

2
That pursuant to clause 4, section 5, article I of the
                Constitution, during the One Hundred Fifteenth Congress the Speaker of the House and the Majority Leader
                of the Senate or their respective designees, acting jointly after consultation with the Minority Leader
                of the House and the Minority Leader of the Senate, may notify the Members of the House and the Senate,
                respectively, to assemble at a place outside the District of Columbia if, in their opinion, the public
                interest shall warrant it.
{'section': 'That pursuant to clause 4, section 5, article I of the\n                Constitution, during the One Hundred Fifteenth Congress the Speaker of the House and the Majority Leader\n                of the Senate or their respective designees, acting jointly after consultation with the Minority Leader\n                of the House and the Minority Leader of the Senate, may notify the Members of the House and 

In [8]:
# I put all the files in one folder for simplicity
indir = 'bill_text_115' 

data = [] # a list of dict, each element is a xml file

for root, dirs, filenames in os.walk(indir):
    for filename in filenames:
        with open(os.path.join(indir, filename), 'rb') as f:
            doc = xmltodict.parse(remove_tags(f.read()))
            data.append(bill_to_dict(filename, doc))

In [9]:
# [list(x) for x in set(tuple(x) for x in data)]

# Select Some Columns

In [10]:
select_keys = ['metadata', 'bill-type', 'body', 'file-name']
select_data = []
for d in data:
    select_data.append({ select_key: d[select_key] for select_key in select_keys })

In [11]:
df = pd.io.json.json_normalize(select_data)

In [12]:
df.head()

Unnamed: 0,bill-type,body.amendment,body.division,body.section,body.title,file-name,metadata.@amend-degree,metadata.@amend-stage,metadata.@amend-type,metadata.@bill-stage,metadata.@bill-type,metadata.@dms-id,metadata.@key,metadata.@public-print,metadata.@public-private,metadata.@resolution-stage,metadata.@resolution-type,metadata.@stage-count,metadata.@star-print
0,bill,,,1. Permanent resident status for Joseph\n ...,,92510.140382952224512.47152.xml,,,,Introduced-in-Senate,,A1,,,private,,,,
1,resolution,,,That it is the sense of Congress that— (1) the...,,US_Bill_Text_115_HCONRES10_IH.xml,,,,,,H6105DDE2819B44DBB5E467A21768B92E,H,,public,Introduced-in-House,house-concurrent,,no-star-print
2,resolution,,,That it is the sense of Congress that— (1) the...,,US_Bill_Text_115_HCONRES11_IH.xml,,,,,,H4AC6CF641CD640E393CFA510D0960D8C,H,,public,Introduced-in-House,house-concurrent,,no-star-print
3,resolution,,,That Congress— (1) supports the designation of...,,US_Bill_Text_115_HCONRES12_IH.xml,,,,,,H4E5CA555E82D40E08E6EDAC6DB9540D3,H,,public,Introduced-in-House,house-concurrent,,no-star-print
4,resolution,,,That Congress should not impose any new perfor...,,US_Bill_Text_115_HCONRES13_IH.xml,,,,,,HE39F52BA0A584A39A31BF533005200CC,H,,public,Introduced-in-House,house-concurrent,,no-star-print


In [13]:
df.to_pickle("data/bills.pkl") 

FileNotFoundError: [Errno 2] No such file or directory: 'data/bills.pkl'

# Simple stats

In [16]:
df['bill-type'].value_counts()

bill             6712
resolution       1312
amendment-doc      15
Name: bill-type, dtype: int64

# Sumy

- sudo easy_install -U pulp
- conda config --add channels conda-forge
- conda install spacy
- python -m spacy download en

In [119]:
# import pulp
import spacy

In [21]:
nlp = spacy.load('en')

In [143]:
body_indir = 'bill_text_body'

def write_doc(row):
    text = row['body.section']
    filename = 'BODY_'+row['file-name'][:-3]+'txt'  
    try:
        doc = nlp(text)
        with open(os.path.join(body_indir, filename), 'w') as f:
            for sent in doc.sents:
                f.write(sent.text+'\n')
    except:
        print('ERROR.\nfilename: {}\ntext:{}'.format(row['file-name'], text))

In [144]:
# run a subset
df1 = df[['body.section', 'file-name']][:10].copy()
df1.apply(write_doc, axis=1)

ERROR.
filename: US_Bill_Text_115_HR1436_IH.xml
text:nan
ERROR.
filename: US_Bill_Text_115_HR3198_IH.xml
text:nan
ERROR.
filename: US_Bill_Text_115_HR610_IH.xml
text:nan
ERROR.
filename: US_Bill_Text_115_S1519_PCS.xml
text:nan
ERROR.
filename: US_Bill_Text_115_S554_IS.xml
text:nan


0       None
1       None
2       None
3       None
4       None
5       None
6       None
7       None
8       None
9       None
10      None
11      None
12      None
13      None
14      None
15      None
16      None
17      None
18      None
19      None
20      None
21      None
22      None
23      None
24      None
25      None
26      None
27      None
28      None
29      None
        ... 
8009    None
8010    None
8011    None
8012    None
8013    None
8014    None
8015    None
8016    None
8017    None
8018    None
8019    None
8020    None
8021    None
8022    None
8023    None
8024    None
8025    None
8026    None
8027    None
8028    None
8029    None
8030    None
8031    None
8032    None
8033    None
8034    None
8035    None
8036    None
8037    None
8038    None
dtype: object

In [127]:
df['body.section'][0]

'1. Permanent resident status for Joseph Gabra and Sharon Kamel (a) In general Notwithstanding subsections (a) and (b) of section 201 of the Immigration and Nationality Act (8 U.S.C. 1151), Joseph Gabra and Sharon Kamel shall each be eligible for issuance of an immigrant visa or for adjustment of status to that of an alien lawfully admitted for permanent residence upon filing an application for issuance of an immigrant visa under section 204 of such Act ( 8 U.S.C. 1154) or for adjustment of status to lawful permanent resident. (b) Adjustment of status Immigration and Nationality Act If Joseph Gabra or Sharon Kamel enters the United States before the filing deadline specified in subsection (c), Joseph Gabra or Sharon Kamel shall be considered to have entered and remained lawfully in the United States and shall be eligible for adjustment of status under section 245 of the (8 U.S.C. 1255) as of the date of the enactment of this Act. (c) Application and payment of fees Subsections (a) and 

In [145]:
from sumy.parsers.plaintext import PlaintextParser #We're choosing a plaintext parser here, other parsers available for HTML etc.
from sumy.nlp.tokenizers import Tokenizer 
from sumy.summarizers.lex_rank import LexRankSummarizer #We're choosing Lexrank, other algorithms are also built in


parser = PlaintextParser.from_string(df['body.section'][0], Tokenizer("english"))
summarizer = LexRankSummarizer()
summary = summarizer(parser.document, 3) #Summarize the document with 5 sentences
for sentence in summary:
    print(sentence)

(b) Adjustment of status Immigration and Nationality Act If Joseph Gabra or Sharon Kamel enters the United States before the filing deadline specified in subsection (c), Joseph Gabra or Sharon Kamel shall be considered to have entered and remained lawfully in the United States and shall be eligible for adjustment of status under section 245 of the (8 U.S.C.
(c) Application and payment of fees Subsections (a) and (b) shall apply only if the applications for the issuance of immigrant visas or the applications for adjustment of status are filed with appropriate fees not later than two years after the date of the enactment of this Act.
1153(a)); or (2) if applicable, the total number of immigrant visas that are made available to natives of the country of birth of Joseph Gabra and Sharon Kamel under section 202(e) of that Act (8 U.S.C.
