In [1]:
import xmltodict
import pandas as pd
import os

In [2]:
# Reorganize the parsed XML of one file into a single dict:
# 1) Add "file-name": just the file name.
# 2) Add "bill-type": the root tag; there are three types: 'bill', 'resolution', 'amendment-doc'.
# 3) Group all "meta" data (keys that the parser prefixes with "@") under "metadata".
def bill_to_dict(filename, doc):
    """Turn one parsed XML document into a single record dict.

    Parameters
    ----------
    filename : str
        Name of the file the document came from; stored under "file-name".
    doc : mapping
        Parsed document whose first key is the root tag and whose value is a
        mapping of that root element's attributes and children.

    Returns
    -------
    dict
        "file-name" and "bill-type" (the root tag), followed by every
        root-level entry whose key does not start with "@", and finally
        "metadata": a dict of all root-level entries whose key does start
        with "@".
    """
    root_tag = list(doc)[0]
    body = doc[root_tag]
    keys = list(body)

    record = {"file-name": filename, "bill-type": root_tag}

    # Keys starting with "@" are collected separately instead of becoming
    # top-level entries of the record.
    attributes = {key: body[key] for key in keys if key.startswith("@")}
    record.update((key, body[key]) for key in keys if not key.startswith("@"))

    record["metadata"] = attributes
    return record

In [3]:
# All bill XML files were put under one folder for simplicity
# (path is relative to the working directory).
indir = 'bill_text_115'

data = []  # a list of dicts, one per successfully converted XML file

for root, dirs, filenames in os.walk(indir):
    for filename in filenames:
        # Join with `root`, not `indir`: os.walk also descends into
        # sub-folders, and a file found there lives under `root`.
        path = os.path.join(root, filename)
        try:
            with open(path, 'rb') as f:
                doc = xmltodict.parse(f.read())
            data.append(bill_to_dict(filename, doc))
        except Exception as err:
            # Report and skip files that cannot be read, parsed or converted.
            # Catching Exception (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit still stop the cell.
            print(f"{path}: {err!r}")

### Tried two ways to convert to dataframe
#### 1. pandas.io.json.json_normalize
- This flattens the nested structure. However, because some tags are nested several levels deep (children, grandchildren and great-grandchildren) and some bills carry additional information, the number of columns is large, so the values shown in the head below are all NA.

In [4]:
# json_normalize flattens nested dicts into dotted column names.
# Prefer the top-level pd.json_normalize (pandas >= 1.0); the old
# pd.io.json.json_normalize location is deprecated and is only used here as
# a fallback for older pandas versions that lack the top-level function.
_json_normalize = getattr(pd, "json_normalize", None)
if _json_normalize is None:
    _json_normalize = pd.io.json.json_normalize

df = _json_normalize(data)
df.head(2)

Unnamed: 0,attestation.attestation-group,attestation.attestation-group.attestation-date,attestation.attestation-group.attestation-date.#text,attestation.attestation-group.attestation-date.@chamber,attestation.attestation-group.attestation-date.@date,attestation.attestation-group.attestation-date.@legis-day,attestation.attestation-group.attestor,attestation.attestation-group.attestor.#text,attestation.attestation-group.attestor.@display,attestation.attestation-group.proxy.#text,...,resolution-body.section.text.external-xref.@parsable-cite,resolution-body.section.text.pagebreak,resolution-body.section.text.quote,resolution-body.section.text.quote.#text,resolution-body.section.text.quote.pagebreak,resolution-body.section.text.quote.quote,resolution-body.title,title-amends.official-title-amendment.#text,title-amends.official-title-amendment.pagebreak,title-amends.official-title-amendment.quote
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Optional export of the flattened frame; left commented out so that running
# the notebook does not write a file. Uncomment to save it.
# df.to_csv("bills.csv")

#### 2. pandas.DataFrame
- Makes it easier to see summary information, such as how many documents there are of each bill-type.

In [6]:
# Build the frame directly from the list of dicts, without flattening:
# one row per file and one column per top-level key, with nested values
# left as dict objects inside the cells.
df2 = pd.DataFrame(data)
df2.head(2)

Unnamed: 0,attestation,bill-type,endorsement,engrossed-amendment-body,engrossed-amendment-form,file-name,form,legis-body,metadata,official-title-amendment,preamble,resolution-body,title-amends
0,,bill,,,,92510.140382952224512.47152.xml,"{'distribution-code': {'@display': 'yes', '#te...",{'@display-enacting-clause': 'yes-display-enac...,"{'@public-private': 'private', '@bill-stage': ...",,,,
1,,resolution,,,,US_Bill_Text_115_HCONRES10_IH.xml,"{'distribution-code': {'@display': 'yes', '#te...",,"{'@public-private': 'public', '@star-print': '...",,{'whereas': [{'text': 'Whereas America’s frate...,"{'@id': 'H87EB0F92716248E78751D13F08B91455', '...",


In [7]:
# Count how many files there are for each bill-type (the XML root tag).
df2['bill-type'].value_counts()

bill             6712
resolution       1312
amendment-doc      15
Name: bill-type, dtype: int64