In [1]:
import xmltodict
import pandas as pd
import os
import re

# Cleaning

In [2]:
def remove_tags(read_file):
    """Strip or replace inline markup in raw bill XML before it is parsed.

    Operates on ``bytes`` and returns ``bytes``. The substitutions run in
    the order listed below: ``external-xref`` and ``pagebreak`` tags are
    dropped, bare ``<quote>``/``</quote>`` tags become a literal double
    quote, and ``term`` tags become a single space. Only the tags themselves
    are touched; text between an opening and a closing tag is kept.
    """
    substitutions = (
        (b'</?external-xref[^<>]*>', b''),
        (b'<quote>', b'"'),
        (b'</quote>', b'"'),
        (b'</?term[^<>]*>', b' '),
        (b'</?pagebreak[^<>]*>', b''),
    )
    cleaned = read_file
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [3]:
def flatten(d):
    """Concatenate all text found in a parsed-XML node into one string.

    Parameters
    ----------
    d : dict, list, str or None
        A node of the nested structure: a mapping of child name to value,
        a list of sibling nodes, a text leaf, or ``None`` for an empty
        element.

    Returns
    -------
    str
        Every non-empty leaf, in document order, each followed by a single
        space. Dictionary entries whose key starts with ``"@"`` are skipped.
        Empty input (``None``, ``''``, ``{}``, ``[]``) yields ``''``.

    Notes
    -----
    Leaves and lists are accepted at the top level as well as nested, so a
    caller can pass any child value without checking its type first.
    """
    if not d:
        return ''
    if isinstance(d, dict):
        # join() instead of repeated "+=" keeps long sections linear-time.
        return ''.join(
            flatten(value) for key, value in d.items() if not key.startswith("@")
        )
    if isinstance(d, (list, tuple)):
        return ''.join(flatten(item) for item in d)
    # Text leaf; non-string leaves are stringified rather than raising.
    return (d if isinstance(d, str) else str(d)) + ' '

In [4]:
def bill_to_dict(filename, doc):
    """Convert one parsed bill document into a record dictionary.

    Parameters
    ----------
    filename : str
        Name of the source file; stored under ``"file-name"``.
    doc : dict
        Parsed document whose first key is the root element name and whose
        value is a mapping of that element's attributes and children.

    Returns
    -------
    dict
        Always contains ``"file-name"``, ``"bill-type"`` (the root element
        name) and ``"metadata"`` (every ``@``-prefixed entry of the root).
        A ``legis-body``, ``resolution-body`` or ``engrossed-amendment-body``
        child is stored under ``"body"`` as a mapping of child name to
        flattened text; ``engrossed-amendment-form`` is stored under
        ``"form"``; every other child is copied through under its own name.
    """
    body_keys = ('legis-body', 'resolution-body', 'engrossed-amendment-body')

    def _text_of(value):
        # Flatten one body child to text. Dicts are delegated to flatten();
        # lists, text leaves and empty elements are handled here so they do
        # not reach flatten() and fail on a missing ``.items()``.
        if isinstance(value, dict):
            return flatten(value)
        if isinstance(value, list):
            return ''.join(_text_of(item) for item in value)
        if not value:
            return ''
        return (value if isinstance(value, str) else str(value)) + ' '

    def _clean_body(bodydict):
        # One flattened string per non-attribute child of the body element.
        return {
            k: _text_of(v) for k, v in bodydict.items() if not k.startswith("@")
        }

    bill_type = list(doc)[0]
    data_dict = {}

    data_dict["file-name"] = filename
    data_dict["bill-type"] = bill_type

    metadata = {}
    for n, value in doc[bill_type].items():
        # group meta data
        if n.startswith("@"):
            metadata[n] = value
        # unify name for different types
        elif n in body_keys:
            if isinstance(value, dict):
                data_dict['body'] = _clean_body(value)
            elif isinstance(value, list):
                # just take last one
                last = value[-1]
                data_dict['body'] = _clean_body(last) if isinstance(last, dict) else last
            else:
                # Empty or text-only body: keep the raw value and say so.
                print('NOT dict nor list')
                data_dict['body'] = value
        elif n == 'engrossed-amendment-form':
            data_dict['form'] = value
        else:
            data_dict[n] = value
    data_dict["metadata"] = metadata
    return data_dict

In [5]:
# I put all the files in one folder for simplicity
indir = 'bill_text_115' 

data = [] # a list of dict, each element is a xml file

for root, dirs, filenames in os.walk(indir):
    # Sort in place so sub-directories and files are visited in a stable
    # order and the resulting rows do not depend on filesystem ordering.
    dirs.sort()
    for filename in sorted(filenames):
        # Join with `root` (the directory currently being walked), not
        # `indir`; otherwise files in sub-directories get a wrong path.
        with open(os.path.join(root, filename), 'rb') as f:
            doc = xmltodict.parse(remove_tags(f.read()))
            data.append(bill_to_dict(filename, doc))

In [None]:
# [list(x) for x in set(tuple(x) for x in data)]

# Select Some Columns

In [6]:
# Keep only the fields that go into the DataFrame; every record is expected
# to carry all of them.
select_keys = ['metadata', 'bill-type', 'body', 'file-name']
select_data = [
    {key: record[key] for key in select_keys}
    for record in data
]

In [7]:
# Expand the nested 'metadata' and 'body' dicts into dotted columns
# (e.g. 'body.section'). `pd.json_normalize` is the public entry point;
# the `pd.io.json.json_normalize` path is deprecated.
df = pd.json_normalize(select_data)

In [8]:
# Preview only the first rows; the full frame holds the text of every bill.
df.head(10)

Unnamed: 0,bill-type,body.amendment,body.division,body.section,body.title,file-name,metadata.@amend-degree,metadata.@amend-stage,metadata.@amend-type,metadata.@bill-stage,metadata.@bill-type,metadata.@dms-id,metadata.@key,metadata.@public-print,metadata.@public-private,metadata.@resolution-stage,metadata.@resolution-type,metadata.@stage-count,metadata.@star-print
0,bill,,,1. Permanent resident status for Joseph\n ...,,92510.140382952224512.47152.xml,,,,Introduced-in-Senate,,A1,,,private,,,,
1,resolution,,,That it is the sense of Congress that— (1) the...,,US_Bill_Text_115_HCONRES10_IH.xml,,,,,,H6105DDE2819B44DBB5E467A21768B92E,H,,public,Introduced-in-House,house-concurrent,,no-star-print
2,resolution,,,That it is the sense of Congress that— (1) the...,,US_Bill_Text_115_HCONRES11_IH.xml,,,,,,H4AC6CF641CD640E393CFA510D0960D8C,H,,public,Introduced-in-House,house-concurrent,,no-star-print
3,resolution,,,That Congress— (1) supports the designation of...,,US_Bill_Text_115_HCONRES12_IH.xml,,,,,,H4E5CA555E82D40E08E6EDAC6DB9540D3,H,,public,Introduced-in-House,house-concurrent,,no-star-print
4,resolution,,,That Congress should not impose any new perfor...,,US_Bill_Text_115_HCONRES13_IH.xml,,,,,,HE39F52BA0A584A39A31BF533005200CC,H,,public,Introduced-in-House,house-concurrent,,no-star-print
5,resolution,,,1. Short title This resolution may be cited as...,,US_Bill_Text_115_HCONRES14_IH.xml,,,,,,H40B69ED168D842FCA4473EBF04679DB6,H,,public,Introduced-in-House,house-concurrent,,no-star-print
6,resolution,,,That— (1) Donald J. Trump won the 2016 preside...,,US_Bill_Text_115_HCONRES15_IH.xml,,,,,,HEC124C4E28034F0C8032E3BCCB955FBD,H,,public,Introduced-in-House,house-concurrent,,no-star-print
7,resolution,,,That it is the sense of Congress that— (1) a c...,,US_Bill_Text_115_HCONRES16_IH.xml,,,,,,HCC74C220526D4AFDAF81C0E2148FD2CB,H,,public,Introduced-in-House,house-concurrent,,no-star-print
8,resolution,,,That— 1. Short title This resolution may be ci...,,US_Bill_Text_115_HCONRES17_IH.xml,,,,,,H2AC8F7FC62CD410EA708DFCD97016B45,H,,public,Introduced-in-House,house-concurrent,,no-star-print
9,resolution,,,1. Use of rotunda for holocaust days of rememb...,,US_Bill_Text_115_HCONRES18_EH.xml,,,,,,H926C8CB44FFE4D25B18DD87483D485A5,H,,public,Engrossed-in-House,house-concurrent,1,no-star-print
