In [1]:
import xmltodict
import pandas as pd
import os
from bs4 import BeautifulSoup

In [2]:
def _as_list(node):
    """Return an xmltodict child node as a list of dict items.

    xmltodict yields a dict when an element occurs once and a list when it
    repeats. Anything else (None for an empty element, a bare string) is
    treated as "no items" so callers can always iterate.
    """
    if isinstance(node, list):
        return node
    if isinstance(node, dict):
        return [node]
    return []


def parse_summary(path, fileName):
    """Flatten one bill-status XML file into a single dict.

    Parameters
    ----------
    path : str
        Directory containing the file.
    fileName : str
        Name of the XML file inside ``path``.

    Returns
    -------
    tuple
        ``(1, record)`` when the document has a ``billStatus`` root, where
        ``record`` holds ``fileName``, ``billNumber``, ``contributor``,
        ``title``, zero or more ``summaryN`` keys (HTML tags stripped) and
        zero or more ``titleN`` keys (stringified ``{titleType: title}``).
        ``(0, fileName)`` when there is no ``billStatus`` root.

    Raises
    ------
    KeyError
        If a ``billStatus`` document lacks one of the keys accessed below.
    """
    # Read bytes so the XML parser applies the document's own encoding
    # declaration instead of the platform's default text encoding.
    with open(os.path.join(path, fileName), 'rb') as file:
        dict1 = xmltodict.parse(file.read())  # parse original XML to a dictionary

    if 'billStatus' not in dict1:
        return (0, fileName)

    status = dict1['billStatus']
    bill = status['bill']

    dict2 = {}  # flat record for this bill
    dict2['fileName'] = fileName
    dict2['billNumber'] = bill['billNumber']
    dict2['contributor'] = status['dublinCore']['dc:contributor']

    ### summaries (there may be zero, one or many) ###
    summaries = bill['summaries']['billSummaries']
    if summaries:
        for i, item in enumerate(_as_list(summaries['item'])):
            # remove HTML tags from each summary before storing it
            dict2['summary' + str(i)] = BeautifulSoup(item['text'], 'lxml').text

    ### titles (there may be zero, one or many) ###
    dict2['title'] = bill['title']  # original title in 'title' tag
    titles = bill['titles']
    # A lone <item> arrives as a dict, not a list; normalize so a bill with a
    # single title no longer crashes while iterating.
    for i, item in enumerate(_as_list(titles['item'] if titles else None)):
        dict2['title' + str(i)] = str({item['titleType']: item['title']})

    return (1, dict2)

In [3]:
# Parse every file in the summaries directory, separating usable bills from
# files that parse_summary rejects.
path = '../data/summaries/'
all_dicts = []         # one flat record per file for which parse_summary returned flag 1
unused_filenames = []  # names of files for which parse_summary returned flag 0

# os.listdir returns entries in arbitrary order; sorting keeps the resulting
# row order reproducible across runs and machines.
for summ_file in sorted(os.listdir(path)):
    flag, payload = parse_summary(path, summ_file)
    if flag == 1:
        all_dicts.append(payload)
    else:
        unused_filenames.append(payload)

In [4]:
# Build one row per parsed bill. Bills have different numbers of summaryN /
# titleN keys, so columns a given bill lacks show up empty in its row.
df = pd.DataFrame(all_dicts)
# Bare last expression: display the frame as the cell output.
df

Unnamed: 0,billNumber,contributor,fileName,summary0,summary1,summary2,summary3,summary4,title,title0,...,title68,title69,title7,title70,title71,title72,title73,title74,title8,title9
0,24,"Congressional Research Service, Library of Con...",104657.140052642395904.4245.xml,,,,,,Establishing a Joint Committee on Russian Inte...,{'Official Title as Introduced': 'Establishing...,...,,,,,,,,,,
1,165,"Congressional Research Service, Library of Con...",104657.140052642395904.4338.xml,,,,,,Expressing the sense of the House of Represent...,{'Official Title as Introduced': 'Expressing t...,...,,,,,,,,,,
2,12,"Congressional Research Service, Library of Con...",104657.140052642395904.4450.xml,,,,,,A joint resolution disapproving the rule submi...,{'Official Title as Introduced': 'A joint reso...,...,,,,,,,,,,
3,527,"Congressional Research Service, Library of Con...",104657.140052642395904.4455.xml,,,,,,A bill to improve access to emergency medical ...,{'Official Title as Introduced': 'A bill to im...,...,,,,,,,,,,
4,524,"Congressional Research Service, Library of Con...",104657.140052642395904.4458.xml,,,,,,A bill to amend the Internal Revenue Code of 1...,{'Official Title as Introduced': 'A bill to am...,...,,,,,,,,,,
5,521,"Congressional Research Service, Library of Con...",104657.140052642395904.4461.xml,,,,,,A bill to make the National Parks and Federal ...,{'Official Title as Introduced': 'A bill to ma...,...,,,,,,,,,,
6,520,"Congressional Research Service, Library of Con...",104657.140052642395904.4462.xml,,,,,,A bill to amend title XIX of the Social Securi...,{'Official Title as Introduced': 'A bill to am...,...,,,,,,,,,,
7,518,"Congressional Research Service, Library of Con...",104657.140052642395904.4464.xml,,,,,,A bill to amend the Federal Water Pollution Co...,{'Official Title as Introduced': 'A bill to am...,...,,,,,,,,,,
8,517,"Congressional Research Service, Library of Con...",104657.140052642395904.4465.xml,,,,,,A bill to amend the Clean Air Act with respect...,{'Official Title as Introduced': 'A bill to am...,...,,,,,,,,,,
9,516,"Congressional Research Service, Library of Con...",104657.140052642395904.4466.xml,,,,,,A bill to provide grants to assist States in d...,{'Official Title as Introduced': 'A bill to pr...,...,,,,,,,,,,


In [7]:
# Sanity-check counts: directory entries, successfully parsed bills, and bills
# whose first summary column is populated.
print('Number of files we received: ', len(os.listdir(path)))
print('Number of bills with a valid file: ', df.shape[0])
print('Number of bills with at least one summary: ', df.summary0.notnull().sum())

Number of files we received:  8837
Number of bills with a valid file:  8759
Number of bills with at least one summary:  5395
