In [4]:
import xmltodict
import pandas as pd
import os
from bs4 import BeautifulSoup
from utils import *

In [2]:
def _as_item_list(node):
    """Normalize an xmltodict ``item`` node to a list.

    xmltodict yields a dict when an element has a single child and a list
    when the child is repeated. Anything else (e.g. ``None`` from an empty
    element) is treated as "no items".
    """
    if isinstance(node, list):
        return node
    if isinstance(node, dict):
        return [node]
    return []


def parse_summary(path, fileName):
    """Parse one bill-status XML file into a flat dictionary (one row).

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated directly with ``fileName``,
        so it must end with a path separator.
    fileName : str
        Name of the XML file inside ``path``.

    Returns
    -------
    tuple
        ``(1, row)`` when the document has a ``billStatus`` root, where
        ``row`` holds ``fileName``, ``billNumber``, ``contributor``,
        ``title`` and, per summary ``i``, ``summary<i>`` / ``sentence<i>`` /
        ``word<i>``, and per title ``i``, ``title<i>`` (the string form of
        ``{titleType: title}``).
        ``(0, fileName)`` when there is no ``billStatus`` root.

    Notes
    -----
    Relies on ``xmltodict``, ``BeautifulSoup`` and ``length_bill`` being in
    the notebook namespace (``length_bill`` presumably comes from
    ``utils`` -- it must return a ``(sentence, word)`` pair).
    """
    with open(path+fileName, 'rb') as file:
        dict1 = xmltodict.parse(file.read())  # parse original XML to a dictionary

    # Files without a billStatus root are reported back to the caller by name.
    if 'billStatus' not in dict1:
        return (0, fileName)

    bill_status = dict1['billStatus']
    bill = bill_status['bill']

    dict2 = {}  # flat record for this bill
    dict2['fileName'] = fileName
    dict2['billNumber'] = bill['billNumber']
    dict2['contributor'] = bill_status['dublinCore']['dc:contributor']

    ### summaries (there may be zero, one or many) ###
    # An empty <summaries/> element parses to None, hence the `or {}`.
    summaries = (bill.get('summaries') or {}).get('billSummaries')
    if summaries:
        for i, item in enumerate(_as_item_list(summaries.get('item'))):
            suffix = str(i)
            # remove HTML tags from the summary text
            dict2['summary'+suffix] = BeautifulSoup(item['text'], 'lxml').text
            # add length fields (sentence and word) as columns
            dict2['sentence'+suffix], dict2['word'+suffix] = length_bill(dict2['summary'+suffix])

    ### titles (there may be zero, one or many) ###
    dict2['title'] = bill['title']  # original title in 'title' tag
    # A bill with a single title yields a dict rather than a list; iterating
    # that dict directly would walk its keys, so normalize it first.
    titles = (bill.get('titles') or {}).get('item')
    for i, item in enumerate(_as_item_list(titles)):  # all other titles
        dict2['title'+str(i)] = str({item['titleType']: item['title']})

    return (1, dict2)

In [5]:
# Parse every file in the summaries directory, separating usable bill
# records from files that have no billStatus root.
path = './data/summaries/'
all_dicts = []          # one flat record per successfully parsed bill
unused_filenames = []   # names of files parse_summary rejected

# os.listdir returns entries in arbitrary order; sort so the row order of the
# resulting DataFrame is reproducible across machines and re-runs.
summary_files = sorted(os.listdir(path))
for count, summ_file in enumerate(summary_files):
    parsed = parse_summary(path, summ_file)
    if count % 1000 == 0:
        print(count)  # coarse progress indicator
    if parsed[0] == 1:
        all_dicts.append(parsed[1])
    else:
        unused_filenames.append(parsed[1])
count = len(summary_files)  # total number of files processed

0
1000
2000
3000
4000
5000
6000
7000
8000


In [6]:
# Assemble one row per parsed bill; keys missing from a record become NaN.
df = pd.DataFrame(data=all_dicts)
# Persist the frame so later steps can reload it without re-parsing the XML.
df.to_pickle(path="data/summaries.pkl")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,billNumber,contributor,fileName,sentence0,sentence1,sentence2,sentence3,sentence4,summary0,summary1,...,title72,title73,title74,title8,title9,word0,word1,word2,word3,word4
0,24,"Congressional Research Service, Library of Con...",104657.140052642395904.4245.xml,,,,,,,,...,,,,,,,,,,
1,165,"Congressional Research Service, Library of Con...",104657.140052642395904.4338.xml,,,,,,,,...,,,,,,,,,,
2,12,"Congressional Research Service, Library of Con...",104657.140052642395904.4450.xml,,,,,,,,...,,,,,,,,,,
3,527,"Congressional Research Service, Library of Con...",104657.140052642395904.4455.xml,,,,,,,,...,,,,,,,,,,
4,524,"Congressional Research Service, Library of Con...",104657.140052642395904.4458.xml,,,,,,,,...,,,,,,,,,,
