In [2]:
import xmltodict
import pandas as pd
import os
from bs4 import BeautifulSoup

In [5]:
def _as_list(node):
    """Normalize an xmltodict child node to a list.

    xmltodict represents a single child element as one mapping and repeated
    child elements as a list of mappings; a missing/empty element is None.
    """
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def parse_summary(path, fileName):
    """Flatten one bill-status XML file into a single-level dictionary.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    fileName : str
        Name of the XML file inside ``path``.

    Returns
    -------
    tuple
        ``(1, record)`` when the document has a ``billStatus`` root, where
        ``record`` holds ``fileName``, ``billNumber``, ``contributor``,
        ``title``, one ``summaryN`` key per summary (HTML tags stripped) and
        one ``titleN`` key per entry under ``titles`` (the string form of a
        ``{titleType: title}`` dict).
        ``(0, fileName)`` when the document has no ``billStatus`` root.
    """
    with open(os.path.join(path, fileName)) as handle:
        dict1 = xmltodict.parse(handle.read()) # parse original XML to a dictionary

    if 'billStatus' not in dict1:
        return (0, fileName)

    bill = dict1['billStatus']['bill']

    dict2 = {} # flattened record for this bill
    dict2['fileName'] = fileName
    dict2['billNumber'] = bill['billNumber']
    dict2['contributor'] = dict1['billStatus']['dublinCore']['dc:contributor']

    ### summaries (there may be zero, one or many) ###
    summaries = bill['summaries']['billSummaries']
    if summaries:
        for i, item in enumerate(_as_list(summaries['item'])):
            # remove HTML tags from each summary before storing it
            dict2['summary'+str(i)] = BeautifulSoup(item['text'], 'lxml').text

    ### titles (there may be zero, one or many) ###
    dict2['title'] = bill['title'] # original title in 'title' tag
    titles = bill['titles']
    # A bill with exactly one title yields a mapping instead of a list;
    # iterating that mapping directly would walk its keys, so normalize first.
    title_items = _as_list(titles['item']) if titles else []
    for i, item in enumerate(title_items): # all other titles
        dict3 = {}
        dict3[item['titleType']] = item['title']
        dict2['title'+str(i)] = str(dict3)

    return (1, dict2)

In [8]:
path = '../data/summaries/'
all_dicts = []         # records from files parse_summary flagged with 1
unused_filenames = []  # names of files parse_summary flagged with 0
count = 0
for summ_file in os.listdir(path):
    status, payload = parse_summary(path, summ_file)
    # progress message on the 2nd, 1002nd, 2002nd, ... file
    if count % 1000 == 1:
        print ("Progress {}".format(count))
    destination = all_dicts if status == 1 else unused_filenames
    destination.append(payload)
    count += 1

Progress 1
Progress 1001
Progress 2001
Progress 3001
Progress 4001
Progress 5001
Progress 6001
Progress 7001
Progress 8001


In [9]:
# One row per parsed bill; keys missing from a record become NaN in that row.
df = pd.DataFrame(data=all_dicts)
df.head(5)

Unnamed: 0,billNumber,contributor,fileName,summary0,summary1,summary2,summary3,summary4,title,title0,...,title68,title69,title7,title70,title71,title72,title73,title74,title8,title9
0,5,"Congressional Research Service, Library of Con...",US_Bill_Digest_115_hconres_5.xml,Calls upon President-elect Donald J. Trump to:...,,,,,Clarifying any potential misunderstanding as t...,{u'Official Title as Introduced': u'Clarifying...,...,,,,,,,,,,
1,487,"Congressional Research Service, Library of Con...",US_Bill_Digest_115_hres_487.xml,,,,,,Urging all Universities to designate and maint...,{u'Official Title as Introduced': u'Urging all...,...,,,,,,,,,,
2,51,"Congressional Research Service, Library of Con...",135087.140702719633152.2222306.xml,Funding for Student Scholarships for the 1890s...,,,,,Funding for Student Scholarships for the 1890s...,{u'Short Titles as Introduced': u'Funding for ...,...,,,,,,,,,,
3,194,"Congressional Research Service, Library of Con...",US_Bill_Digest_115_sres_194.xml,,,,,,"A resolution designating June 15, 2017, as ""Wo...",{u'Official Title as Introduced': u'A resoluti...,...,,,,,,,,,,
4,158,"Congressional Research Service, Library of Con...",US_Bill_Digest_115_s_158.xml,,,,,,Let Seniors Work Act of 2017,{u'(Extracted from GPO) Short Titles as Introd...,...,,,,,,,,,,


In [15]:
# summaryN is filled only for bills that have more than N summaries, so each
# count below is cumulative ("at least"), not an exact number of summaries.
print('Number of files we received: {}'.format(len(os.listdir(path))))
print('Number of bills with a valid file: {}'.format(len(df)))
print('Number of bills with at least one summary: {}'.format(df.summary0.notnull().sum()))
print('Number of bills with at least two summaries: {}'.format(df.summary1.notnull().sum()))
print('Number of bills with at least three summaries: {}'.format(df.summary2.notnull().sum()))
print('Number of bills with at least four summaries: {}'.format(df.summary3.notnull().sum()))

Number of files we received: 8837
Number of bills with a valid file: 8759
Number of bills with at least one summary: 5395
Number of bills with exactly two summary: 670
Number of bills with exactly three summary: 160
Number of bills with exactly four summary: 54


In [31]:
# Take the first bill that has a fourth summary and show its first three.
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
sample_bill = df[df.summary3.notnull()].iloc[0]
for position in range(3):
    print("\nSample Summary {}\n".format(position + 1))
    print(sample_bill["summary{}".format(position)])


Sample Summary 1

Rapid DNA Act of 2017 This bill amends the DNA Identification Act of 1994 to require the Federal Bureau of Investigation (FBI) to issue standards and procedures for using Rapid DNA instruments to analyze DNA samples of criminal offenders.  Rapid DNA instruments carry out a fully automated process to create a DNA analysis from a DNA sample. DNA samples prepared by criminal justice agencies using Rapid DNA instruments in compliance the FBI-issued standards and procedures may be included in the Combined DNA Index System (CODIS). The bill amends the DNA Analysis Backlog Elimination Act of 2000 to allow the FBI to waive certain existing requirements if a DNA sample is analyzed using Rapid DNA instruments and the results are included in CODIS.  

Sample Summary 2

(This measure has not been amended since it was introduced. The summary has been expanded because action occurred on the measure.) Rapid DNA Act of 2017 (Sec. 2) This bill amends the DNA Identification Act of 199

In [35]:
# Show the row(s) parsed from one specific file.
df[df['fileName'] == "US_Bill_Digest_115_s_1402.xml"]

Unnamed: 0,billNumber,contributor,fileName,summary0,summary1,summary2,summary3,summary4,title,title0,...,title68,title69,title7,title70,title71,title72,title73,title74,title8,title9
4781,1402,"Congressional Research Service, Library of Con...",US_Bill_Digest_115_s_1402.xml,School Food Modernization Act This bill amends...,,,,,School Food Modernization Act,{u'Short Titles as Introduced': u'School Food ...,...,,,,,,,,,,


In [38]:
# Persist the flattened bill records as a pickle file.
# NOTE(review): this writes to "data/" relative to the working directory, while
# the XML input is read from "../data/summaries/" — confirm the target directory.
df.to_pickle("data/summaries.pkl") 