In [1]:
import xmltodict
import pandas as pd
import os
import re

In [2]:
# Directories (relative to the notebook's working directory) that are walked
# for bill files and summary files respectively.
billDir = 'data/bills' 
summariesDir = 'data/summaries'

### First Way to Merge: match files by name, then look up each pair

In [3]:
def getBillFileName(summaryFileName, extension):
    """
    Build the candidate bill file names for a summary file name.

    Example: with "IH.xml" in ``extension``,
    "US_Bill_Digest_115_hr_1607.xml" -> "US_Bill_Text_115_HR1607_IH.xml".

    Parameters
    ----------
    summaryFileName : str
        Name of a summary file. Only names starting with
        "US_Bill_Digest_115_" produce candidates.
    extension : iterable of str
        Trailing pieces of bill file names (e.g. "IH.xml", "RS.xml"),
        WITHOUT the leading underscore -- it is added here.

    Returns
    -------
    list of str
        One candidate per entry of ``extension`` (in iteration order), or an
        empty list when the summary name does not carry the expected prefix.
    """
    if not summaryFileName.startswith("US_Bill_Digest_115_"):
        return []

    # The last two underscore-separated pieces are the bill type and the
    # bill number followed by a four-character file extension such as ".xml".
    billType, billNumber = summaryFileName.split("_")[-2:]
    stem = "US_Bill_Text_115_" + billType.upper() + billNumber[:-4]
    return [stem + "_" + suffix for suffix in extension]

In [3]:
# def print_head(d,top=10):
#     count = 0
#     for k in d:
#         if count==top:
#             break
#         print ("{} -> {}".format(k,d[k]))
#         count+=1
#     return

In [4]:
def getBillsToSummaries(billDir, summariesDir, expectedBillCount=8039):
    """
    Map each bill file to the summary file that describes it.

    Both directories are walked recursively. Only base file names are kept,
    so files with the same name in different sub-directories collapse into
    one entry.

    Parameters
    ----------
    billDir : str
        Directory containing the bill files.
    summariesDir : str
        Directory containing the summary files.
    expectedBillCount : int or None, optional
        Sanity check on the number of distinct bill file names found.
        Defaults to 8039 (the value this notebook was written against);
        pass ``None`` to skip the check when running on another corpus.

    Returns
    -------
    billsToSummary : dict
        Bill file name -> summary file name. If several summaries yield the
        same bill candidate, the one visited last wins.
    billFiles : set
        Every bill file name found under ``billDir``.
    summariesNoMatch : set
        Summary file names for which no candidate bill file exists.

    Raises
    ------
    AssertionError
        If ``expectedBillCount`` is given and does not match the number of
        bill files found.
    """
    billFiles = set()
    extension = set()
    for _root, _dirs, filenames in os.walk(billDir):
        for filename in filenames:
            billFiles.add(filename)
            # Last underscore-separated piece, e.g. "IH.xml"; used to
            # enumerate the candidate bill names for each summary.
            extension.add(filename.split("_")[-1])
    if expectedBillCount is not None:
        assert len(billFiles) == expectedBillCount, (
            "expected {} bill files under {!r}, found {}".format(
                expectedBillCount, billDir, len(billFiles)))

    billsToSummary = {}
    summariesNoMatch = set()
    for _root, _dirs, filenames in os.walk(summariesDir):
        for filename in filenames:
            candidates = getBillFileName(filename, extension)
            matchedBills = [billF for billF in candidates if billF in billFiles]
            for billF in matchedBills:
                billsToSummary[billF] = filename
            if not matchedBills:
                summariesNoMatch.add(filename)
    return billsToSummary, billFiles, summariesNoMatch

In [5]:
# Pair the bill files on disk with their summary files via the naming convention.
billsToSummary, billFiles, summariesNoMatch = getBillsToSummaries(billDir, summariesDir)

In [6]:
# Report the size of each collection produced by the matching step.
for template, items in (
    ("Total Number of Bills {}", billFiles),
    ("Bills with summary {}", billsToSummary),
    ("Summaries with no bill {}", summariesNoMatch),
):
    print(template.format(len(items)))

Total Number of Bills 8039
Bills with summary 7954
Summaries with no bill 2298


In [7]:
# Load the bill and summary tables from their pickle files.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
bills = pd.read_pickle('data/bills.pkl')
summaries = pd.read_pickle('data/summaries.pkl')

In [8]:
# Column names of the matched table (used when the DataFrame is built).
index = ["fileName-Bill", "Bill-Body", "fileName-Summary","summary0", "summary1","summary2","summary3", "summary4"]
summary_columns = index[3:]

# Build one lookup per table up front instead of re-scanning the whole
# DataFrame with a boolean mask several times per bill. Keeping the first
# row per file name matches what `.values[0]` on the filtered frame selects.
bill_body_by_file = bills.drop_duplicates(subset='file-name').set_index('file-name')['body.section']
summaries_by_file = summaries.drop_duplicates(subset='fileName').set_index('fileName')[summary_columns]

data = []
for bill_file, summary_file in billsToSummary.items():
    # A KeyError here means the file name is missing from the pickled table.
    bill_body = bill_body_by_file.loc[bill_file]
    summary_bodies = tuple(summaries_by_file.loc[summary_file])
    data.append((bill_file, bill_body, summary_file) + summary_bodies)

In [9]:
# Assemble the collected tuples into a DataFrame; `index` holds the column names.
matched = pd.DataFrame(data, columns=index)
matched.shape

(7954, 8)

In [10]:
# Preview the first rows of the matched table.
matched.head()

Unnamed: 0,fileName-Bill,Bill-Body,fileName-Summary,summary0,summary1,summary2,summary3,summary4
0,US_Bill_Text_115_SRES212_IS.xml,"That the Senate— (1) supports the rights, free...",US_Bill_Digest_115_sres_212.xml,,,,,
1,US_Bill_Text_115_HR1812_IH.xml,1. Short title Congressional Leadership In Mit...,US_Bill_Digest_115_hr_1812.xml,Congressional Leadership In Mitigating Adminis...,,,,
2,US_Bill_Text_115_HR1317_IH.xml,1. Short title Servicemember Retirement Improv...,US_Bill_Digest_115_hr_1317.xml,Servicemember Retirement Improvement Act This...,,,,
3,US_Bill_Text_115_S1129_RS.xml,1. Short title; table of contents (a) Short ti...,US_Bill_Digest_115_s_1129.xml,,,,,
4,US_Bill_Text_115_HR908_IH.xml,1. Short title Medicare Advantage Quality Paym...,US_Bill_Digest_115_hr_908.xml,Medicare Advantage Quality Payment Relief Act ...,,,,


In [11]:
# Save the matched bill/summary table to disk.
matched.to_pickle('data/matched.pkl')

### Another Way to Merge: left-join bills to summaries with `DataFrame.merge`

In [14]:
# Keep only the columns needed from each table. The bill selection is copied
# so that adding a column below does not write into a view of `bills`.
bills_sel = bills[['file-name', 'bill-type', 'legis-type', 'body.section', 'section', 'sentence', 'word']].copy()
summaries_sel = summaries[['fileName', 'sentence0', 'sentence1','sentence2', 'sentence3', 'sentence4', 
                              'summary0', 'summary1','summary2', 'summary3', 'summary4',
                              'word0', 'word1', 'word2', 'word3', 'word4']]

# Summary file name for every bill row; None when the bill has no summary.
# dict.get replaces a bare `except:` that would have hidden unrelated errors.
temp_files = [billsToSummary.get(file_name) for file_name in bills_sel['file-name']]
bills_sel["fileName-Summary"] = pd.Series(temp_files, index = bills_sel.index)

# Left-join on the summary file name, then keep only rows found in both
# tables. NOTE(review): pandas matches null join keys against each other, so
# bills without a summary would pair with any summary row whose 'fileName'
# is null -- confirm `summaries` has no such rows.
matched2 = bills_sel.merge(summaries_sel, how = 'left', left_on = ['fileName-Summary'], 
                           right_on = ['fileName'], indicator = True)
matched2 = matched2.loc[matched2['_merge'] == 'both'].drop(['_merge'], axis = 1)

In [15]:
# Save the merge-based table to disk.
matched2.to_pickle('data/matched2.pkl')