In [1]:
import xmltodict
import pandas as pd
import os
import re

In [2]:
# Input directories (relative paths).
# billDir is walked for bill files and summariesDir for summary files
# when both are passed to getBillsToSummaries.
billDir = 'data/bills' 
summariesDir = 'data/summaries'

### First Way to Merge: match bills to summaries by file name

In [3]:
def getBillFileName(summaryFileName, extension):
    """
    Build the candidate bill file names for one summary file name.

    Example: US_Bill_Digest_115_hr_1607.xml -> US_Bill_Text_115_HR1607_IH.xml
    when "IH.xml" is one of the entries of `extension`. (An earlier note in
    this cell listed "_RH.xml", "_IH.xml", "_EH.xml", "_RFS.xml" and "_IS.xml"
    as possible endings.)

    Returns a list with one candidate per entry of `extension`, in iteration
    order, or an empty list when `summaryFileName` does not start with
    "US_Bill_Digest_115_".
    """
    if not summaryFileName.startswith("US_Bill_Digest_115_"):
        return []
    # The last two "_"-separated pieces are the bill type and "<number>.xml".
    billType, numberWithExt = summaryFileName.split("_")[-2:]
    # [:-4] drops the final four characters (".xml" in the example above).
    prefix = "US_Bill_Text_115_" + billType.upper() + numberWithExt[:-4] + "_"
    return [prefix + ext for ext in extension]

In [3]:
# def print_head(d,top=10):
#     count = 0
#     for k in d:
#         if count==top:
#             break
#         print ("{} -> {}".format(k,d[k]))
#         count+=1
#     return

In [4]:
def getBillsToSummaries(billDir, summariesDir, expectedBillCount=8039):
    """
    Map bill files to summary files using the file-name convention.

    Walks `billDir` to collect every bill file name together with the set of
    name endings (the text after the last "_"), then walks `summariesDir` and
    asks getBillFileName for the candidate bill names of each summary file.
    Only file names are compared, so files with the same name in different
    sub-directories are treated as one.

    Parameters
    ----------
    billDir : str
        Directory tree containing the bill files.
    summariesDir : str
        Directory tree containing the summary files.
    expectedBillCount : int or None, optional
        Sanity check on the number of distinct bill file names. Defaults to
        8039, the value that used to be hard-coded; pass None to skip the
        check when running on a different data set.

    Returns
    -------
    billsToSummary : dict
        Bill file name -> summary file name, for every candidate that exists.
    billFiles : set
        All bill file names found under `billDir`.
    summariesNoMatch : set
        Summary file names for which no candidate bill file exists.

    Raises
    ------
    AssertionError
        If `expectedBillCount` is not None and the number of bill files differs.
    """
    billFiles = set()
    extension = set()
    for _root, _dirs, filenames in os.walk(billDir):
        for filename in filenames:
            billFiles.add(filename)
            extension.add(filename.split("_")[-1])
    if expectedBillCount is not None:
        assert len(billFiles) == expectedBillCount, (
            "expected {} bill files under {!r}, found {}".format(
                expectedBillCount, billDir, len(billFiles)))

    # Sort once so the candidate order (and therefore the insertion order of
    # billsToSummary) does not depend on set iteration order.
    extension = sorted(extension)

    billsToSummary = {}
    summariesNoMatch = set()
    for _root, _dirs, filenames in os.walk(summariesDir):
        for filename in filenames:
            existingBills = [billF for billF in getBillFileName(filename, extension)
                             if billF in billFiles]
            if not existingBills:
                summariesNoMatch.add(filename)
            for billF in existingBills:
                billsToSummary[billF] = filename
    return billsToSummary, billFiles, summariesNoMatch

In [5]:
# Build the bill -> summary file-name mapping, the set of all bill file names,
# and the set of summary files for which no bill file was found.
billsToSummary, billFiles, summariesNoMatch = getBillsToSummaries(billDir, summariesDir)

In [6]:
# Report the size of each collection returned by getBillsToSummaries.
print("Total Number of Bills {}".format(len(billFiles)))
print("Bills with summary {}".format(len(billsToSummary)))
print("Summaries with no bill {}".format(len(summariesNoMatch)))

Total Number of Bills 8039
Bills with summary 7954
Summaries with no bill 2298


In [59]:
# Load the bill and summary tables used by every merge below.
# NOTE(review): unpickling can execute arbitrary code; presumably these files
# were produced by this project's own preprocessing - confirm before pointing
# this at pickles from another source.
bills = pd.read_pickle('data/bills.pkl')
summaries = pd.read_pickle('data/summaries.pkl')

In [8]:
# Column names of the merged table; the last five are copied from `summaries`.
index = ["fileName-Bill", "Bill-Body", "fileName-Summary","summary0", "summary1","summary2","summary3", "summary4"]
summary_cols = index[3:]

# Build the per-file lookups once instead of re-filtering both frames for
# every bill. drop_duplicates keeps the first row per file name, which is the
# row the previous `.values[0]` lookups selected.
bill_body_by_file = bills.drop_duplicates(subset='file-name').set_index('file-name')['body.section']
summary_by_file = summaries.drop_duplicates(subset='fileName').set_index('fileName')[summary_cols]

data = []
for bill_file, summary_file in billsToSummary.items():
    # A file name missing from either frame raises KeyError here.
    summary_row = summary_by_file.loc[summary_file]
    data.append((bill_file, bill_body_by_file.loc[bill_file], summary_file)
                + tuple(summary_row[col] for col in summary_cols))

In [9]:
# One row per tuple collected in `data`, labelled with the names in `index`;
# the trailing expression displays (rows, columns).
matched = pd.DataFrame(data, columns=index)
matched.shape

(7954, 8)

In [10]:
# Preview the first rows of the merged table.
matched.head()

Unnamed: 0,fileName-Bill,Bill-Body,fileName-Summary,summary0,summary1,summary2,summary3,summary4
0,US_Bill_Text_115_SRES212_IS.xml,"That the Senate— (1) supports the rights, free...",US_Bill_Digest_115_sres_212.xml,,,,,
1,US_Bill_Text_115_HR1812_IH.xml,1. Short title Congressional Leadership In Mit...,US_Bill_Digest_115_hr_1812.xml,Congressional Leadership In Mitigating Adminis...,,,,
2,US_Bill_Text_115_HR1317_IH.xml,1. Short title Servicemember Retirement Improv...,US_Bill_Digest_115_hr_1317.xml,Servicemember Retirement Improvement Act This...,,,,
3,US_Bill_Text_115_S1129_RS.xml,1. Short title; table of contents (a) Short ti...,US_Bill_Digest_115_s_1129.xml,,,,,
4,US_Bill_Text_115_HR908_IH.xml,1. Short title Medicare Advantage Quality Paym...,US_Bill_Digest_115_hr_908.xml,Medicare Advantage Quality Payment Relief Act ...,,,,


In [11]:
# Persist the result of the first merge.
matched.to_pickle('data/matched.pkl')

### Second Way to Merge: join bills to summaries on the mapped summary file name

In [14]:
# Columns carried over from each table into the merged result.
bills_sel = bills[['file-name', 'bill-type', 'legis-type', 'body.section', 'section', 'sentence', 'word']]
summaries_sel = summaries[['fileName', 'sentence0', 'sentence1','sentence2', 'sentence3', 'sentence4', 
                              'summary0', 'summary1','summary2', 'summary3', 'summary4',
                              'word0', 'word1', 'word2', 'word3', 'word4']]

# Summary file name for each bill, or None when the bill has no mapped summary.
# dict.get replaces a bare `except:` that would also have hidden unrelated errors.
temp_files = [billsToSummary.get(bill_file) for bill_file in bills_sel['file-name']]

# Copy before adding a column so the assignment does not target a slice of `bills`.
bills_sel = bills_sel.copy()
bills_sel["fileName-Summary"] = pd.Series(temp_files, index = bills_sel.index)
# Left merge with an indicator, then keep only the rows found on both sides.
matched2 = bills_sel.merge(summaries_sel, how = 'left', left_on = ['fileName-Summary'], 
                           right_on = ['fileName'], indicator = True)
matched2 = (matched2[matched2._merge == 'both']).drop(['_merge'], axis = 1)

In [15]:
# Persist the result of the second merge.
matched2.to_pickle('data/matched2.pkl')

### Third Way to Merge: match on bill type and bill number

In [60]:
def get_bill_info(row):
    """
    Build a "<TYPE> <NUMBER>" key from the row's 'legis-num' value.

    The value is split on ". "; the last piece is the number and the pieces
    before it are concatenated to form the type, e.g.
    "S. 556" -> "S 556" and "H. CON. RES. 10" -> "HCONRES 10".

    Parameters
    ----------
    row : mapping-like
        Anything supporting row['legis-num'], such as a DataFrame row.

    Returns
    -------
    str or None
        The key, or None when 'legis-num' is missing (None, NaN, empty or
        blank) or has no type part in front of the number.
    """
    legis_num = row['legis-num']
    # NaN is truthy but has no split(); treat any non-string-like value as missing.
    if not hasattr(legis_num, 'split') or not legis_num.strip():
        return None
    parts = legis_num.strip().split('. ')
    billType = ''.join(parts[:-1])
    billNum = parts[-1]
    # Without a type part the number would be duplicated into a meaningless key.
    if not billType or not billNum:
        return None
    return ' '.join([billType, billNum])

In [61]:
# Add the key computed by get_bill_info as a 'billInfo' column (row-wise apply;
# this adds the column to `bills` in place).
bills['billInfo'] = bills.apply(get_bill_info, axis=1)

In [62]:
# Preview the first rows only instead of rendering the whole frame.
bills.head(10)

Unnamed: 0,bill-type,body.amendment,body.division,body.section,body.title,dc:title,file-name,legis-num,legis-type,legis-type.#text,...,metadata.@resolution-stage,metadata.@resolution-type,metadata.@stage-count,metadata.@star-print,official-title,official-title.@display,section,sentence,word,billInfo
0,bill,,,1. Permanent resident status for Joseph\n\t\t\...,,115 S556 IS: For the relief of Joseph Gabra an...,92510.140382952224512.47152.xml,S. 556,A BILL,,...,,,,,For the relief of Joseph Gabra and Sharon Kamel.,,4,11,390,S 556
1,resolution,,,That it is the sense of Congress that— (1) the...,,115 HCON 10 IH: Expressing the sense of the Co...,US_Bill_Text_115_HCONRES10_IH.xml,H. CON. RES. 10,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Expressing the sense of the Congress that tax-...,,5,1,126,HCONRES 10
2,resolution,,,That it is the sense of Congress that— (1) the...,,115 HCON 11 IH: Expressing the sense of Congre...,US_Bill_Text_115_HCONRES11_IH.xml,H. CON. RES. 11,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Expressing the sense of Congress that Jerusale...,,3,1,38,HCONRES 11
3,resolution,,,That Congress— (1) supports the designation of...,,115 HCON 12 IH: Supporting the designation of ...,US_Bill_Text_115_HCONRES12_IH.xml,H. CON. RES. 12,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Supporting the designation of the week of Sept...,,4,1,80,HCONRES 12
4,resolution,,,That Congress should not impose any new perfor...,,115 HCON 13 IH: Supporting the Local Radio Fre...,US_Bill_Text_115_HCONRES13_IH.xml,H. CON. RES. 13,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Supporting the Local Radio Freedom Act.,,1,1,43,HCONRES 13
5,resolution,,,1. Short title This resolution may be cited as...,,115 HCON 14 IH: Establishing the Joint Ad Hoc ...,US_Bill_Text_115_HCONRES14_IH.xml,H. CON. RES. 14,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Establishing the Joint Ad Hoc Committee on Tra...,,20,39,1511,HCONRES 14
6,resolution,,,That— (1) Donald J. Trump won the 2016 preside...,,115 HCON 15 IH: Asserting that Congress should...,US_Bill_Text_115_HCONRES15_IH.xml,H. CON. RES. 15,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Asserting that Congress should expend the reso...,,3,1,228,HCONRES 15
7,resolution,,,That it is the sense of Congress that— (1) a c...,,115 HCON 16 IH: Expressing the sense of Congre...,US_Bill_Text_115_HCONRES16_IH.xml,H. CON. RES. 16,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Expressing the sense of Congress that a commem...,,3,1,41,HCONRES 16
8,resolution,,,That— 1. Short title This resolution may be ci...,,115 HCON 17 IH: Honoring and praising the Nati...,US_Bill_Text_115_HCONRES17_IH.xml,H. CON. RES. 17,CONCURRENT RESOLUTION,,...,Introduced-in-House,house-concurrent,,no-star-print,Honoring and praising the National Association...,,3,4,84,HCONRES 17
9,resolution,,,1. Use of rotunda for holocaust days of rememb...,,115 HCON 18 EH: Permitting the use of the rotu...,US_Bill_Text_115_HCONRES18_EH.xml,H. CON. RES. 18,CONCURRENT RESOLUTION,,...,Engrossed-in-House,house-concurrent,1,no-star-print,Permitting the use of the rotunda of the Capit...,,1,3,64,HCONRES 18


In [32]:
# Preview the first rows only instead of rendering the whole frame.
summaries.head(10)

Unnamed: 0,billNumber,billType,contributor,fileName,sentence0,sentence1,sentence2,sentence3,sentence4,summary0,...,title72,title73,title74,title8,title9,word0,word1,word2,word3,word4
0,24,HCONRES,"Congressional Research Service, Library of Con...",104657.140052642395904.4245.xml,,,,,,,...,,,,,,,,,,
1,165,HRES,"Congressional Research Service, Library of Con...",104657.140052642395904.4338.xml,,,,,,,...,,,,,,,,,,
2,12,SJRES,"Congressional Research Service, Library of Con...",104657.140052642395904.4450.xml,,,,,,,...,,,,,,,,,,
3,527,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4455.xml,,,,,,,...,,,,,,,,,,
4,524,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4458.xml,,,,,,,...,,,,,,,,,,
5,521,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4461.xml,,,,,,,...,,,,,,,,,,
6,520,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4462.xml,,,,,,,...,,,,,,,,,,
7,518,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4464.xml,,,,,,,...,,,,,,,,,,
8,517,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4465.xml,,,,,,,...,,,,,,,,,,
9,516,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4466.xml,,,,,,,...,,,,,,,,,,


In [63]:
# Keep only the summary rows whose 'summary0' value is present.
with_summary = summaries[summaries['summary0'].notnull()]

In [39]:
# Preview the first rows only instead of rendering the whole frame.
with_summary.head(10)

Unnamed: 0,billNumber,billType,contributor,fileName,sentence0,sentence1,sentence2,sentence3,sentence4,summary0,...,title72,title73,title74,title8,title9,word0,word1,word2,word3,word4
25,367,S,"Congressional Research Service, Library of Con...",104657.140052642395904.4912.xml,1.0,,,,,Probation Officer Protection Act of 2017 This ...,...,,,,,,55.0,,,,
26,363,S,"Congressional Research Service, Library of Con...",104657.140052642395904.5351.xml,1.0,,,,,North Country National Scenic Trail Route Adju...,...,,,,,,45.0,,,,
27,83,HRES,"Congressional Research Service, Library of Con...",104657.140052642395904.5736.xml,1.0,,,,,Provides amounts for expenses of the House Com...,...,,,,,,14.0,,,,
28,59,SRES,"Congressional Research Service, Library of Con...",104657.140052642395904.9638.xml,2.0,,,,,Supports the designation of Darwin Day. Recog...,...,,,,,,28.0,,,,
30,387,S,"Congressional Research Service, Library of Con...",104657.140052642395904.9654.xml,2.0,,,,,Consumer Financial Protection Bureau Accountab...,...,,,,,,61.0,,,,
31,386,S,"Congressional Research Service, Library of Con...",104657.140052642395904.9655.xml,5.0,,,,,Judgment Fund Transparency and Terrorism Finan...,...,,,,,,203.0,,,,
32,353,S,"Congressional Research Service, Library of Con...",104657.140052642395904.9662.xml,3.0,,,,,Preserve Access to Medicare Rural Home Health ...,...,,,,,,124.0,,,,
33,306,S,"Congressional Research Service, Library of Con...",104657.140052642395904.9667.xml,3.0,,,,,Biennial Budgeting and Appropriations Act Thi...,...,,,,,,93.0,,,,
34,299,S,"Congressional Research Service, Library of Con...",104657.140052642395904.9668.xml,10.0,,,,,Agency Accountability Act of 2017 This bill re...,...,,,,,,246.0,,,,
35,200,S,"Congressional Research Service, Library of Con...",104657.140052642395904.9676.xml,2.0,,,,,Restricting First Use of Nuclear Weapons Act o...,...,,,,,,76.0,,,,


In [34]:
# Column names copied from each table into the third merge result.
# NOTE: this rebinds bills_sel / summaries_sel, which the second merge used
# for DataFrames; from here on they are plain lists of column names.
bills_sel = ['file-name', 'bill-type', 'legis-type','billInfo', 'body.section', 'section', 'sentence', 'word']
summaries_sel = ['fileName', 'sentence0', 'sentence1','sentence2', 'sentence3', 'sentence4', 
                              'summary0', 'summary1','summary2', 'summary3', 'summary4',
                              'word0', 'word1', 'word2', 'word3', 'word4']

In [64]:
# First bill row for every distinct 'billInfo' key, built once. The previous
# per-summary filter (bills[bills['billInfo'] == key]) also used only the first
# matching bill, so this keeps the same choice without rescanning `bills`
# for each summary.
first_bill_by_info = (
    bills[bills_sel]
    .drop_duplicates(subset='billInfo')
    .set_index('billInfo', drop=False)   # keep the column: it is part of bills_sel
)

matched3 = []
for _, summary_row in with_summary.iterrows():
    # Same "<TYPE> <NUMBER>" form as the 'billInfo' column of `bills`.
    summaryInfo = ' '.join([summary_row['billType'], summary_row['billNumber']])
    if summaryInfo not in first_bill_by_info.index:
        continue  # no bill for this summary
    bill_row = first_bill_by_info.loc[summaryInfo]
    # Summary columns first, then bill columns, as before.
    new_dict = {col: summary_row[col] for col in summaries_sel}
    new_dict.update((col, bill_row[col]) for col in bills_sel)
    matched3.append(new_dict)

In [66]:
# Turn the list of per-match dicts into a table (one row per dict).
matched3_df = pd.DataFrame(matched3)

In [67]:
# Persist the result of the third merge.
matched3_df.to_pickle('data/matched3.pkl')