In [1]:
# this pulls hearings for one day
def hearingpull(date):
    import requests 
    import pandas as pd
    from lxml import html    
    from datetime import datetime,timedelta

    #formatting url
    day_url = "https://docs.house.gov/Committee/Calendar/ByDay.aspx?DayID=" + date.strftime("%m%d%Y")

    #import webpage and create tree
    webpage = requests.get(day_url)
    tree = html.fromstring(webpage.content)

    #importing and formatting hearing titles
    hearing_titles = tree.xpath("//table//a[@title]//text()[normalize-space()]")
    hearing_titles = [sub.replace('\r\n', '') for sub in hearing_titles] 
    hearing_titles = [sub.strip() for sub in hearing_titles]
    hearing_titles = [sub.encode("ascii", "replace").decode("utf-8") for sub in hearing_titles]
    hearing_titles = [str(sub).replace("???"," ") for sub in hearing_titles]
    hearing_titles = [str(sub).replace("'","") for sub in hearing_titles]
    hearing_titles = [str(sub).replace('"',"") for sub in hearing_titles]

    #import and format committee titles
    committee_titles = tree.xpath("//table//span[@title]//text()[normalize-space()]")
    committee_titles = [sub.replace('\r\n', '') for sub in committee_titles] 
    committee_titles = [sub.strip() for sub in committee_titles]
    committee_titles = [sub.encode("ascii", "replace").decode("utf-8") for sub in committee_titles]
    committee_titles = [str(sub).replace("???"," ") for sub in committee_titles]
    committee_titles = [str(sub).replace("'","") for sub in committee_titles]
    committee_titles = [str(sub).replace('"',"") for sub in committee_titles]


    #import and format links
    link_extension = tree.xpath("//table//a//@href")
    link_extension = ["https://docs.house.gov/Committee/Calendar/"+ex for ex in link_extension]

    #import times and dates
    times = list()
    dates = list()

    for hearing in range(len(link_extension)):
        hearing_link = requests.get(link_extension[hearing])
        hearing_tree = html.fromstring(hearing_link.content)
        try:
            datestring = hearing_tree.xpath("//div[@class='meeting-date']//p/text()[normalize-space()]")
            #get dates
            date = datestring[0][0:datestring[0].find("(")-1]
            date = date.replace('\r\n', '')
            date = date.strip()
            #get times
            time = datestring[0][datestring[0].find("("):len(datestring[0])]
            time = time.replace('\r\n', '')
            time = time.replace('(', '')
            time = time.replace(')', '')
            if time.find("-") > 0:
                time = time[0:time.find("-")]
            time = time.strip()
        except:
            date = tree.xpath("//div[@id='body']//span[@id='LabelPageTitle']//text()[normalize-space()]")[0]
            times = tree.xpath("//div//table[@class='table table-bordered']//tr//td[2]//span[@class='text-small']//text()[normalize-space()]")
            times = [sub.replace('\r\n', '') for sub in times] 
            times = [sub.strip() for sub in times]
            times = [sub.encode("ascii", "replace").decode("utf-8") for sub in times]
            times = [str(sub).replace("???"," ") for sub in times]
            times = [str(sub).replace("'","") for sub in times]
            times = [str(sub).replace('"',"") for sub in times]
            time = times[hearing]
        #append
        dates.append(date)
        times.append(time)

    #zipping into single dataframe
    day_results = pd.DataFrame(zip(dates,committee_titles,hearing_titles,times,link_extension),columns=["Date","Committee","Hearing Title","Time","Link"])

    return(day_results)

#this pulls hearings for a date range
def gethearingrange(datestart,dateend):
    from datetime import datetime, timedelta
    import pandas as pd
    import re
    
    results = pd.DataFrame(columns=["Date","Committee","Hearing Title","Time","Link"])

    datestart = datetime.strptime(datestart,"%m/%d/%Y").date()
    dateend = datetime.strptime(dateend,"%m/%d/%Y").date()
    
    while datestart <= dateend:
        results = results.append(hearingpull(datestart))
        datestart += timedelta(days=1)
    
    #remove misc. spaces from committee column
    results["Committee"] = [re.sub(' +', ' ',com) for com in results["Committee"]]
    
    results["Time"] = [time.replace("local time","") for time in results["Time"]]
    results["Time"] = [time.strip() for time in results["Time"]]
    
    results = results.drop_duplicates(subset=["Link"])
    return(results)

In [2]:
# PART ONE: GATHER COMMITTEE ASSIGNMENTS (this part takes forever if you're doing pre-116)

def getassignments(congress):
    import requests 
    import pandas as pd
    from lxml import html    
    from datetime import datetime, timedelta
    
    Com_Dict = {116:"",
               115:"https://web.archive.org/web/20181226060001/",
               114:"https://web.archive.org/web/20161203054650/",
               113:"https://web.archive.org/web/20141205200618/"}



    member_data = pd.DataFrame()

    clerk = Com_Dict.get(congress)+"http://clerk.house.gov/committee_info/index.aspx"
    webpage = requests.get(clerk)
    tree = html.fromstring(webpage.content)

    com_titles = tree.xpath("//div[@id='com_directory']//ul//li//a//text()")
    com_links = tree.xpath("//div[@id='com_directory']//ul//li//a//@href")

    ComLink_Dict = {116:"http://clerk.house.gov",
               115:"https://web.archive.org/",
               114:"https://web.archive.org/",
               113:"https://web.archive.org/"}

    com_links = [ComLink_Dict.get(congress)+end for end in com_links]
    com_codes = [title[title.find("=")+1:len(title)] for title in com_links]

    for com in range(len(com_links)):
        singlecom = requests.get(com_links[com])
        tree = html.fromstring(singlecom.content)

        members = tree.xpath("//div[@id='primary_group' or @id='secondary_group']//ol//li/a/text()")
        members = [sub.encode("ascii", "replace").decode("utf-8") for sub in members]
        members = [str(member).replace("??","e") for member in members]

        member_data = member_data.append(pd.DataFrame(members,columns=[com_codes[com]]).transpose())

        subcom_links = tree.xpath("//div[@id='subcom_list']//ul//li//@href")
        subcom_links = [ComLink_Dict.get(congress)+end for end in subcom_links]
        subcom_links = [link.replace("///","/") for link in subcom_links]

        subcom_codes = [title[title.find("=")+1:len(title)] for title in subcom_links]

        for subcom in range(len(subcom_links)):
            single_subcom = requests.get(subcom_links[subcom])
            tree = html.fromstring(single_subcom.content)
            members = tree.xpath("//div[@id='primary_group' or @id='secondary_group']//ol//li/a/text()")
            members = [sub.encode("ascii", "replace").decode("utf-8") for sub in members]
            members = [str(member).replace("??","e") for member in members]
            member_data = member_data.append(pd.DataFrame(members,columns=[subcom_codes[subcom]]).transpose())
            
    return(member_data)

In [3]:
# PART TWO: GET COMMITTEE CODES FOR HEARINGS, GATHER HEARING DATA
def gethearingdata(congress,member_data):
    import pandas as pd
    from datetime import datetime, timedelta
    
    dates_start = {116:"01/01/2019",
                   115:"01/01/2017",
                   114:"01/01/2015",
                   113:"01/01/2013"}
    dates_end = {116:"12/31/2019",
                   115:"12/31/2018",
                   114:"12/31/2016",
                   113:"12/31/2014"}
    hearing_data = gethearingrange(dates_start.get(congress),dates_end.get(congress))

    #import replacement (comcode) files

    replacement = pd.read_csv(str("https://raw.githubusercontent.com/rachelorey/Scheduling-Conflicts-in-Congress/master/replacement"+str(congress)+".csv"))

    # #drop all comcodes without member assignments from clerk.gov
    codes_to_drop = [value for value in replacement["Code"].unique() if value not in member_data.index.unique()]
    replacement = replacement[~replacement["Code"].isin(codes_to_drop)]

    #convert committees to lowercase for merging
    hearing_data["committee-low"] = hearing_data["Committee"].str.lower()
    replacement["committee-low"] = replacement["Committee"].str.lower()
    
    #drop original column in replacement df
    replacement.drop(["Committee"],axis=1,inplace=True)
    
    #merge codes and names
    hearing_data = pd.merge(hearing_data,replacement,on="committee-low",how="left")
    
    #drop lowercase column
    hearing_data.drop(["committee-low"],axis=1,inplace=True)
    
    return(hearing_data)

def testmatches(hearing_data):
    import pandas as pd
    match = pd.DataFrame(hearing_data[hearing_data["Code"].isna()]["Committee"].unique())
    return(match)

In [4]:
#PART THREE: LOOK FOR SCHEDULING CONFLICTS

def getconflicts(member_data,hearing_data):
    import pandas as pd
    from datetime import datetime,timedelta
    import itertools

    results = pd.DataFrame(columns=["MC","Hearing 1 Code","Hearing 1 Link","Hearing 2 Code","Hearing 2 Link","Date"])
    unique_dates = hearing_data["Date"].unique()

    for unique_day in unique_dates:

        #get dataframe of all hearings in selected day
        day = hearing_data[hearing_data["Date"]==unique_day]

        #make sure there are at least two different committees meeting today
        if len(day["Code"].unique()) >= 2:

            #ADD TWO HOURS TO HEARINGS TO CREATE HEARING LENGTH

            day["Time"] = [datetime.strptime(time,"%H:%M %p") for time in day["Time"]]
            counts = day["Code"].value_counts()

            Time_2 = list()

            for index, row in day.iterrows():
                if counts.loc[row["Code"]] == 1:
                    Time_2.append(row["Time"]+timedelta(hours=2))
                elif row["Time"]+timedelta(hours=2) < day[day["Code"]==row["Code"]]["Time"].max():
                    Time_2.append(row["Time"]+timedelta(hours=2))
                elif row["Time"] == day[day["Code"]==row["Code"]]["Time"].max():
                    Time_2.append(row["Time"]+timedelta(hours=2))
                else:
                    Time_2.append(day[day["Code"]==row["Code"]]["Time"].max())

            day["Time+2"] = Time_2

            #get overlapping hearings
            combos = pd.DataFrame(itertools.combinations(day.index,2),columns = ["Hearing Code 1","Hearing Code 2"])

            overlap = list()
            for combo in range(len(combos)):
                hearing1 = combos["Hearing Code 1"][combo]
                hearing2 = combos["Hearing Code 2"][combo]
                latest_start = max(day["Time"][hearing1],day["Time"][hearing2])
                earliest_end = min(day["Time+2"][hearing1],day["Time+2"][hearing2])
                if (earliest_end - latest_start) > timedelta(hours=0):
                    overlap.append("Overlaps")
                else:
                    overlap.append("No Overlap")
            combos["Overlap"] = overlap
            combos = combos[combos["Overlap"]=="Overlaps"]
            combos.reset_index(inplace=True,drop=True)

            #for each combination of committees in one day, determine which committees conflict 
                #and then get the members that are in both

            # change time if it's same committee overlapping
            for combo in range(len(combos)):
                hearing_1 = day[day.index==combos["Hearing Code 1"][combo]]
                hearing_2 = day[day.index==combos["Hearing Code 2"][combo]]

                hearing_1.reset_index(drop=True,inplace=True)
                hearing_2.reset_index(drop=True,inplace=True)

                hearing_1 = hearing_1["Code"][0]
                hearing_2 = hearing_2["Code"][0]




            for combo in range(len(combos)):
                #get committee code for hearing
                hearing_1 = day[day.index==combos["Hearing Code 1"][combo]]
                hearing_2 = day[day.index==combos["Hearing Code 2"][combo]]

                hearing_1.reset_index(drop=True,inplace=True)
                hearing_2.reset_index(drop=True,inplace=True)

                hearing_1 = hearing_1["Code"][0]
                hearing_2 = hearing_2["Code"][0]

                #get members in relevant hearings
                try:
                    hearing_1_members = member_data[member_data.index == hearing_1].dropna(axis=1).iloc[0,:]
                    hearing_2_members = member_data[member_data.index == hearing_2].dropna(axis=1).iloc[0,:]


                    #check to make sure it is not the same committee conflicting
                    if hearing_1 != hearing_2:
                        #get members that are in both conflicting committees
                        overlapping_members = [value for value in hearing_1_members if str(value) in str(hearing_2_members)]
                        overlapping_members = pd.DataFrame(overlapping_members)
                        #if there are overlapping members, add to results
                        if len(overlapping_members)>0:
                            hearinglist = [[day.loc[combos["Hearing Code 1"][combo]]["Code"]]*len(overlapping_members),
                                       [day.loc[combos["Hearing Code 1"][combo]]["Link"]]*len(overlapping_members),
                                       [day.loc[combos["Hearing Code 2"][combo]]["Code"]]*len(overlapping_members),
                                       [day.loc[combos["Hearing Code 2"][combo]]["Link"]]*len(overlapping_members),
                                      [day.loc[combos["Hearing Code 2"][combo]]["Date"]]*len(overlapping_members)]
                            hearinglist = pd.DataFrame(hearinglist).transpose()
                            res = pd.merge(overlapping_members,hearinglist,left_index=True,right_index=True)
                            res.columns = ["MC","Hearing 1 Code","Hearing 1 Link","Hearing 2 Code","Hearing 2 Link","Date"]
                            results = results.append(res)  
                except:
                    print("Issue with: ",hearing_1," or ",hearing_2)

    results.reset_index(inplace=True,drop=True)
    return(results)

In [5]:
# PART FOUR: RUN EVERYTHING TO GET RESULTS
def runeverything(congress):    
    member_data = getassignments(congress)
    hearing_data = gethearingdata(congress,member_data)
    match = testmatches(hearing_data)
    if len(match) > 0:
        print(match)
    results = getconflicts(member_data,hearing_data)
    return(results,member_data,hearing_data)

In [6]:
congresses = [113,114,115,116]

for congress in congresses:
    
    results,member_data,hearing_data = runeverything(congress)
    results.to_csv(str("D:\\OneDrive - Bipartisan Policy Center\\Congress\\Modernization\\Scheduling Conflicts\\NEW\\Results\\results"+str(congress)+".csv"),index=False)
    hearing_data.to_csv(str("D:\\OneDrive - Bipartisan Policy Center\\Congress\\Modernization\\Scheduling Conflicts\\NEW\\Results\\hearings"+str(congress)+".csv"),index=False)
    member_data.to_csv(str("D:\\OneDrive - Bipartisan Policy Center\\Congress\\Modernization\\Scheduling Conflicts\\NEW\\Results\\assignments"+str(congress)+".csv"))


    import pandas as pd

    member_count = pd.melt(member_data).drop(["variable"],axis=1)
    member_count = member_count[~member_count["value"].isna()]
    member_count = pd.DataFrame(pd.value_counts(member_count["value"]))
    member_count.columns = ["No. Committees"]

    conflict_count = pd.DataFrame(pd.value_counts(results["MC"]))
    conflict_by_assignments = pd.merge(member_count,conflict_count,left_index = True,right_index=True)
    conflict_by_assignments.columns = ["No. Committees","No. Conflicts"]

    conflict_by_assignments.to_csv("D:\\OneDrive - Bipartisan Policy Center\\Congress\\Modernization\\Scheduling Conflicts\\NEW\\Results\\conflict_by_assignment"+str(congress)+".csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
