# House Committee Scheduling Conflicts

This scripts analyzes the House Committee Respository to collect the number of scheduling conflicts in each member's schedule.

If the same committee has two hearings in the same hour (for example, a committee hearing and a markup), only one of hearing is counted in the analysis of conflicts.

#### NOTE TO SELF - SEARCH XXX FOR INPUT CHANGES BEFORE RUN

In [1]:
import requests
import pandas as pd
from lxml import html    
from datetime import datetime, timedelta
import re    
import itertools

import os

congress = int(input("Congress (3-digit number): "))

# Specify directory for this congress
directory = str(congress)

# Create the directory if it doesn't exist
if not os.path.exists(directory):
    os.makedirs(directory)


## Setting up Formulas

In [2]:
# this pulls hearings for one day
def hearingpull(date):

    #formatting url
    day_url = "https://docs.house.gov/Committee/Calendar/ByDay.aspx?DayID=" + date.strftime("%m%d%Y")

    #import webpage and create tree
    webpage = requests.get(day_url)
    tree = html.fromstring(webpage.content)

    #importing and formatting hearing titles
    hearing_titles = tree.xpath("//table//a[@title]//text()[normalize-space()]")
    hearing_titles = [sub.replace('\r\n', '') for sub in hearing_titles] 
    hearing_titles = [sub.strip() for sub in hearing_titles]
    hearing_titles = [sub.encode("ascii", "replace").decode("utf-8") for sub in hearing_titles]
    hearing_titles = [str(sub).replace("???"," ") for sub in hearing_titles]
    hearing_titles = [str(sub).replace("'","") for sub in hearing_titles]
    hearing_titles = [str(sub).replace('"',"") for sub in hearing_titles]

    #import and format committee titles
    committee_titles = tree.xpath("//table//span[@title]//text()[normalize-space()]")
    committee_titles = [sub.replace('\r\n', '') for sub in committee_titles] 
    committee_titles = [sub.strip() for sub in committee_titles]
    committee_titles = [sub.encode("ascii", "replace").decode("utf-8") for sub in committee_titles]
    committee_titles = [str(sub).replace("???"," ") for sub in committee_titles]
    committee_titles = [str(sub).replace("'","") for sub in committee_titles]
    committee_titles = [str(sub).replace('"',"") for sub in committee_titles]

    #import and format links
    link_extension = tree.xpath("//table//a//@href")
    link_extension = ["https://docs.house.gov/Committee/Calendar/"+ex for ex in link_extension]

    #import times and dates
    times = list()
    dates = list()

    for hearing in range(len(link_extension)):
        hearing_link = requests.get(link_extension[hearing])
        hearing_tree = html.fromstring(hearing_link.content)
        try:
            datestring = hearing_tree.xpath("//div[@class='meeting-date']//p/text()[normalize-space()]")
            #get dates
            date = datestring[0][0:datestring[0].find("(")-1]
            date = date.replace('\r\n', '')
            date = date.strip()
            #get times
            time = datestring[0][datestring[0].find("("):len(datestring[0])]
            time = time.replace('\r\n', '')
            time = time.replace('(', '')
            time = time.replace(')', '')
            if time.find("-") > 0:
                time = time[0:time.find("-")]
            time = time.strip()
        except:
            date = tree.xpath("//div[@id='body']//span[@id='LabelPageTitle']//text()[normalize-space()]")[0]
            times = tree.xpath("//div//table[@class='table table-bordered']//tr//td[2]//span[@class='text-small']//text()[normalize-space()]")
            times = [sub.replace('\r\n', '') for sub in times] 
            times = [sub.strip() for sub in times]
            times = [sub.encode("ascii", "replace").decode("utf-8") for sub in times]
            times = [str(sub).replace("???"," ") for sub in times]
            times = [str(sub).replace("'","") for sub in times]
            times = [str(sub).replace('"',"") for sub in times]
            time = times[hearing]
        #append
        dates += [date]
        times += [time]

    #zipping into single dataframe
    day_results = pd.DataFrame(zip(dates,committee_titles,hearing_titles,times,link_extension),columns=["Date","Committee","Hearing Title","Time","Link"])

    return(day_results)

#this pulls hearings for a date range
def gethearingrange(datestart,dateend):

    results = pd.DataFrame(columns=["Date","Committee","Hearing Title","Time","Link"])

    datestart = datetime.strptime(datestart,"%m/%d/%Y").date()
    dateend = datetime.strptime(dateend,"%m/%d/%Y").date()
    
    #if dateend is in the future, make dateend today
    if dateend > datetime.now().date():
        dateend = datetime.now().date()

    while datestart <= dateend:
        # results = results.append(hearingpull(datestart)) #append dep
        results = pd.concat([results, hearingpull(datestart)], ignore_index=True)
        datestart += timedelta(days=1)
    
    #remove misc. spaces from committee column
    results["Committee"] = [re.sub(' +', ' ',com) for com in results["Committee"]]
    
    results["Time"] = [time.replace("local time","") for time in results["Time"]]
    results["Time"] = [time.strip() for time in results["Time"]]
    
    results = results.drop_duplicates(subset=["Link"])
    
    results["Hour"] = [datetime.strptime(time,"%I:%M %p").hour for time in results["Time"]]
    results.drop_duplicates(subset=["Date","Committee","Hour"],inplace=True)

    return(results)

### PART ONE: GATHER COMMITTEE ASSIGNMENTS 
This part takes forever if you're doing prior congresses.

In [3]:
def getassignments(congress):

    ##uses web archive to get committee assignments for each congress (archive not needed for current congress)
    Com_Dict = {118:"",
                117:"https://web.archive.org/web/20221019171459", #XXX
                116:"https://web.archive.org/web/20201026031027/",
                115:"https://web.archive.org/web/20181026031027/"}


    member_data = pd.DataFrame()
    clerk = Com_Dict.get(congress)+"https://clerk.house.gov/committees"

    webpage = requests.get(clerk)
    tree = html.fromstring(webpage.content)


    com_titles = tree.xpath("//div[@class='col-sm-11 col-xs-10 library-committeePanel-heading']//a//text()")
    com_links = tree.xpath("//div[@class='col-sm-11 col-xs-10 library-committeePanel-heading']//a//@href")

    ComLink_Dict = {118:"http://clerk.house.gov",#XXX
                    117:"https://web.archive.org/",
                    116:"https://web.archive.org/",
                    115:"https://web.archive.org/"}

    com_links = [ComLink_Dict.get(congress)+end for end in com_links]
    com_links = com_links + ["http://clerk.house.gov/committees/VC00"]
    com_codes = [title[len(title)-4:len(title)] for title in com_links]


    for com in range(len(com_links)):

        singlecom = requests.get(com_links[com])
        tree = html.fromstring(singlecom.content)

        members = tree.xpath("//ul[@id='majority-members' or @id='minority-members']//li/a/span/text()")
        members = [sub.encode("ascii", "replace").decode("utf-8") for sub in members]
        members = [str(member).replace("??","e") for member in members]
        members = [str(member).replace("?","e") for member in members]

        # member_data = member_data.append(pd.DataFrame(members,columns=[com_codes[com]]).transpose()) #APPEND DEPRECATED
        member_data = pd.concat([member_data, pd.DataFrame(members,columns=[com_codes[com]]).transpose()])

        subcom_links = tree.xpath("//section[@class='subcommittees']//ul[@class='library-list_ul']//li//a/@href")
        subcom_links = [ComLink_Dict.get(congress)+end for end in subcom_links]
        subcom_links = [link.replace("///","/") for link in subcom_links]


        subcom_codes = [title[len(title)-4:len(title)] for title in subcom_links]

        for subcom in range(len(subcom_links)):
            single_subcom = requests.get(subcom_links[subcom])
            tree = html.fromstring(single_subcom.content)
            members = tree.xpath("//ul[@id='majority-members' or @id='minority-members']//li/a/span/text()")
            members = [sub.encode("ascii", "replace").decode("utf-8") for sub in members]
            members = [str(member).replace("??","e") for member in members]
            members = [str(member).replace("?","e") for member in members]
            # member_data = member_data.append(pd.DataFrame(members,columns=[subcom_codes[subcom]]).transpose()) #APPEND DEPRECATED
            member_data = pd.concat([member_data, pd.DataFrame(members,columns=[subcom_codes[subcom]]).transpose()])
    
    member_data.to_csv(directory+"/assignments.csv")
    return(member_data)

### PART TWO: GET COMMITTEE CODES FOR HEARINGS, GATHER HEARING DATA


In [4]:
def gethearingdata(congress,member_data):

    dates_start = {118:"01/03/2023",
                   117:"01/03/2021",
                   116:"01/03/2019",
                   115:"01/03/2017"}
    
    dates_end = {118:"01/02/2025",#XXX
                 117:"01/02/2023",
                 116:"01/02/2021",
                 115:"01/02/2019"}
    
    hearing_data = gethearingrange(dates_start.get(congress),dates_end.get(congress))

    #import replacement (comcode) files

    replacement = pd.read_csv(str("https://raw.githubusercontent.com/rachelorey/Scheduling-Conflicts-in-Congress/master/replacement"+str(congress)+".csv"))

    # #drop all comcodes without member assignments from clerk.gov
    codes_to_drop = [value for value in replacement["Code"].unique() if value not in member_data.index.unique()]
    replacement = replacement[~replacement["Code"].isin(codes_to_drop)]

    #convert committees to lowercase for merging
    hearing_data["committee-low"] = hearing_data["Committee"].str.lower()
    replacement["committee-low"] = replacement["Committee"].str.lower()
    
    #drop original column in replacement df
    replacement.drop(["Committee"],axis=1,inplace=True)
    
    #merge codes and names
    hearing_data = pd.merge(hearing_data,replacement,on="committee-low",how="left")
    
    #drop lowercase column
    hearing_data.drop(["committee-low"],axis=1,inplace=True)

    hearing_data.to_csv(directory+"/hearings.csv",index=False)
    
    return(hearing_data)

def testmatches(hearing_data):
    import pandas as pd
    match = pd.DataFrame(hearing_data[hearing_data["Code"].isna()]["Committee"].unique())
    match.to_csv(directory+"/match.csv",index=False)
    return(match)

In [13]:
hearing_data = pd.read_csv('118/hearings.csv')
#import replacement (comcode) files

replacement = pd.read_csv("replacement"+str(congress)+".csv")

#convert committees to lowercase for merging
hearing_data["committee-low"] = hearing_data["Committee"].str.lower()
replacement["committee-low"] = replacement["Committee"].str.lower()

#drop original column in replacement df
replacement.drop(["Committee"],axis=1,inplace=True)

#merge codes and names
hearing_data = pd.merge(hearing_data,replacement,on="committee-low",how="left")

#drop lowercase column
hearing_data.drop(["committee-low"],axis=1,inplace=True)

# hearing_data.to_csv(directory+"/hearings.csv",index=False)

# testmatches(hearing_data)

Unnamed: 0,Date,Committee,Hearing Title,Time,Link,Hour,Code_x,Code_y
0,"Monday, January 30, 2023",Committee on Rules,Organizational Meeting for the 118th Congress,4:30 PM,https://docs.house.gov/Committee/Calendar/ByEv...,16,RU00,RU00
1,"Monday, January 30, 2023",Committee on Rules,H.J. Res. 7 ? Relating to a national emergency...,5:00 PM,https://docs.house.gov/Committee/Calendar/ByEv...,17,RU00,RU00
2,"Tuesday, January 31, 2023",Committee on Ways and Means,Committee Organizational Meeting for the 118th...,10:00 AM,https://docs.house.gov/Committee/Calendar/ByEv...,10,WM00,WM00
3,"Wednesday, February 1, 2023",Committee on the Judiciary,Organizing of the House Committee on the Judic...,9:00 AM,https://docs.house.gov/Committee/Calendar/ByEv...,9,JU00,JU00
4,"Tuesday, January 31, 2023",Committee on Oversight and Accountability,Full Committee Organizational Meeting of the H...,11:00 AM,https://docs.house.gov/Committee/Calendar/ByEv...,11,GO00,GO00
...,...,...,...,...,...,...,...,...
1756,"Thursday, June 13, 2024",Subcommittee on Highways and Transit (Committe...,"Revenue, Ridership, and Post-Pandemic Lessons ...",11:00 AM,https://docs.house.gov/Committee/Calendar/ByEv...,11,,PW12
1757,"Thursday, June 13, 2024",Committee on the Budget,Medicare and Social Security: Examining Solven...,11:15 AM,https://docs.house.gov/Committee/Calendar/ByEv...,11,BU00,BU00
1758,"Thursday, June 13, 2024","Subcommittee on Environment, Manufacturing, an...",?Securing America?s Critical Materials Supply ...,11:30 AM,https://docs.house.gov/Committee/Calendar/ByEv...,11,,IF18
1759,"Thursday, June 13, 2024",Subcommittee on Government Operations and the ...,?What We Have Here?is a Failure to Collaborate...,11:30 AM,https://docs.house.gov/Committee/Calendar/ByEv...,11,,GO24


In [22]:
testmatches(hearing_data)

Unnamed: 0,0
0,Committee on Veterans Affairs
1,"Subcommittee on Cyber, Information Technologie..."
2,Select Committee on the Strategic Competition ...
3,"Subcommittee on Environment, Manufacturing, an..."
4,Subcommittee on Technology Modernization (Comm...
5,Subcommittee on Economic Opportunity (Committe...
6,"Subcommittee on the Middle East, North Africa,..."
7,Subcommittee on Oversight and Investigations (...
8,Subcommittee on the National Intelligence Ente...
9,"Subcommittee on National Security, Illicit Fin..."


### PART THREE: LOOK FOR SCHEDULING CONFLICTS

In [5]:
def getconflicts(member_data,hearing_data):

    results = pd.DataFrame(columns=["MC","Hearing 1 Code","Hearing 1 Link","Hearing 2 Code","Hearing 2 Link","Date"])
    unique_dates = hearing_data["Date"].unique()

    for unique_day in unique_dates:

        #get dataframe of all hearings in selected day
        day = hearing_data[hearing_data["Date"]==unique_day]

        #make sure there are at least two different committees meeting today
        if len(day["Code"].unique()) >= 2:

            #ADD TWO HOURS TO HEARINGS TO CREATE HEARING LENGTH

            day["Time"] = [datetime.strptime(time,"%H:%M %p") for time in day["Time"]]
            counts = day["Code"].value_counts()

            Time_2 = list()


            ### XXX Note to self - look into why i set it up this way before rather than just adding two hours across the board
            for index, row in day.iterrows():
                if counts.loc[row["Code"]] == 1:
                    Time_2.append(row["Time"]+timedelta(hours=2))
                elif row["Time"]+timedelta(hours=2) < day[day["Code"]==row["Code"]]["Time"].max():
                    Time_2.append(row["Time"]+timedelta(hours=2))
                elif row["Time"] == day[day["Code"]==row["Code"]]["Time"].max():
                    Time_2.append(row["Time"]+timedelta(hours=2))
                else:
                    Time_2.append(day[day["Code"]==row["Code"]]["Time"].max())


            day["Time+2"] = Time_2

            #get overlapping hearings
            combos = pd.DataFrame(itertools.combinations(day.index,2),columns = ["Hearing Code 1","Hearing Code 2"])

            overlap = list()
            for combo in range(len(combos)):
                hearing1 = combos["Hearing Code 1"][combo]
                hearing2 = combos["Hearing Code 2"][combo]
                latest_start = max(day["Time"][hearing1],day["Time"][hearing2])
                earliest_end = min(day["Time+2"][hearing1],day["Time+2"][hearing2])
                if (earliest_end - latest_start) > timedelta(hours=0):
                    overlap.append("Overlaps")
                else:
                    overlap.append("No Overlap")
            combos["Overlap"] = overlap
            combos = combos[combos["Overlap"]=="Overlaps"]
            combos.reset_index(inplace=True,drop=True)

            #for each combination of committees in one day, determine which committees conflict 
                #and then get the members that are in both

            # change time if it's same committee overlapping
            for combo in range(len(combos)):
                hearing_1 = day[day.index==combos["Hearing Code 1"][combo]]
                hearing_2 = day[day.index==combos["Hearing Code 2"][combo]]

                hearing_1.reset_index(drop=True,inplace=True)
                hearing_2.reset_index(drop=True,inplace=True)

                hearing_1 = hearing_1["Code"][0]
                hearing_2 = hearing_2["Code"][0]




            for combo in range(len(combos)):
                #get committee code for hearing
                hearing_1 = day[day.index==combos["Hearing Code 1"][combo]]
                hearing_2 = day[day.index==combos["Hearing Code 2"][combo]]

                hearing_1.reset_index(drop=True,inplace=True)
                hearing_2.reset_index(drop=True,inplace=True)

                hearing_1 = hearing_1["Code"][0]
                hearing_2 = hearing_2["Code"][0]

                #get members in relevant hearings
                try:
                    hearing_1_members = member_data[member_data.index == hearing_1].dropna(axis=1).iloc[0,:]
                    hearing_2_members = member_data[member_data.index == hearing_2].dropna(axis=1).iloc[0,:]


                    #check to make sure it is not the same committee conflicting
                    if hearing_1 != hearing_2:
                        #get members that are in both conflicting committees
                        overlapping_members = [value for value in hearing_1_members if str(value) in str(hearing_2_members)]
                        overlapping_members = pd.DataFrame(overlapping_members)
                        #if there are overlapping members, add to results
                        if len(overlapping_members)>0:
                            hearinglist = [[day.loc[combos["Hearing Code 1"][combo]]["Code"]]*len(overlapping_members),
                                       [day.loc[combos["Hearing Code 1"][combo]]["Link"]]*len(overlapping_members),
                                       [day.loc[combos["Hearing Code 2"][combo]]["Code"]]*len(overlapping_members),
                                       [day.loc[combos["Hearing Code 2"][combo]]["Link"]]*len(overlapping_members),
                                      [day.loc[combos["Hearing Code 2"][combo]]["Date"]]*len(overlapping_members)]
                            hearinglist = pd.DataFrame(hearinglist).transpose()
                            res = pd.merge(overlapping_members,hearinglist,left_index=True,right_index=True)
                            res.columns = ["MC","Hearing 1 Code","Hearing 1 Link","Hearing 2 Code","Hearing 2 Link","Date"]
                            results = pd.concat([results, res])
                            #results = results.append(res)  

                except:
                    print("Issue with: ",hearing_1," or ",hearing_2)

    results.reset_index(inplace=True,drop=True)
    results.to_csv(directory+"/results.csv",index=False)
    return(results)

### PART FOUR: RUN EVERYTHING TO GET RESULTS

In [6]:
def runeverything(congress):

    member_data = getassignments(congress)
    print("member_data success")
    hearing_data = gethearingdata(congress,member_data)
    print("hearing_data success")
    match = testmatches(hearing_data)
    print("match success")
    if len(match) > 0:
        display([i for i in match[0]])
        return(member_data,hearing_data)
    else:
        results = getconflicts(member_data,hearing_data)
        return(results,member_data,hearing_data)

# Running Everything for Specific Congress

In [7]:
results,member_data,hearing_data = runeverything(congress)

display(results)

member_data success
hearing_data success
match success


['Committee on Veterans Affairs',
 'Subcommittee on Health (Committee on Energy and Commerce)',
 'Subcommittee on Innovation, Data, and Commerce (Committee on Energy and Commerce)',
 'Subcommittee on Oversight and Investigations (Committee on Energy and Commerce)',
 'Subcommittee on Communications and Technology (Committee on Energy and Commerce)',
 'Subcommittee on Capital Markets (Committee on Financial Services)',
 'Subcommittee on Financial Institutions and Monetary Policy (Committee on Financial Services)',
 'Subcommittee on Water Resources and Environment (Committee on Transportation and Infrastructure)',
 'Subcommittee on Intelligence and Special Operations (Committee on Armed Services)',
 'Subcommittee on Cyber, Information Technologies, and Innovation (Committee on Armed Services)',
 'Subcommittee on Oversight and Investigations (Committee on Natural Resources)',
 'Subcommittee on Energy and Mineral Resources (Committee on Natural Resources)',
 'Subcommittee on Energy, Climate

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
hearing_data

NameError: name 'hearing_data' is not defined

## Deprecated

In [None]:
# # PART ONE: GATHER COMMITTEE ASSIGNMENTS (this part takes forever if you're doing prior congresses)

# ##PRE 2020
# def getassignments(congress):
#     import requests 
#     import pandas as pd
#     from lxml import html    
#     from datetime import datetime, timedelta
    
#     Com_Dict = {117:"",
#                116:"https://web.archive.org/web/20191219040247/"}



#     member_data = pd.DataFrame()

#     clerk = Com_Dict.get(congress)+"http://clerk.house.gov/committee_info/index.aspx"
#     webpage = requests.get(clerk)
#     tree = html.fromstring(webpage.content)

#     com_titles = tree.xpath("//div[@id='com_directory']//ul//li//a//text()")
#     com_links = tree.xpath("//div[@id='com_directory']//ul//li//a//@href")

#     ComLink_Dict = {117:"http://clerk.house.gov",
#                116:"https://web.archive.org/"}

#     com_links = [ComLink_Dict.get(congress)+end for end in com_links]
#     com_codes = [title[title.find("=")+1:len(title)] for title in com_links]

#     for com in range(len(com_links)):
#         singlecom = requests.get(com_links[com])
#         tree = html.fromstring(singlecom.content)

#         members = tree.xpath("//div[@id='primary_group' or @id='secondary_group']//ol//li/a/text()")
#         members = [sub.encode("ascii", "replace").decode("utf-8") for sub in members]
#         members = [str(member).replace("??","e") for member in members]

#         member_data = member_data.append(pd.DataFrame(members,columns=[com_codes[com]]).transpose())

#         subcom_links = tree.xpath("//div[@id='subcom_list']//ul//li//@href")
#         subcom_links = [ComLink_Dict.get(congress)+end for end in subcom_links]
#         subcom_links = [link.replace("///","/") for link in subcom_links]

#         subcom_codes = [title[title.find("=")+1:len(title)] for title in subcom_links]

#         for subcom in range(len(subcom_links)):
#             single_subcom = requests.get(subcom_links[subcom])
#             tree = html.fromstring(single_subcom.content)
#             members = tree.xpath("//div[@id='primary_group' or @id='secondary_group']//ol//li/a/text()")
#             members = [sub.encode("ascii", "replace").decode("utf-8") for sub in members]
#             members = [str(member).replace("??","e") for member in members]
#             member_data = member_data.append(pd.DataFrame(members,columns=[subcom_codes[subcom]]).transpose())
            
#     return(member_data)