### Get Committee Names and Codes from Clerk of the House
Works for the current Congress (for a prior Congress, point the request at a web.archive.org snapshot of the page instead).

In [8]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd


# Scrape committee / subcommittee names and their 4-character codes from the
# Clerk of the House committee listing and collect them in a DataFrame `df`
# with the columns 'Committee' and 'Code'.
#
# Live page (current Congress): https://clerk.house.gov/committees
# A Wayback Machine snapshot is requested below instead.
COMMITTEES_URL = 'https://web.archive.org/web/20221205011459/https://clerk.house.gov/committees'

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the listing.
response = requests.get(COMMITTEES_URL, timeout=60)
response.raise_for_status()

# Parse the HTML content and collect every anchor tag
soup = BeautifulSoup(response.text, 'html.parser')
anchor_tags = soup.find_all('a')

# Compiled once, outside the loop.
# code_pattern: a 4-character code at the very end of the href.
# full_committee_pattern: an href ending in a code whose last two chars are '00'.
code_pattern = re.compile(r'/([A-Z0-9]{4})$')
full_committee_pattern = re.compile(r'/[A-Z0-9]{2}00$')

# Parallel lists: committee_names[i] belongs to codes[i]
committee_names = []
codes = []

for tag in anchor_tags:
    href = tag.get('href')

    # Only links that point at a committee or subcommittee page are relevant
    if not href or not ('/committees/' in href or '/subcommittees/' in href):
        continue

    code_match = code_pattern.search(href)
    if not code_match:
        continue

    committee_code = code_match.group(1)
    link_text = tag.text.strip()

    # Decide on the extracted code rather than on the whole href: the href of
    # an archived page also carries a timestamp that may itself contain '00'.
    if committee_code.endswith('00'):
        # Full committee: use the link text as-is
        label = link_text
    else:
        # Subcommittee: the closest preceding full-committee link is taken as
        # its parent committee
        label = 'Subcommittee on ' + link_text
        parent_tag = tag.find_previous('a', href=full_committee_pattern)
        if parent_tag is not None:
            parent_name = parent_tag.text.strip()
            # Avoid "Committee on Select Committee on ..." style duplication
            if 'Committee on ' not in parent_name:
                parent_name = 'Committee on ' + parent_name
            label += ' (' + parent_name + ')'
        # If no parent link is found the subcommittee is still recorded
        # (without a parent suffix) so names and codes never get out of step.

    # Always append name and code together to keep the two lists aligned
    committee_names.append(label)
    codes.append(committee_code)

# Create a DataFrame
df = pd.DataFrame({
    'Committee': committee_names,
    'Code': codes
})

# Remove quotation marks from the names (literal replacement, not a regex)
df['Committee'] = df['Committee'].str.replace('"', '', regex=False)

# Display the DataFrame (display() is provided by the notebook environment)
display(df)

Unnamed: 0,Committee,Code
0,Agriculture,AG00
1,Appropriations,AP00
2,Armed Services,AS00
3,Budget,BU00
4,Education and Labor,ED00
...,...,...
158,Subcommittee on Strategic Technologies and Adv...,IG10
159,Select Committee on the Climate Crisis,CN00
160,Select Committee on Economic Disparity and Fai...,EF00
161,Select Committee on the Modernization of Congress,MH00


In [6]:
# Append the previously saved table and the manually maintained additions to
# the scraped DataFrame, drop exact duplicate rows, and write the result back.
# NOTE(review): the earlier comment here referred to
# 118/118comcode_manualadd.csv, but the paths below are the 117 files —
# confirm which Congress is intended.
#
# Presumably a one-off way to seed the output file for another Congress — TODO confirm:
# df.to_csv('replacement118.csv', index=False)

extra_sources = ('replacement117.csv', '117/117comcode_manualadd.csv')
for csv_path in extra_sources:
    loaded = pd.read_csv(csv_path)
    df = pd.concat([df, loaded], ignore_index=True)

# Rows identical across all columns are collapsed to one before saving
df = df.drop_duplicates()
df.to_csv('replacement117.csv', index=False)