In [54]:
import os
import re

import pandas as pd

from scraping.utils.scraping_utils import get_html

In [55]:
# Address of the House.gov page that lists all representatives.
house_url = 'https://www.house.gov/representatives'
# Fetch the page through the project's get_html helper. The result is queried
# with .find(class_=...) further down, so it is presumably a parsed
# BeautifulSoup document -- TODO confirm against scraping_utils.
representatives_page = get_html(house_url)


In [56]:
# Pattern for the ``id`` attribute of the per-state blocks on the page.
block_id_regex = re.compile('housegov_reps_by_state-block_default-.*')

# Container element whose direct children are the per-state blocks.
a = representatives_page.find(class_='view-content')
if a is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError in a later cell.
    raise ValueError(
        f"Could not find the 'view-content' container in the page fetched from {house_url}"
    )

In [57]:
def extract_representative_data_from_row(row, state_name):
    """Build one representative record from a row of a state's table.

    Args:
        row: A table-row element (BeautifulSoup ``Tag``) whose direct cells
            are, in order: district, name, party, office room, phone and
            committee assignments. The name cell is expected to contain the
            link to the representative's page.
        state_name: Name of the state/territory the row belongs to.

    Returns:
        dict with the keys ``state``, ``district``, ``name``, ``party``,
        ``office_room``, ``phone``, ``committee`` and ``page_link``.
        Cells that are empty or missing are reported as ``''``; a missing
        link is reported as ``''`` as well.
    """
    field_names = ('district', 'name', 'party', 'office_room', 'phone', 'committee')

    # Select the element cells directly instead of iterating over all children:
    # whitespace text nodes are skipped without also discarding cells that are
    # genuinely empty, so an empty cell can no longer shift the later columns.
    cells = row.find_all(['td', 'th'], recursive=False)
    cells_content = [cell.text.strip() for cell in cells]
    # Pad short rows so every field is present.
    cells_content += [''] * (len(field_names) - len(cells_content))

    representative = {'state': state_name}
    # zip() stops after the last field name, so extra cells are ignored.
    representative.update(zip(field_names, cells_content))

    # The link lives in the name cell (second cell of the row).
    name_link = cells[1].a if len(cells) > 1 else None
    representative['page_link'] = name_link.get('href', '') if name_link is not None else ''
    return representative

def extract_state_representatives(state_table):
    """Turn one state's table into a list of representative records.

    The state name is read from the table caption and printed as a progress
    marker. Every ``<tr>`` except the first one (presumably the header row)
    is passed to ``extract_representative_data_from_row``.

    Args:
        state_table: Element containing a ``caption`` and the ``tr`` rows
            for a single state.

    Returns:
        list of dicts, one per parsed row.
    """
    state_name = state_table.caption.text.strip()
    print(state_name)

    representatives_in_state = []
    for data_row in state_table.find_all('tr')[1:]:
        record = extract_representative_data_from_row(data_row, state_name)
        representatives_in_state.append(record)
    return representatives_in_state

# Per-state blocks are the direct children of the container whose id matches
# the block pattern.
state_tables = a.find_all(id=block_id_regex, recursive=False)

# Flatten the per-state record lists into one list of representatives.
representatives = []
for state_table in state_tables:
    representatives += extract_state_representatives(state_table)
        

Alabama
Alaska
American Samoa
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
District of Columbia
Florida
Georgia
Guam
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
New Hampshire
New Jersey
New Mexico
New York
North Carolina
North Dakota
Northern Mariana Islands
Ohio
Oklahoma
Oregon
Pennsylvania
Puerto Rico
Rhode Island
South Carolina
South Dakota
Tennessee
Texas
Utah
Vermont
Virginia
Virgin Islands
Washington
West Virginia
Wisconsin
Wyoming


In [58]:
representatives_df = pd.DataFrame(representatives)

# Rows whose link points at clerk.house are removed (presumably seats without
# a representative page of their own -- verify).
# regex=False makes '.' a literal dot rather than "any character";
# na=False treats a missing link as "no match" so the mask stays boolean.
links_to_clerk = representatives_df['page_link'].str.contains(
    'clerk.house', regex=False, na=False
)
to_drop = representatives_df[links_to_clerk].index

# Re-assign instead of dropping in place, so re-running the cell is idempotent.
representatives_df = representatives_df.drop(to_drop)
representatives_df

Unnamed: 0,state,district,name,party,office_room,phone,committee,page_link
0,Alabama,1st,"Carl, Jerry",R,1330 LHOB,(202) 225-4931,Appropriations|Natural Resources,https://carl.house.gov
1,Alabama,2nd,"Moore, Barry",R,1504 LHOB,(202) 225-2901,Agriculture|Judiciary,https://barrymoore.house.gov
2,Alabama,3rd,"Rogers, Mike",R,2469 RHOB,(202) 225-3261,Armed Services,https://mikerogers.house.gov/
3,Alabama,4th,"Aderholt, Robert",R,266 CHOB,(202) 225-4876,Appropriations,https://aderholt.house.gov/
4,Alabama,5th,"Strong, Dale",R,1337 LHOB,(202) 225-4801,"Armed Services|Homeland Security|Science, Spac...",https://strong.house.gov
...,...,...,...,...,...,...,...,...
435,Wisconsin,4th,"Moore, Gwen",D,2252 RHOB,(202) 225-4572,Ways and Means,https://gwenmoore.house.gov
436,Wisconsin,5th,"Fitzgerald, Scott",R,1507 LHOB,(202) 225-5101,Financial Services|Judiciary,https://fitzgerald.house.gov
437,Wisconsin,6th,"Grothman, Glenn",R,1511 LHOB,(202) 225-2476,Budget|Education and the Workforce|Oversight a...,https://grothman.house.gov
438,Wisconsin,7th,"Tiffany, Thomas",R,451 CHOB,(202) 225-3365,Natural Resources|Judiciary,https://tiffany.house.gov/


In [59]:
# Sanity check: number of missing values in each column.
missing_per_column = representatives_df.isna().sum()
missing_per_column

state          0
district       0
name           0
party          0
office_room    0
phone          0
committee      0
page_link      0
dtype: int64

In [60]:
# Pick one representative per (state, party) pair.
# NOTE: GroupBy.first() takes the first non-null entry of each column within a
# group; with no missing values this is simply the first row of the group.
by_state_and_party = representatives_df.groupby(['state', 'party'])
chosen_representatives_df = by_state_and_party.first()

# Display only: the reset_index() result is not assigned, so
# chosen_representatives_df keeps (state, party) as its index.
chosen_representatives_df.reset_index()

Unnamed: 0,state,party,district,name,office_room,phone,committee,page_link
0,Alabama,D,7th,"Sewell, Terri",1035 LHOB,(202) 225-2665,Armed Services|House Administration|Joint Comm...,https://sewell.house.gov/
1,Alabama,R,1st,"Carl, Jerry",1330 LHOB,(202) 225-4931,Appropriations|Natural Resources,https://carl.house.gov
2,Alaska,D,At Large,"Peltola, Mary",153 CHOB,(202) 225-5765,Natural Resources|Transportation and Infrastru...,https://peltola.house.gov
3,American Samoa,R,Delegate,"Radewagen, Aumua Amata",2001 RHOB,(202) 225-8577,Foreign Affairs|Natural Resources|Veterans' Af...,https://radewagen.house.gov
4,Arizona,D,3rd,"Gallego, Ruben",1114 LHOB,(202) 225-4065,Armed Services|Natural Resources,https://rubengallego.house.gov/
...,...,...,...,...,...,...,...,...
80,Washington,R,4th,"Newhouse, Dan",504 CHOB,(202) 225-5816,Appropriations|Select Comm on the Strategic Co...,https://newhouse.house.gov
81,West Virginia,R,1st,"Miller, Carol",465 CHOB,(202) 225-3452,Ways and Means,https://miller.house.gov/
82,Wisconsin,D,2nd,"Pocan, Mark",1026 LHOB,(202) 225-2906,Appropriations,https://pocan.house.gov
83,Wisconsin,R,1st,"Steil, Bryan",1526 LHOB,(202) 225-3031,Financial Services|House Administration|Joint ...,https://steil.house.gov


In [61]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
os.makedirs('data', exist_ok=True)

# Both frames are written with their index: the original row labels for the
# full table, and the (state, party) index for the chosen representatives.
representatives_df.to_csv('data/representatives.csv')
chosen_representatives_df.to_csv('data/chosen_representatives.csv')