In [55]:
import pandas as pd
from utils.scraping_utils import get_html

In [56]:
rep_df = pd.read_csv('data\chosen_representatives.csv')
rep_df

Unnamed: 0,state,party,district,name,office_room,phone,committee,page_link
0,Alabama,D,7th,"Sewell, Terri",1035 LHOB,(202) 225-2665,Armed Services|House Administration|Joint Comm...,https://sewell.house.gov/
1,Alabama,R,1st,"Carl, Jerry",1330 LHOB,(202) 225-4931,Appropriations|Natural Resources,https://carl.house.gov
2,Alaska,D,At Large,"Peltola, Mary",153 CHOB,(202) 225-5765,Natural Resources|Transportation and Infrastru...,https://peltola.house.gov
3,American Samoa,R,Delegate,"Radewagen, Aumua Amata",2001 RHOB,(202) 225-8577,Foreign Affairs|Natural Resources|Veterans' Af...,https://radewagen.house.gov
4,Arizona,D,3rd,"Gallego, Ruben",1114 LHOB,(202) 225-4065,Armed Services|Natural Resources,https://rubengallego.house.gov/
...,...,...,...,...,...,...,...,...
80,Washington,R,4th,"Newhouse, Dan",504 CHOB,(202) 225-5816,Appropriations|Select Comm on the Strategic Co...,https://newhouse.house.gov
81,West Virginia,R,1st,"Miller, Carol",465 CHOB,(202) 225-3452,Ways and Means,https://miller.house.gov/
82,Wisconsin,D,2nd,"Pocan, Mark",1026 LHOB,(202) 225-2906,Appropriations,https://pocan.house.gov
83,Wisconsin,R,1st,"Steil, Bryan",1526 LHOB,(202) 225-3031,Financial Services|House Administration|Joint ...,https://steil.house.gov


In [57]:
def extract_issues_wrapper(rep_link, extract_issues):
    issues_link = rep_link + 'issues'
    issues_html = get_html(issues_link)
    if issues_html is None:
        return []
    
    issues = extract_issues(issues_html, rep_link)
    return issues

def extract_article_links_wrapper(base_link, issues, extract_article_links):
    article_links = []
    for issue in issues:
        issue_article_links = extract_article_links(issue, base_link)
        if issue_article_links is not None:
            article_links.extend([{'issue': issue['name'], 'link': article_link} for article_link in issue_article_links])
    return article_links

def extract_articles_wrapper(article_links, extract_articles):
    articles = []
    for article_link in article_links:
        print(article_link)
        article = extract_articles(article_link)
        if article is not None:
            articles.append(article)
    return articles

def extract_articles(rep_link, extract_issues, extract_article_links, extract_article):
    if rep_link[-1] != '/':
        rep_link += '/'
    issues = extract_issues_wrapper(rep_link, extract_issues)
    article_links = extract_article_links_wrapper(rep_link, issues, extract_article_links)
    articles = extract_articles_wrapper(article_links, extract_article)
    return articles

In [58]:
def sewell_extract_issues(issues_html, base_link):
    issues_list_element = issues_html.find(class_='issues-group')
    issue_elements = issues_list_element.find_all(class_='issues-group-item')
    issues = []
    for issue_element in issue_elements:
        issue = {
            'name': issue_element.text.strip(),
            'link': base_link + issue_element.a['href']
        }
        issues.append(issue)
    return issues

def sewell_extract_article_links(issue, base_link):
    articles_page = get_html(issue['link'])
    articles_links = []
    while True:
        articles_container = articles_page.find(class_='recordsContainer')
        article_groups = articles_container.find_all('table', class_='recordList')
        article_elements = []
        for article_group in article_groups:
            article_elements.extend(article_group.find('tbody').find_all('tr')) 

        for article_element in article_elements:
            cells = list(article_element.children)
            # date = cells[1].text.strip()
            # article_name = cells[3].text.strip()
            article_href = base_link + cells[3].a['href'].strip()
            articles_links.append(article_href)
        next_page_btn = articles_page.find('a', string='Next >')
        if next_page_btn is None or next_page_btn['href'] == '#':
            break
        next_page_link = base_link + next_page_btn['href']
        print(next_page_link)
        articles_page = get_html(next_page_link)

    return articles_links

def sewell_extract_article(article_link):
    try:
        article_page = get_html(article_link['link'])
        article_element = article_page.find('article')
        title = article_element.find(class_='title').text.strip()
        date = article_element.find(class_='date').text.strip()
        text = article_element.find(class_='post-content').get_text().strip()
    except AttributeError as e:
        print('Error on: ', article_link)
        return None
    return {'issue': article_link['issue'], 'title': title, 'date': date, 'text': text}

In [59]:
def pocan_extract_issues(issues_html, base_link):
    issues_list_element = issues_html.find(class_='evo-content')
    issue_elements = issues_list_element.find_all(class_='evo-media-object')
    issues = []
    for issue_element in issue_elements:
        issue = {
            'name': issue_element.find('a').text.strip(),
            'link': base_link + issue_element.find('a')['href']
        }
        issues.append(issue)
    return issues

def pocan_extract_article_links(issue, base_link):
    articles_page = get_html(issue['link'])
    articles_links = []
    while True:
        articles_container = articles_page.find(class_='evo-view-wrapper')
        article_elements = articles_container.find_all(class_='evo-views-row')

        for article_element in article_elements:
            body = article_element.find(class_='media-body')
            # date = cells[1].text.strip()
            # article_name = cells[3].text.strip()
            article_href = base_link + body.find('a')['href'].strip()
            articles_links.append(article_href)
        
        next_page_btn = articles_page.find(class_='page__content').find(class_='pagination').find('li', class_='pager__item--next')
        if next_page_btn is None :
            break
        next_page_btn = next_page_btn.find('a')
        
        next_page_link = issue['link'] + next_page_btn['href']
        print(next_page_link)
        articles_page = get_html(next_page_link)

    return articles_links

def pocan_extract_article(article_link):
    try:
        article_page = get_html(article_link['link'])
        article_element = article_page.find(class_='evo-content')
        title = article_element.find('h1').span.text.strip()
        date = article_element.find(class_='row').div.text.strip()
        text = article_element.find(class_='evo-press-release__body').get_text().strip()
    except AttributeError as e:
        print('Error on: ', article_link)
        return None
    return {'issue': article_link['issue'], 'title': title, 'date': date, 'text': text}

In [60]:
extraction_functions = {
    'https://sewell.house.gov/': 
        {
            'extract_issues': sewell_extract_issues,
            'extract_article_links': sewell_extract_article_links,
            'extract_article': sewell_extract_article
        },
    'https://pocan.house.gov':
        {
            'extract_issues': pocan_extract_issues,
            'extract_article_links': pocan_extract_article_links,
            'extract_article': pocan_extract_article
        }
}

In [61]:
articles = []
skip_list = []#['https://sewell.house.gov/']
for name, state, party, committee, link in zip(rep_df['name'], rep_df['state'], rep_df['party'], rep_df['committee'], rep_df['page_link']):
    if link in extraction_functions and not (link in skip_list):
        page_functions = extraction_functions[link]
        print(link)
        rep_articles = extract_articles(link, page_functions['extract_issues'], page_functions['extract_article_links'], page_functions['extract_article'])
        for rep_article in rep_articles:
            rep_article['representative_name'] = name
            rep_article['representative_state'] = state
            rep_article['representative_party'] = party
            rep_article['representative_committee'] = committee
        articles.extend(rep_articles)

https://sewell.house.gov/
https://sewell.house.gov//economy?page=2
https://sewell.house.gov//economy?page=3
https://sewell.house.gov//economy?page=4
https://sewell.house.gov//economy?page=5
https://sewell.house.gov//economy?page=6
https://sewell.house.gov//economy?page=7
https://sewell.house.gov//economy?page=8
https://sewell.house.gov//economy?page=9
https://sewell.house.gov//economy?page=10
https://sewell.house.gov//economy?page=11
https://sewell.house.gov//economy?page=12
https://sewell.house.gov//education?page=2
https://sewell.house.gov//education?page=3
https://sewell.house.gov//education?page=4
https://sewell.house.gov//education?page=5
https://sewell.house.gov//healthcare?page=2
https://sewell.house.gov//healthcare?page=3
https://sewell.house.gov//healthcare?page=4
https://sewell.house.gov//healthcare?page=5
https://sewell.house.gov//healthcare?page=6
https://sewell.house.gov//healthcare?page=7
https://sewell.house.gov//healthcare?page=8
https://sewell.house.gov//healthcare?pag

In [63]:
articles_df = pd.DataFrame.from_dict(articles)
articles_df

Unnamed: 0,issue,title,date,text,representative_name,representative_state,representative_party,representative_committee
0,Economy,Rep. Sewell Delivers $1.6 Million in Federal F...,"October 11, 2024","Birmingham, AL – Today, U.S. Rep. Terri Sewell...","Sewell, Terri",Alabama,D,Armed Services|House Administration|Joint Comm...
1,Economy,Rep. Sewell Announces 13th Annual Job Fair,"July 11, 2024",The 2024 Job Fair will be held at the Bessemer...,"Sewell, Terri",Alabama,D,Armed Services|House Administration|Joint Comm...
2,Economy,Rep. Sewell Announces $16.9 Million from Presi...,"July 9, 2024","Montgomery, AL — Today, U.S. Rep. Terri Sewell...","Sewell, Terri",Alabama,D,Armed Services|House Administration|Joint Comm...
3,Economy,Rep. Sewell Announces $14+ Million in Departme...,"June 24, 2024","Fairfield, AL — Today, U.S. Rep. Terri Sewell ...","Sewell, Terri",Alabama,D,Armed Services|House Administration|Joint Comm...
4,Economy,Rep. Sewell Hosts Seniors Resource Roundtable ...,"June 18, 2024",Sewell highlighted how the Inflation Reduction...,"Sewell, Terri",Alabama,D,Armed Services|House Administration|Joint Comm...
...,...,...,...,...,...,...,...,...
970,Supporting Seniors,Sánchez and Pocan Introduce Strengthening Soci...,"May 8, 2019","WASHINGTON, DC (May 8, 2019) – U.S. Representa...","Pocan, Mark",Wisconsin,D,Appropriations
971,Supporting Seniors,"Pocan, Kaptur Introduce Legislation to Protect...","February 7, 2019","WASHINGTON, DC (February 7, 2019) – U.S. Repre...","Pocan, Mark",Wisconsin,D,Appropriations
972,Supporting Seniors,Congressional Progressive Caucus: House GOP Vo...,"January 13, 2017","WASHINGTON, D.C. – Congressional Progressive C...","Pocan, Mark",Wisconsin,D,Appropriations
973,Supporting Seniors,Congressional Progressive Caucus: Will House R...,"January 11, 2017","WASHINGTON, D.C. – Congressional Progressive C...","Pocan, Mark",Wisconsin,D,Appropriations


In [64]:
articles_df.to_csv('data/articles.csv')

In [66]:
articles_df['representative_name'].value_counts()

representative_name
Pocan, Mark      537
Sewell, Terri    438
Name: count, dtype: int64

In [67]:
articles_df['issue'].value_counts()


issue
Healthcare                        160
Jobs and the Workforce            139
Education                         120
Economy                           118
Jobs and Workforce Development     88
Voting Rights                      67
National Security and Veterans     62
Equality                           61
Budget                             52
Protecting the Right to Vote       28
Agriculture                        25
Energy and Environment             22
Veterans                           18
Supporting Seniors                 15
Name: count, dtype: int64