In [62]:
import pandas as pd
from utils.scraping_utils import get_html

In [63]:
rep_df = pd.read_csv('data\chosen_representatives.csv')
rep_df.tail(78)

Unnamed: 0,state,party,district,name,office_room,phone,committee,page_link
7,California,D,2nd,"Huffman, Jared",2445 RHOB,(202) 225-5161,Natural Resources|Transportation and Infrastru...,https://huffman.house.gov
8,California,R,1st,"LaMalfa, Doug",408 CHOB,(202) 225-3076,Agriculture|Natural Resources|Transportation a...,https://lamalfa.house.gov
9,Colorado,D,1st,"DeGette, Diana",2111 RHOB,(202) 225-4431,Energy and Commerce,https://degette.house.gov
10,Colorado,R,3rd,"Boebert, Lauren",1713 LHOB,(202) 225-4761,Oversight and Accountability|Natural Resources,https://boebert.house.gov
11,Connecticut,D,1st,"Larson, John",1501 LHOB,(202) 225-2265,Ways and Means,https://larson.house.gov/
...,...,...,...,...,...,...,...,...
80,Washington,R,4th,"Newhouse, Dan",504 CHOB,(202) 225-5816,Appropriations|Select Comm on the Strategic Co...,https://newhouse.house.gov
81,West Virginia,R,1st,"Miller, Carol",465 CHOB,(202) 225-3452,Ways and Means,https://miller.house.gov/
82,Wisconsin,D,2nd,"Pocan, Mark",1026 LHOB,(202) 225-2906,Appropriations,https://pocan.house.gov
83,Wisconsin,R,1st,"Steil, Bryan",1526 LHOB,(202) 225-3031,Financial Services|House Administration|Joint ...,https://steil.house.gov


In [64]:
def extract_issues_wrapper(rep_link, extract_issues):
    issues_link = rep_link + 'issues'
    issues_html = get_html(issues_link)
    if issues_html is None:
        return []
    
    issues = extract_issues(issues_html, rep_link)
    return issues

def extract_article_links_wrapper(base_link, issues, extract_article_links):
    article_links = []
    for issue in issues:
        issue_article_links = extract_article_links(issue, base_link)
        if issue_article_links is not None:
            article_links.extend([{'issue': issue['name'], 'link': article_link} for article_link in issue_article_links])
    return article_links

def extract_articles_wrapper(article_links, extract_articles):
    articles = []
    for article_link in article_links:
        print(article_link)
        article = extract_articles(article_link)
        if article is not None:
            articles.append(article)
    return articles

def extract_articles(rep_link, extract_issues, extract_article_links, extract_article):
    if rep_link[-1] != '/':
        rep_link += '/'
    issues = extract_issues_wrapper(rep_link, extract_issues)
    article_links = extract_article_links_wrapper(rep_link, issues, extract_article_links)
    articles = extract_articles_wrapper(article_links, extract_article)
    return articles

In [66]:
def carl_extract_issues(issues_html, base_link):
    issues_list_element = issues_html.find(class_='evo-content')
    issue_elements = issues_list_element.find_all(class_='evo-media-object')
    issues = []
    for issue_element in issue_elements:
        a_el = issue_element.find(class_='media-body').find('a')
        issue = {
            'name': a_el.text.strip(),
            'link': base_link + a_el['href']
        }
        issues.append(issue)
    return issues

def carl_extract_article_links(issue, base_link):
    print(issue)
    articles_page = get_html(issue['link'])
    articles_links = []
    while True:
        articles_container = articles_page.find(class_='evo-view-wrapper')
        article_elements = articles_container.find_all(class_='evo-views-row')

        for article_element in article_elements:
            body = article_element.find(class_='media-body')
            # date = cells[1].text.strip()
            # article_name = cells[3].text.strip()
            article_href = base_link + body.find('a')['href'].strip()
            articles_links.append(article_href)
        
        try:
            next_page_btn = articles_page.find(class_='page__content').find(class_='pagination').find('li', class_='pager__item--next')
        except AttributeError as e:
            next_page_btn = None
        if next_page_btn is None:
            break
        next_page_btn = next_page_btn.find('a')
        
        next_page_link = issue['link'] + next_page_btn['href']
        print(next_page_link)
        articles_page = get_html(next_page_link)

    return articles_links
def carl_extract_article(article_link):
    try:
        article_page = get_html(article_link['link'])
        article_element = article_page.find(class_='evo-content')
        title = article_element.find('h1').span.text.strip()
        date = article_element.find(class_='row').div.text.strip()
        text = article_element.find(class_='evo-press-release__body').get_text().strip()
    except AttributeError as e:
        print('Error on: ', article_link)
        return None
    return {'issue': article_link['issue'], 'title': title, 'date': date, 'text': text}

In [67]:
def radewagen_extract_issues(issues_html, base_link):
    issues_list_element = issues_html.find(class_='evo-content')
    issue_elements = issues_list_element.find_all(class_='evo-media-object')
    issues = []
    for issue_element in issue_elements:
        a_el = issue_element.find(class_='media-body').find('a')
        issue = {
            'name': a_el.text.strip(),
            'link': base_link + a_el['href']
        }
        issues.append(issue)
    return issues

def radewagen_extract_article_links(issue, base_link):
    print(issue)
    articles_page = get_html(issue['link'])
    articles_links = []
    while True:
        articles_container = articles_page.find(class_='evo-view-wrapper')
        article_elements = articles_container.find_all(class_='evo-views-row')

        for article_element in article_elements:
            body = article_element.find(class_='media-body')
            # date = cells[1].text.strip()
            # article_name = cells[3].text.strip()
            article_href = base_link + body.find('a')['href'].strip()
            articles_links.append(article_href)
        
        try:
            next_page_btn = articles_page.find(class_='page__content').find(class_='pagination').find('li', class_='pager__item--next')
        except AttributeError as e:
            next_page_btn = None
        if next_page_btn is None:
            break
        next_page_btn = next_page_btn.find('a')
        
        next_page_link = issue['link'] + next_page_btn['href']
        print(next_page_link)
        articles_page = get_html(next_page_link)

    return articles_links
def radewagen_extract_article(article_link):
    try:
        article_page = get_html(article_link['link'])
        article_element = article_page.find(class_='evo-content')
        title = article_element.find('h1').span.text.strip()
        date = article_element.find(class_='row').div.text.strip()
        text = article_element.find(class_='evo-press-release__body').get_text().strip()
    except AttributeError as e:
        print('Error on: ', article_link)
        return None
    return {'issue': article_link['issue'], 'title': title, 'date': date, 'text': text}

In [68]:
def lamalfa_extract_issues(issues_html, base_link):
    issues_list_element = issues_html.find(class_='evo-content')
    issue_elements = issues_list_element.find_all(class_='evo-media-object')
    issues = []
    for issue_element in issue_elements:
        a_el = issue_element.find(class_='media-body').find('a')
        issue = {
            'name': a_el.text.strip(),
            'link': base_link + a_el['href']
        }
        issues.append(issue)
    return issues

def lamalfa_extract_article_links(issue, base_link):
    print(issue)
    articles_page = get_html(issue['link'])
    articles_links = []
    while True:
        articles_container = articles_page.find(class_='evo-view-wrapper')
        article_elements = articles_container.find_all(class_='evo-views-row')

        for article_element in article_elements:
            body = article_element.find(class_='media-body')
            # date = cells[1].text.strip()
            # article_name = cells[3].text.strip()
            article_href = base_link + body.find('a')['href'].strip()
            articles_links.append(article_href)
        
        try:
            next_page_btn = articles_page.find(class_='page__content').find(class_='pagination').find('li', class_='pager__item--next')
        except AttributeError as e:
            next_page_btn = None
        if next_page_btn is None:
            break
        next_page_btn = next_page_btn.find('a')
        
        next_page_link = issue['link'] + next_page_btn['href']
        print(next_page_link)
        articles_page = get_html(next_page_link)

    return articles_links
def lamalfa_extract_article(article_link):
    try:
        article_page = get_html(article_link['link'])
        article_element = article_page.find(class_='evo-content')
        title = article_element.find('h1').span.text.strip()
        date = article_element.find(class_='row').div.text.strip()
        text = article_element.find(class_='evo-press-release__body').get_text().strip()
    except AttributeError as e:
        print('Error on: ', article_link)
        return None
    return {'issue': article_link['issue'], 'title': title, 'date': date, 'text': text}

In [69]:
from representatives.collection import extraction_functions
extraction_functions['https://carl.house.gov'] = {
            'extract_issues': carl_extract_issues,
            'extract_article_links': carl_extract_article_links,
            'extract_article': carl_extract_article
        }
extraction_functions['https://radewagen.house.gov'] = {
            'extract_issues': radewagen_extract_issues,
            'extract_article_links': radewagen_extract_article_links,
            'extract_article': radewagen_extract_article
        }
extraction_functions['https://lamalfa.house.gov'] = {
            'extract_issues': lamalfa_extract_issues,
            'extract_article_links': lamalfa_extract_article_links,
            'extract_article': lamalfa_extract_article
        }

'https://lamalfa.house.gov'

In [70]:
articles = []
skip_list = ['https://pocan.house.gov', 'https://sewell.house.gov/', 'https://carl.house.gov', 'https://radewagen.house.gov']
for name, state, party, committee, link in zip(rep_df['name'], rep_df['state'], rep_df['party'], rep_df['committee'], rep_df['page_link']):
    if link in extraction_functions and not (link in skip_list):
        page_functions = extraction_functions[link]
        print(link)
        rep_articles = extract_articles(link, page_functions['extract_issues'], page_functions['extract_article_links'], page_functions['extract_article'])
        for rep_article in rep_articles:
            rep_article['representative_name'] = name
            rep_article['representative_state'] = state
            rep_article['representative_party'] = party
            rep_article['representative_committee'] = committee
        articles.extend(rep_articles)

https://lamalfa.house.gov
{'name': 'Agriculture', 'link': 'https://lamalfa.house.gov//issues/agriculture'}
https://lamalfa.house.gov//issues/agriculture?page=1
https://lamalfa.house.gov//issues/agriculture?page=2
https://lamalfa.house.gov//issues/agriculture?page=3
https://lamalfa.house.gov//issues/agriculture?page=4
https://lamalfa.house.gov//issues/agriculture?page=5
{'name': 'Congressional Issues', 'link': 'https://lamalfa.house.gov//issues/congressional-issues'}
https://lamalfa.house.gov//issues/congressional-issues?page=1
https://lamalfa.house.gov//issues/congressional-issues?page=2
https://lamalfa.house.gov//issues/congressional-issues?page=3
https://lamalfa.house.gov//issues/congressional-issues?page=4
https://lamalfa.house.gov//issues/congressional-issues?page=5
https://lamalfa.house.gov//issues/congressional-issues?page=6
https://lamalfa.house.gov//issues/congressional-issues?page=7
https://lamalfa.house.gov//issues/congressional-issues?page=8
https://lamalfa.house.gov//issues

In [75]:
articles_df = pd.DataFrame.from_dict(articles)
articles_df

Unnamed: 0,issue,title,date,text,representative_name,representative_state,representative_party,representative_committee
0,Agriculture,"LaMalfa, Thompson Introduce Pacific Flyway Hab...","October 8, 2024","Washington, D.C.— Congressman Doug LaMalfa (R-...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
1,Agriculture,"Reps. LaMalfa, Carbajal, Miller, and Spanberge...","February 22, 2024","(Washington, D.C.) – Today, Representatives D...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
2,Agriculture,"LaMalfa, Spanberger Introduce Bipartisan Bill ...","December 1, 2023","(Washington, D.C.) – Today, Congressman Doug L...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
3,Agriculture,"LaMalfa, Thompson, Feinstein, Padilla, Introdu...","August 2, 2023","(Washington, D.C.) – Congressmen Doug LaMalfa ...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
4,Agriculture,"Congressmen LaMalfa, Garamendi, reintroduce th...","July 14, 2023","(Washington, D.C.) – Congressmen Doug LaMalfa ...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
...,...,...,...,...,...,...,...,...
1021,Water,Reps. LaMalfa & Garamendi Announce Bill to Bui...,"March 19, 2014","Maxwell, CA – Congressmen Doug LaMalfa (R-CA-0...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
1022,Water,LaMalfa Comments on Senate Drought Relief Bill,"February 11, 2014","For Immediate ReleaseFebruary 11, 2014Washingt...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
1023,Water,Rep. LaMalfa Urges Protection of North State’s...,"February 5, 2014","For Immediate ReleaseFebruary 5, 2014Washingto...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...
1024,Water,Rep. LaMalfa Supports Water Resources Reforms;...,"October 24, 2013","Washington, DC – Rep. Doug LaMalfa (R-CA) toda...","LaMalfa, Doug",California,R,Agriculture|Natural Resources|Transportation a...


In [72]:
# articles_df.to_csv('data/articles.csv')

In [73]:
articles_df['representative_name'].value_counts()

representative_name
LaMalfa, Doug    1026
Name: count, dtype: int64

In [74]:
articles_df['issue'].value_counts()


issue
Congressional Issues              117
Federal Spending and Borrowing    107
Water                              93
Defense and National Security      88
Forestry                           79
Economy and Jobs                   79
Health Care                        78
Infrastructure                     72
Agriculture                        56
Foreign Affairs                    48
Transportation                     45
Veterans Issues                    44
Tax Reform                         43
Energy                             34
Education                          21
Financial Services                 13
Social Security and Medicare        9
Name: count, dtype: int64