# Scraping one page per row

Let's say we're interested in our members of Congress, because who isn't? Read in `congress.csv`.

In [1]:
import re
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait



In [2]:
# Load the members of Congress; each row carries a display name and a URL slug.
df = pd.read_csv("congress.csv")
# Preview the first ten rows.
df.head(10)

Unnamed: 0,name,slug
0,"Senator Abdnor, James",james-abdnor/A000009
1,"Representative Abercrombie, Neil",neil-abercrombie/A000014
2,"Senator Abourezk, James",james-abourezk/A000017
3,"Representative Abraham, Ralph Lee",ralph-abraham/A000374
4,"Senator Abraham, Spencer",spencer-abraham/A000355
5,"Representative Abzug, Bella S.",bella-abzug/A000018
6,"Representative Acevedo-Vila, Anibal",anibal-acevedo-vila/A000359
7,"Representative Ackerman, Gary L.",gary-ackerman/A000022
8,"Representative Adams, Alma S.",alma-adams/A000370
9,"Senator Adams, Brock",brockman-adams/A000031


# Let's scrape one

The `slug` is the part of the URL that's particular to that member of Congress. So `/james-abdnor/A000009` really means `https://www.congress.gov/member/james-abdnor/A000009`.

Scrape his name, birthday, party, whether he's currently in Congress, and his bill count (don't worry if the bill count is dirty, you can clean it up later).

In [3]:
# Start the Chrome browser session that the scraping cells below drive.
driver = webdriver.Chrome()

In [4]:
# First try at scraping his information. This goes from the member page to the bio
# page and then to the bio's XML view to get more precise info (e.g. a full birth
# date). It was replaced by the simpler cell below because the bios were not
# standardized enough to scrape.

url = "https://www.congress.gov/member/james-abdnor/A000009"
driver.get(url)

# Bill count information. Capture every digit and comma after "of" so that counts
# of 10,000 or more are not cut off after the first two digits.
bill_count_block = driver.find_element(By.CLASS_NAME, "results-number").text.strip()
bill_count = re.findall(r'of ([\d,]+)', bill_count_block)[0]

# Navigating to the member bio page, to get to the XML page.
# After each click, switch to the last window handle to follow the link.
member_bio = driver.find_element(By.CLASS_NAME, "member_bio_link")
member_bio.click()
driver.switch_to.window(driver.window_handles[-1])

# Using the XML page for most of the information and most precise birthday.
# NOTE(review): this absolute XPath breaks on any change to the bio page layout.
xml_page = driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div/div[2]/div/div/div[2]/div/div/div[1]/div/div[2]/a")
xml_page.click()
driver.switch_to.window(driver.window_handles[-1])
xml_page_text = driver.find_element(By.CLASS_NAME, "pretty-print").text.strip()
first_name = re.findall(r'<firstnames>(.+)</firstnames>', xml_page_text)[0]
last_name = re.findall(r'<lastname>(.+)</lastname>', xml_page_text)[0]
party = re.findall(r'<term-party>(.+)</term-party>', xml_page_text)[0]
birth_date = re.findall(r'born in .+, .+, .+, (\w+ \d+, \d\d\d\d);', xml_page_text)[0]

# `.*?` (not `.+?`) so an open-ended term can actually produce the empty string
# that the check below treats as "still serving".
# NOTE(review): [0] reads the first <time-served> element only - confirm that is
# the right one when a member has several.
end_time_served = re.findall(r'<time-served>\d\d\d\d-(.*?)</time-served>', xml_page_text)[0]
current = "current" if end_time_served == '' else "former"

print(first_name, "||", last_name, "||", birth_date, "||", party, "||", current, "||", bill_count)

# Closing the two extra windows, then returning to the original one
driver.close()
driver.switch_to.window(driver.window_handles[-1])
driver.close()
driver.switch_to.window(driver.window_handles[0])

James || ABDNOR || February 13, 1923 || Republican || former || 1,949


In [5]:
# Changed it to be a bit simpler -- the XML was not working with everyone and was
# a lot more page loading. Everything here is read from the member page itself.

url = "https://www.congress.gov/member/james-abdnor/A000009"
driver.get(url)

info = driver.find_element(By.TAG_NAME, "h1").text.strip()

# Only the birth year is anchored, so a heading without a death year still matches
# (same pattern as the scrape_page function further down).
birth_year = re.findall(r'\((\d\d\d\d)', info)[0]
# Optional title, then the name up to the opening parenthesis. Square brackets would
# build a character class (any ONE of the listed letters), not a choice of words.
name = re.findall(r'^(?:(?:Representative|Senator|Delegate|Resident Commissioner) )?(.+) \(', info)[0]

overview = driver.find_element(By.CLASS_NAME, "overview").text.strip()
party = re.findall(r'Party (\w+)', overview)[0]

# The heading text contains "Present" for this check to report a sitting member.
current = "current" if "Present" in info else "not current"

# Capture every digit and comma after "of" so counts of 10,000+ are not truncated.
bill_count_block = driver.find_element(By.CLASS_NAME, "results-number").text.strip()
bill_count = re.findall(r'of ([\d,]+)', bill_count_block)[0]

print(name)
print(birth_year)
print(party)
print(current)
print(bill_count)

James Abdnor
1923
Republican
not current
1,949


# Build a function

Write a function called `scrape_page` that makes a URL out of the `slug`, like we're going to use `.apply`.

In [6]:
def scrape_page(slug):
    """Return the congress.gov member URL for the given slug."""
    return 'https://www.congress.gov/member/{}'.format(slug)

In [7]:
# Check that the slug is expanded into the full member URL.
scrape_page("james-abdnor/A000009")

'https://www.congress.gov/member/james-abdnor/A000009'

# Do the scraping

Rewrite `scrape_page` to actually scrape the URL. You can use your scraping code from up above. Start by testing with just one row (I put a sample call below) and then expand to your whole dataframe.

Save the results as `scraped_df`.

* **Hint:** Be sure to use `return`!
* **Hint:** Make sure you return a `pd.Series`

In [8]:
def _parse_member_text(heading, overview, results_text):
    """Pull the member fields out of text already read from a member page.

    Parameters
    ----------
    heading : str
        Text of the page's <h1> element.
    overview : str
        Text of the element with class "overview".
    results_text : str
        Text of the element with class "results-number".

    Returns
    -------
    dict
        Keys 'name', 'birth_year', 'party', 'current', 'bill_count' (all strings).

    Raises
    ------
    IndexError
        If any of the expected patterns is missing from its text.
    """
    # Optional title, then the name up to the opening parenthesis. The previous
    # `[Representative/Senator]` was a character class (any ONE of those letters),
    # which only produced the right name by accident.
    name = re.findall(
        r'^(?:(?:Representative|Senator|Delegate|Resident Commissioner) )?(.+) \(',
        heading,
    )[0]
    birth_year = re.findall(r'\((\d\d\d\d)', heading)[0]
    party = re.findall(r'Party (\w+)', overview)[0]
    current = "current" if "Present" in heading else "not current"
    # Take every digit and comma after "of"; the old fixed-width pattern cut
    # counts of 10,000 or more down to their first two digits.
    bill_count = re.findall(r'of ([\d,]+)', results_text)[0]

    return {'name': name,
            'birth_year': birth_year,
            'party': party,
            'current': current,
            'bill_count': bill_count}


def scrape_page(df):
    """Scrape one member page and return its fields as a dict.

    Despite the parameter name, `df` is a single row (or any mapping) - only
    `df['slug']` is read - so this can be used with `DataFrame.apply(..., axis=1)`.
    Uses the notebook-level `driver` browser session.

    Returns
    -------
    dict
        Keys 'name', 'birth_year', 'party', 'current', 'bill_count', 'slug',
        in that order. The bill count keeps its thousands separators.

    Raises
    ------
    IndexError
        If an expected pattern is not found in the page text.
    """
    time.sleep(.25)  # small delay between page loads
    slug = df['slug']

    url = f'https://www.congress.gov/member/{slug}'
    driver.get(url)

    heading = driver.find_element(By.TAG_NAME, "h1").text.strip()
    overview = driver.find_element(By.CLASS_NAME, "overview").text.strip()
    results_text = driver.find_element(By.CLASS_NAME, "results-number").text.strip()

    member = _parse_member_text(heading, overview, results_text)
    member['slug'] = slug  # added last so the column order stays unchanged
    return member

In [9]:
# Test with this: a plain dict works as a stand-in row because only ['slug'] is read
scrape_page({'slug': 'neil-abercrombie/A000014'})

{'name': 'Neil Abercrombie',
 'birth_year': '1938',
 'party': 'Democratic',
 'current': 'not current',
 'bill_count': '4,472',
 'slug': 'neil-abercrombie/A000014'}

In [10]:
# Testing with the first 5 rows; axis=1 hands each row to scrape_page
scraped_test = df.head().apply(scrape_page, axis=1)
scraped_test

0    {'name': 'James Abdnor', 'birth_year': '1923',...
1    {'name': 'Neil Abercrombie', 'birth_year': '19...
2    {'name': 'James Abourezk', 'birth_year': '1931...
3    {'name': 'Ralph Lee Abraham', 'birth_year': '1...
4    {'name': 'Spencer Abraham', 'birth_year': '195...
dtype: object

In [11]:
# Converting the Series of dicts to a plain list, which pd.DataFrame turns into one row per dict
scraped_test = list(scraped_test)
scraped_test

[{'name': 'James Abdnor',
  'birth_year': '1923',
  'party': 'Republican',
  'current': 'not current',
  'bill_count': '1,949',
  'slug': 'james-abdnor/A000009'},
 {'name': 'Neil Abercrombie',
  'birth_year': '1938',
  'party': 'Democratic',
  'current': 'not current',
  'bill_count': '4,472',
  'slug': 'neil-abercrombie/A000014'},
 {'name': 'James Abourezk',
  'birth_year': '1931',
  'party': 'Democratic',
  'current': 'not current',
  'bill_count': '875',
  'slug': 'james-abourezk/A000017'},
 {'name': 'Ralph Lee Abraham',
  'birth_year': '1954',
  'party': 'Republican',
  'current': 'current',
  'bill_count': '736',
  'slug': 'ralph-abraham/A000374'},
 {'name': 'Spencer Abraham',
  'birth_year': '1952',
  'party': 'Republican',
  'current': 'not current',
  'bill_count': '1,227',
  'slug': 'spencer-abraham/A000355'}]

In [12]:
# Build a DataFrame from the list of dicts (the dict keys become the columns)
pd.DataFrame(scraped_test)

Unnamed: 0,name,birth_year,party,current,bill_count,slug
0,James Abdnor,1923,Republican,not current,1949,james-abdnor/A000009
1,Neil Abercrombie,1938,Democratic,not current,4472,neil-abercrombie/A000014
2,James Abourezk,1931,Democratic,not current,875,james-abourezk/A000017
3,Ralph Lee Abraham,1954,Republican,current,736,ralph-abraham/A000374
4,Spencer Abraham,1952,Republican,not current,1227,spencer-abraham/A000355


In [13]:
### Doing the whole dataframe

In [19]:
# Running it here: one page load per row, so the full run takes a while.
# The assignment must stay active; with it commented out, `scraped` is undefined
# on a fresh kernel and this cell (and everything after it) fails.
scraped = df.apply(scrape_page, axis=1)
scraped




0       {'name': 'James Abdnor', 'birth_year': '1923',...
1       {'name': 'Neil Abercrombie', 'birth_year': '19...
2       {'name': 'James Abourezk', 'birth_year': '1931...
3       {'name': 'Ralph Lee Abraham', 'birth_year': '1...
4       {'name': 'Spencer Abraham', 'birth_year': '195...
                              ...                        
2343    {'name': 'Ryan K. Zinke', 'birth_year': '1961'...
2344    {'name': 'Roger H. Zion', 'birth_year': '1921'...
2345    {'name': 'Edward Zorinsky', 'birth_year': '192...
2346    {'name': 'Edwin V. W. Zschau', 'birth_year': '...
2347    {'name': 'John M. Zwach', 'birth_year': '1907'...
Length: 2348, dtype: object

In [20]:
# Expand the Series of per-member dicts into a DataFrame, one column per dict key
df_scraped = pd.DataFrame(list(scraped))
df_scraped

Unnamed: 0,name,birth_year,party,current,bill_count,slug
0,James Abdnor,1923,Republican,not current,1949,james-abdnor/A000009
1,Neil Abercrombie,1938,Democratic,not current,4472,neil-abercrombie/A000014
2,James Abourezk,1931,Democratic,not current,875,james-abourezk/A000017
3,Ralph Lee Abraham,1954,Republican,current,736,ralph-abraham/A000374
4,Spencer Abraham,1952,Republican,not current,1227,spencer-abraham/A000355
...,...,...,...,...,...,...
2343,Ryan K. Zinke,1961,Republican,not current,364,ryan-zinke/Z000018
2344,Roger H. Zion,1921,Republican,not current,60,roger-zion/Z000010
2345,Edward Zorinsky,1928,Democratic,not current,1543,edward-zorinsky/Z000013
2346,Edwin V. W. Zschau,1940,Republican,not current,303,edwin-zschau/Z000014


## Join with your original dataframe

Join your new data with your original data, adding the `_scraped` suffix on the new columns. You can use either `.join` or `.merge`, but be sure to read the docs to know the difference!

In [25]:
# Index-aligned join of the original rows with the scraped columns (suffixed
# `_scraped`); the scraped copy of the slug is dropped since `slug` is already there.
df_final = (
    df
    .join(df_scraped.add_suffix('_scraped'))
    .drop(columns=['slug_scraped'])
)
df_final

Unnamed: 0,name,slug,name_scraped,birth_year_scraped,party_scraped,current_scraped,bill_count_scraped
0,"Senator Abdnor, James",james-abdnor/A000009,James Abdnor,1923,Republican,not current,1949
1,"Representative Abercrombie, Neil",neil-abercrombie/A000014,Neil Abercrombie,1938,Democratic,not current,4472
2,"Senator Abourezk, James",james-abourezk/A000017,James Abourezk,1931,Democratic,not current,875
3,"Representative Abraham, Ralph Lee",ralph-abraham/A000374,Ralph Lee Abraham,1954,Republican,current,736
4,"Senator Abraham, Spencer",spencer-abraham/A000355,Spencer Abraham,1952,Republican,not current,1227
...,...,...,...,...,...,...,...
2343,"Representative Zinke, Ryan K.",ryan-zinke/Z000018,Ryan K. Zinke,1961,Republican,not current,364
2344,"Representative Zion, Roger H.",roger-zion/Z000010,Roger H. Zion,1921,Republican,not current,60
2345,"Senator Zorinsky, Edward",edward-zorinsky/Z000013,Edward Zorinsky,1928,Democratic,not current,1543
2346,"Representative Zschau, Edwin V. W.",edwin-zschau/Z000014,Edwin V. W. Zschau,1940,Republican,not current,303


## Save it

Save your combined results to `congress-plus-scraped.csv`.

In [26]:
# index=False keeps the row index out of the saved file
df_final.to_csv("congress-plus-scraped.csv", index=False)