/
BeautifulDemo4.py
39 lines (34 loc) · 1.28 KB
/
BeautifulDemo4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Script for scraping websites
# import Python module to deal with websites
import urllib2
# import BeautifulSoup to scrape websites
from bs4 import BeautifulSoup
# these next lines read the website into BeautifulSoup
url = "http://www.house.gov/representatives"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page)
page.close()
# a variable (called menu) grabs congressmen listed by last name
menu = soup.find_all("div", {"id":"byName"})
# variables and aliases -- finds tables for letters of alphabet
# then finds rows for individual members of congress
# then finds columns for info about those members
for tab in menu:
tables = tab.find_all("table")
for table in tables:
row = table.find_all("tr")
for tr in row:
col = tr.find_all("td")
# test for valid text in columns, then extract columns
try:
name = col[0].a.string
district = col[1].string
party = col[2].string
room = col[3].string
phone = col[4].string
str = "\t"
congo = (name, district, party, room, phone)
print str.join(congo)
except:
# print 'error: there wasn't text in all the columns presumably'
pass