-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
executable file
·89 lines (74 loc) · 3.34 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import csv
import sys
def main():
    """Entry point: create data.csv with a header row and scrape into it."""
    fieldnames = ["grantee", "grantee_url", "amount", "fiscal_year",
                  "grant_type", "grant_description"]
    # newline="" is the csv-module convention: it prevents the writer's own
    # "\r\n" row terminators from being doubled on Windows.
    with open("data.csv", "w", newline="") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        go(writer)
def do_grantee(writer, fy_sums, grantee_num, grantee, grantee_url):
    """Scrape one grantee's detail page, writing a CSV row per grant.

    Also accumulates per-fiscal-year dollar totals into
    fy_sums[grantee_num] so the caller can cross-check them against the
    totals shown on the search-results page.

    writer      -- csv.DictWriter for the output file
    fy_sums     -- dict: grantee_num -> {fiscal_year: float total}
    grantee_num -- int id taken from the grantee page URL
    grantee     -- grantee display name (for error messages / CSV)
    grantee_url -- absolute URL of the grantee detail page
    """
    # The detail page lists ALL of a grantee's grants (every fiscal year),
    # so it must be scraped exactly once per grantee even though the
    # grantee appears once per fiscal year in the search results.
    # BUG FIX: the original only guarded the dict initialization and then
    # re-fetched and re-accumulated on every call, which duplicated CSV
    # rows and multiply-counted fy_sums (breaking the caller's check).
    if grantee_num in fy_sums:
        return
    fy_sums[grantee_num] = {}
    response_g = requests.get(grantee_url)
    soup_g = BeautifulSoup(response_g.content, "lxml")
    rows_g = soup_g.find_all("tr")[1:]  # Skip the header row of the table
    for row_g in rows_g:
        cols_g = row_g.find_all("td")
        d = {
            'grantee': grantee,
            'grantee_url': grantee_url,
            'fiscal_year': cols_g[0].text.strip(),
            'amount': cols_g[1].text.strip(),
            'grant_type': cols_g[2].text.strip(),
            'grant_description': cols_g[3].text.strip(),
        }
        writer.writerow(d)
        # Amounts are rendered like "$1,234"; strip the formatting to sum.
        amount_g = d['amount']
        assert amount_g.startswith("$"), amount_g
        amount_g = float(amount_g.replace("$", "").replace(",", ""))
        fy = d['fiscal_year']
        fy_sums[grantee_num][fy] = fy_sums[grantee_num].get(fy, 0.0) + amount_g
    # An empty totals dict means the table had no data rows -- most likely
    # the page failed to download or changed layout; report but continue.
    if not fy_sums[grantee_num]:
        print("Failed to download grantee page:", grantee,
              grantee_num, file=sys.stderr)
def go(writer):
    """Scrape the full search-results table and every grantee page behind it.

    Writes one CSV row per individual grant via `writer`, and verifies that
    the per-grant sums computed from each grantee page agree with the
    per-(grantee, fiscal_year) totals shown on the search-results page.
    Mismatches are reported to stderr but do not abort the scrape.
    """
    # Maps grantee_num (int) -> fiscal_year (str) -> summed amount (float).
    fy_sums = {}
    # items_per_page=All puts the entire results table on a single page.
    response = requests.get("https://www.baumanfoundation.org/grants/search?amount=All&fiscal_year=&name=&items_per_page=All")
    soup = BeautifulSoup(response.content, "lxml")
    rows = soup.find_all("tr")[1:]  # Skip the header row of the table
    for row in rows:
        cols = row.find_all("td")
        grantee = cols[0].text.strip()
        grantee_path = cols[0].find("a")["href"]
        # The grantee's numeric id is the last path component of its URL.
        grantee_num = int(grantee_path.split('/')[-1])
        grantee_url = ("https://www.baumanfoundation.org" +
                       grantee_path)
        fiscal_year = cols[2].text.strip()
        # This is the total for this (grantee, fiscal_year) combination, so
        # it's different from a single grant. We store it to check against
        # our per-grant sums (stored in fy_sums) to check our understanding
        # of the site.
        amount = cols[1].text.strip()
        assert amount.startswith("$"), amount
        amount = float(amount.replace("$", "").replace(",", ""))
        do_grantee(writer, fy_sums, grantee_num, grantee, grantee_url)
        # Cross-check the scraped per-grant sum against the listed total.
        # KeyError means the grantee page yielded no row for this year.
        # (The original had two except clauses with identical bodies;
        # merged into one tuple clause.)
        try:
            assert fy_sums[grantee_num][fiscal_year] == amount, \
                (grantee, fiscal_year, fy_sums[grantee_num][fiscal_year],
                 amount)
        except (AssertionError, KeyError) as e:
            print(e.__class__.__name__, e, grantee, grantee_num,
                  file=sys.stderr)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()