/
Day_05_C_Displaying_Census_URLs.py
184 lines (120 loc) · 4.09 KB
/
Day_05_C_Displaying_Census_URLs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
# IPython line magic: set up inline matplotlib plotting; --no-import-all
# skips the star-import of pylab names into the notebook namespace.
%pylab --no-import-all inline
# <codecell>
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series, Index
import pandas as pd
# <codecell>
# check that CENSUS_KEY is defined and set before any Census API URL is built
import settings
# getattr() turns a missing attribute into the same, clearly worded
# AssertionError instead of an unrelated AttributeError.
assert getattr(settings, "CENSUS_KEY", None) is not None, (
    "CENSUS_KEY must be defined (and not None) in the settings module"
)
# <markdowncell>
# The Census documentation has example URLs, but they need your API key to work. In this notebook, we'll use the IPython notebook HTML display mechanism to show those URLs as links with the key already added.
# <codecell>
# http://api.census.gov/data/2010/sf1/geo.html
# <codecell>
from IPython.core.display import HTML
# <codecell>
# Embed the Census API example page (geo.html) in the notebook output.
# <iframe> is not a void element, so it needs an explicit closing tag;
# a self-closing "<iframe .../>" is parsed as an unclosed element.
HTML("<iframe src='http://api.census.gov/data/2010/sf1/geo.html' width='800px'></iframe>")
# <codecell>
%%HTML
<b>hi there</b>
# <codecell>
import urlparse
import urllib
from IPython.core.display import HTML
def add_census_key(url, api_key=settings.CENSUS_KEY):
    """Return *url* with the Census API key set as its ``key`` query parameter.

    Args:
        url: An example Census API URL, with or without a query string.
        api_key: Key to attach; defaults to ``settings.CENSUS_KEY``.

    Returns:
        The rebuilt URL.  Existing query parameters keep their original
        order (including repeated and blank-valued ones), any existing
        ``key`` parameter is replaced, and ``key`` is emitted last.  Query
        values are re-encoded by ``urlencode``, so characters such as
        ``,`` and ``*`` come back percent-encoded.
    """
    # Function-scope imports so the helper works on Python 2 and Python 3.
    try:
        from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
    except ImportError:  # Python 2
        from urllib import urlencode
        from urlparse import parse_qsl, urlparse, urlunparse

    parts = list(urlparse(url))
    # The query string is the 5th element of the parsed tuple (index 4).
    # parse_qsl yields ordered (name, value) pairs, unlike the dict built by
    # parse_qs, so the rebuilt URL is deterministic and nothing is dropped.
    params = [
        (name, value)
        for name, value in parse_qsl(parts[4], keep_blank_values=True)
        if name != "key"
    ]
    params.append(("key", api_key))
    # doseq=True keeps a sequence-valued api_key working;
    # see http://stackoverflow.com/a/10233141/7782 for the meaning of doseq.
    parts[4] = urlencode(params, doseq=True)
    return urlunparse(parts)
def c_url(url, title=None, api_key=settings.CENSUS_KEY):
    """Return an IPython ``HTML`` link to *url* with the Census API key added.

    Args:
        url: Example Census API URL (without a key).
        title: Link text; defaults to the original, key-free *url*.
        api_key: Key to attach; defaults to ``settings.CENSUS_KEY``.

    Returns:
        An ``HTML`` display object wrapping a single ``<a>`` element whose
        ``href`` is *url* with the key added.
    """
    # Function-scope import so the block works on Python 2 and Python 3.
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    url_with_key = add_census_key(url, api_key)
    if title is None:
        title = url
    # Escape both values: the query string contains "&", and a stray quote or
    # "<" would otherwise break out of the attribute or the link text.
    return HTML("""<a href="{url}">{title}</a>""".format(
        url=escape(url_with_key, quote=True),
        title=escape(title, quote=True)))
#add_census_key("http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*")
# Demo: render the state-level example query as a clickable link whose
# href carries the API key (the link text is the original, key-free URL).
c_url("http://api.census.gov/data/2010/sf1?get=NAME,P0010001&for=state:*")
# <headingcell level=1>
# Scraping the examples
# <codecell>
import requests
from lxml.html import parse, fromstring
url = "http://api.census.gov/data/2010/sf1/geo.html"
# Fail fast on an HTTP error instead of parsing an error page, which would
# only show up later (e.g. as an IndexError on rows[0] if it has no table).
response = requests.get(url)
response.raise_for_status()
r = response.content
doc = fromstring(r)
# every <tr> that is a direct child of a <table>
rows = doc.xpath("//table/tr")
# first row is the header
headers = [col.text for col in rows[0].findall('th')]
headers
# next rows are the census URL examples
# <codecell>
# Peek at the first data row (rows[0] is the header row).
row = rows[1]
cols = row.findall('td')
# cols[0]: Summary Level
# A parenthesized single argument prints identically on Python 2 and 3.
print(cols[0].text)
# cols[1]: Description
print(cols[1].text)
# <codecell>
from itertools import islice
from lxml.html import parse
# let's actually now decorate the urls
def decorated_parse_examples(examples, api_key=settings.CENSUS_KEY):
    """Yield copies of *examples* rows with the URL column turned into links.

    Args:
        examples: Iterable of row dicts keyed by the scraped table headers,
            as produced by ``parse_census_examples``; the ``headers[2]``
            entry holds a list of example URL strings.
        api_key: Census API key added to every link target; defaults to
            ``settings.CENSUS_KEY``.

    Yields:
        A shallow copy of each row in which the ``headers[2]`` entry is
        replaced by an HTML string: one ``<a>`` element per URL, joined with
        ``<br/>``.  The link text is the original URL and the ``href`` is the
        same URL with the key added.  Input rows are not modified.
    """
    # Function-scope import so the block works on Python 2 and Python 3.
    try:
        from html import escape
    except ImportError:  # Python 2
        from cgi import escape

    for row in examples:
        new_row = row.copy()
        # NOTE: the column name comes from the module-level ``headers`` list.
        urls_header = headers[2]
        links = [
            """<a href="{url_with_key}">{url}</a>""".format(
                # The URLs are scraped from a web page, so escape them before
                # they are placed in an attribute and in the link text.
                url=escape(url, quote=True),
                # Forward api_key so a caller-supplied key is honored rather
                # than always falling back to the default key.
                url_with_key=escape(add_census_key(url, api_key), quote=True))
            for url in new_row[urls_header]
        ]
        new_row[urls_header] = "<br/>".join(links)
        yield new_row
def parse_urls_col(col):
    """Return every text fragment found inside *col* as a list."""
    # itertext() approach taken from http://stackoverflow.com/a/15074386/7782
    text_fragments = list(col.itertext())
    return text_fragments
def parse_census_examples(url="http://api.census.gov/data/2010/sf1/geo.html"):
    """Yield one dict per example row of the table found at *url*.

    Args:
        url: Location of the HTML page to parse; defaults to the 2010 SF1
            ``geo.html`` examples page.  Anything accepted by
            ``lxml.html.parse`` works.

    Yields:
        A dict keyed by the first three header cells of the table.  The
        first two values are the text of the matching ``<td>`` cells; the
        third is the list of text fragments returned by ``parse_urls_col``.
    """
    doc = parse(url)
    rows = doc.xpath("//table/tr")
    # first row is the header; named column_names so it is not confused with
    # the module-level ``headers`` list
    column_names = [col.text for col in rows[0].findall('th')]
    for row in rows[1:]:
        cols = row.findall('td')
        yield {
            column_names[0]: cols[0].text,
            column_names[1]: cols[1].text,
            column_names[2]: parse_urls_col(cols[2]),
        }
#parsed_examples = list(islice(parse_census_examples(),None))
# NOTE: parse_census_examples() returns a generator, so parsed_examples can
# be iterated only once; use the list(...) form above to keep the rows.
parsed_examples = parse_census_examples()
# <codecell>
# let's redisplay the table with the API key added to every example link
from IPython.display import HTML
from jinja2 import Template
# Jinja2 source for the examples table: one <th> per header, then one <tr>
# per row whose cells are looked up by header name.
URLS_TEMPLATE = """
<table>
<tr>
{% for header in headers %}
<th>{{header}}</th>
{% endfor %}
</tr>
{% for row in rows %}
<tr>
{% for header in headers %}
<td>{{row[header]}}</td>
{% endfor %}
</tr>
{% endfor %}
</table>"""

# Compile the template, then display the rendered table built from the
# decorated (key-bearing) example rows.
template = Template(URLS_TEMPLATE)
HTML(
    template.render(
        rows=decorated_parse_examples(parsed_examples),
        headers=headers
    )
)