forked from openspending/dpkg-uk25k
-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_index.py
83 lines (74 loc) · 2.86 KB
/
build_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import sys
import sqlaload as sl
from common import *
log = logging.getLogger('build_index')

# CKAN tags that identify spend-transaction datasets; build_index() ORs them
# together to form the search query.
TAGS = ['spend-transactions', '25k', '25000']

# Cache of CKAN group entities keyed by group name. fetch_group() fills it
# lazily so that each group is requested from CKAN only once.
GROUPS = {}
def fetch_group(client, package):
    '''Return the CKAN group entity for the single group of a package.

    The entity is requested through ``client.group_entity_get`` the first
    time a group name is seen and is served from the module-level ``GROUPS``
    cache afterwards.

    :param client: object providing ``group_entity_get(name)``.
    :param package: mapping whose ``'groups'`` entry is a list; its only
        element is used as the group name.
    :returns: the group entity, or an empty dict (after logging a warning)
        when the package does not have exactly one group.
    '''
    groups = package['groups']
    if len(groups) != 1:
        log.warning("Invalid groups: %r", groups)
        return {}
    # Index rather than pop(): reading the name must not empty the caller's
    # package['groups'] list as a side effect.
    group_name = groups[0]
    if group_name not in GROUPS:
        GROUPS[group_name] = client.group_entity_get(group_name)
    return GROUPS[group_name]
def fetch_package(client, package_name, engine, table):
    '''Queries CKAN for a particular dataset and stores metadata for each
    of its resources in the local database.

    :param client: object providing ``package_entity_get(name)`` (and
        whatever ``fetch_group`` needs).
    :param package_name: name of the dataset to look up.
    :param engine: database engine handed to the ``sl`` helpers.
    :param table: table that receives one row per resource, upserted on
        ``resource_id``.
    :returns: None. If the CKAN lookup raises, the exception is logged and
        the dataset is skipped.
    '''
    try:
        pkg = client.package_entity_get(package_name)
    except Exception as e:
        # Log and skip this dataset instead of propagating the failure.
        log.exception(e)
        return
    log.info("Dataset: %s", pkg['name'])
    group = fetch_group(client, pkg)
    for res in pkg['resources']:
        log.info(" > Resource %s: %s", res['id'], res['url'])
        data = {
            'resource_id': res['id'],
            'package_id': pkg['id'],
            'package_name': pkg['name'],
            'package_title': pkg['title'],
            'last_modified': res.get('last_modified'),
            'url': res['url'],
            'publisher_name': group.get('name'),
            'publisher_title': group.get('title'),
            'publisher_type': group.get('type'),
            'format': res['format'],
            'description': res['description'],
        }
        # Rows are keyed by resource id, so the existing row must be looked
        # up with the resource's id and compared against the resource's url
        # (not the package's).
        row = sl.find_one(engine, table, resource_id=res['id'])
        if row and row['url'] != res['url']:
            # url has changed, so force retrieval next time
            data['retrieve_status'] = False
        sl.upsert(engine, table, data, ['resource_id'])
def connect():
    '''Open the database and return an ``(engine, table)`` pair, where
    ``table`` is the ``source`` table obtained through ``sl.get_table``.'''
    db_engine = db_connect()
    return db_engine, sl.get_table(db_engine, 'source')
def build_index(department_filter=None):
    '''Search CKAN for datasets carrying any of the ``TAGS`` and write the
    metadata of their resources to the database.

    ``department_filter`` is an optional comma-separated string of publisher
    names. When given, the tag query is AND-ed with an OR over those
    publishers.'''
    engine, table = connect()
    client = ckan_client()

    q = " OR ".join('+tags:"%s"' % tag for tag in TAGS)
    if department_filter:
        publishers = department_filter.split(',')
        publisher_clause = ' OR '.join(
            'publisher:"%s"' % name for name in publishers)
        q = '(%s) AND (%s)' % (q, publisher_clause)
    log.info('Search q: %r', q)

    # NOTE(review): confirm whether 'limit': 5 is a page size or caps the
    # number of datasets that get indexed.
    found = client.package_search(q, search_options={'limit': 5})
    log.info('Search returned %i dataset results', found['count'])
    for package_name in found['results']:
        fetch_package(client, package_name, engine, table)
if __name__ == '__main__':
    # Command line: at most one optional argument, passed to build_index()
    # as the department filter.
    if len(sys.argv) > 2:
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print('Usage: python %s [<department-name>]' % sys.argv[0])
        sys.exit(1)
    department_filter = sys.argv[1] if len(sys.argv) == 2 else None
    build_index(department_filter)