Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
tree: f699c84544
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

file 151 lines (137 sloc) 6.032 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
import sys, csv
from urllib import urlopen
from pprint import pprint

from recon import company, public_body
from recon.interactive import interactive, SQLiteMemory
from recon.local import CSVLocalEndpoint
from sqlaload import connect, get_table, distinct, all, update

COUNTRIES_URL = 'iso_3166_2_countries.csv'

def integrate_countries(engine, table):
    fh = urlopen(COUNTRIES_URL)
    uri = lambda r: r['ISO 3166-1 2 Letter Code']
    endpoint = CSVLocalEndpoint(fh, 'Common Name', uri_maker=uri)
    integrate_recon(engine, table, endpoint.reconcile,
                    'country',
                    'country_name', 'country_code',
                    'countries')


def integrate_departments(engine, table):
    def eu_public_body(query):
        return public_body(query, jurisdiction='EU', limit=10)
    integrate_recon(engine, table, eu_public_body,
                    'responsible_department',
                    'department_name', 'department_uri',
                    'eu_bodies')

def integrate_companies(engine, table):
    #def eu_public_body(query):
    # return public_body(query, jurisdiction='EU', limit=10)
    integrate_recon(engine, table, company,
                    'beneficiary',
                    'beneficiary_name', 'beneficiary_uri',
                    'companies', min_score=60)

def integrate_recon(engine, table, qfunc, src_col, dst_name_col, dst_uri_col,
        memory_name, min_score=None):
    memory = SQLiteMemory(engine.raw_connection(), memory_name)
    conn = engine.connect()
    for row in list(distinct(engine, table, src_col)):
        res = interactive(qfunc, row[src_col], memory=memory, min_score=min_score)
        if res is not None:
            print row.get(src_col), " -> ", res.name.encode('utf-8'), res.score
            update(conn, table, row, {dst_name_col: res.name, dst_uri_col: res.uri})

from urllib import quote, urlopen
import json
def integrate_geocode(engine, table):
    BASE = 'http://api.geonames.org/postalCodeSearchJSON?formatted=true&country=%s&postalcode=%s&maxRows=1&username=demo&style=full'
    conn = engine.connect()
    for row in list(distinct(engine, table, 'country_code', 'postcode')):
        if not row['country_code'] or not row['postcode']:
            continue
        url = BASE % (quote(row[0].encode('utf-8')),
                      quote(row[1].encode('utf-8')))
        data = json.load(urlopen(url))
        print url
        print len(data['postalCodes'])
        if len(data['postalCodes']):
            pc = data['postalCodes'][0]
            update(conn, table, row, {
                'admin1_code': pc.get('adminCode1'),
                'admin1_name': pc.get('adminCode1'),
                'admin2_code': pc.get('adminCode2'),
                'admin2_name': pc.get('adminName2'),
                'admin3_code': pc.get('adminCode3'),
                'admin3_name': pc.get('adminName3')
                })

def integrate_nominatim(engine, table):
    BASE = 'http://open.mapquestapi.com/nominatim/v1/search?format=json&q=%s&limit=1&countrycodes=%s'
    CITIES = {}
    def _get(query, country):
        url = BASE % (quote(query.encode('utf-8')),
                      quote(country.encode('utf-8')))
        return json.load(urlopen(url))
    conn = engine.connect()
    for row in list(distinct(engine, table, 'country_code', 'address', 'city',
                             'lng', 'lat')):
        if not row['country_code']: continue
        if row['lng'] and row['lat']: continue
        query = row['address'] + ", " + row['city']
        print query.encode('utf-8')
        data = _get(query, row['country_code'])
        if not len(data):
            city = row['city'].lower().strip()
            if not city in CITIES:
                CITIES[city] = _get(city, row['country_code'])
            data = CITIES[city]
        if len(data):
            loc = data[0]
            criteria = row.copy()
            del criteria['lng']
            del criteria['lat']
            print loc['lon'], loc['lat']
            update(conn, table, criteria,
                {'lng': loc['lon'], 'lat': loc['lat']})

def integrate_nuts(engine, table):
    from ll2nuts import LonLat2NUTS
    ll = LonLat2NUTS(3)
    conn = engine.connect()
    for row in list(distinct(engine, table, 'country_code', 'lng', 'lat')):
        if not row['country_code'] or not row['lng'] or not row['lat']:
            continue
        try:
            print row
            nuts = ll.ll2nuts(float(row['lng']), float(row['lat']),
                              iso=row['country_code'])
            if not nuts.startswith(row['country_code']):
                continue
            print nuts
            update(conn, table, row, {'nuts3': nuts})
        except Exception, e:
            print e

def integrate_nutsnames(conn, table):
    fh = open('NUTS_2006.csv', 'rb')
    nuts1, nuts2, nuts3 = {}, {}, {}
    for row in csv.DictReader(fh):
        d = {'1': nuts1, '2': nuts2, '3': nuts3}.get(row['NUTS_LEVEL'], {})
        d[row['CODE']] = dict([(k, v.decode('utf-8')) for k, v in row.items()])
    conn = engine.connect()
    for row in list(distinct(engine, table, 'nuts3')):
        if not row['nuts3']: continue
        nuts1_ = row['nuts3'][:3]
        nuts2_ = row['nuts3'][:4]
        data = {
            'nuts1': nuts1_, 'nuts1_label': nuts1.get(nuts1_, {}).get('LABEL'),
            'nuts2': nuts2_, 'nuts1_label': nuts2.get(nuts2_, {}).get('LABEL'),
            'nuts3_label': nuts3.get(nuts1_, {}).get('LABEL')
                }
        update(conn, table, row, data)

if __name__ == '__main__':
    assert len(sys.argv)==3, "Usage: %s {cc,dg,corp} [sqlite-db]"
    op = sys.argv[1]
    engine = connect('sqlite:///' + sys.argv[2])
    table = get_table(engine, 'fts')
    ops = {
        'dg': integrate_departments,
        'corp': integrate_companies,
        'cc': integrate_countries,
        'geo': integrate_nominatim,
        'nuts': integrate_nuts,
        'nutsnames': integrate_nutsnames
        }.get(op)(engine, table)

Something went wrong with that request. Please try again.