# Recipe: Text to maps

## Preparation
### Actions
* Review NER Output
* Preprocess data
 * Remove extra whitespace
 * Remove "Post Office Box" references

In [56]:
import csv
import json
import urllib.parse
import urllib.request

# Load the NER output (one address per row) and normalise each address
# before geocoding. The `with` block guarantees the file is closed even if
# a row fails to parse.
addresses=[]
with open("ner_output.csv", newline='') as fp:
    csv_reader = csv.reader(fp)
    next(csv_reader, None) # skip header row (tolerates an empty file)
    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to geocode there.
            continue
        # Exactly one column is expected; unpacking raises ValueError otherwise,
        # which surfaces unquoted commas instead of silently truncating.
        [address] = row
        # Collapse every run of whitespace to a single space and trim the ends
        # (a single replace("  ", " ") leaves doubles behind for 3+ spaces).
        address=" ".join(address.split())
        address=address.replace("Post Office Box ","")
        print(address)
        addresses.append(address)

46 King Street, Sandy Bay
2/50 St Marks Road, Randwick
Dalby Street, Jandowae
C/O H E C, Gowrie Park
26, Littlehampton
Darwin


## Georeferencing
### Actions
* Perform georeferencing using Pelias geocoder

In [57]:
from IPython.display import HTML, display
import tabulate

# Geocode every address with the Pelias search endpoint and collect one
# result row per address. `output[0]` is the header row; later cells append
# further columns to these same rows.
pelias_url='https://geocoder.aurin.org.au/v1/search'
output=[["p_confidence","p_match_type","p_accuracy","p_lon","p_lat"]]
for address in addresses:
    # urlencode escapes spaces, '&', '#', '/', non-ASCII characters, etc., so
    # the address cannot break or alter the query string.
    query=urllib.parse.urlencode({"text": address, "boundary.country": "AUS"})
    url=pelias_url+'?'+query
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        data = json.loads(response.read().decode("utf-8"))
    features=data["features"]
    if not features:
        # Same exception type as indexing an empty list, but names the address.
        raise IndexError("Pelias returned no features for address: %r" % (address,))
    # Only the first feature is used (presumably the best-ranked match -- confirm
    # against the Pelias documentation).
    best=features[0]
    properties=best["properties"]
    coordinates=best["geometry"]["coordinates"]
    output.append([
        properties["confidence"],
        properties.get("match_type", ""),  # not every result carries a match_type
        properties["accuracy"],
        coordinates[0],  # stored under the p_lon column
        coordinates[1],  # stored under the p_lat column
    ])

display(HTML(tabulate.tabulate(output, tablefmt='html')))

0,1,2,3,4
p_confidence,p_match_type,p_accuracy,p_lon,p_lat
1,exact,point,147.327321,-42.894435
0.6,fallback,centroid,151.242096,-33.911304
0.6,fallback,centroid,151.111022,-26.781431
0.74,,point,151.896663,-27.486232
0.672,,point,138.862205,-35.054718
1,exact,centroid,130.842984,-12.460477


### Actions
* Perform georeferencing using the data61 GNAF geocoder

In [58]:
# Geocode every address with the GNAF search service and append its score
# and coordinates to the matching row of `output` (row 0 is the header).
# NOTE: this cell mutates `output` in place, so re-running it without first
# re-running the Pelias cell appends the d_* columns a second time.
url='https://geocoder.aurin.org.au/gnaf-search/search'
output[0].extend(["d_score","d_lon","d_lat"])
for address, row in zip(addresses, output[1:]):
    # json.dumps escapes quotes, backslashes and control characters in the
    # address, which hand-built string concatenation does not.
    post_data=json.dumps({
        "addr": address,
        "numHits": 3,
        "fuzzy": {"maxEdits": 2, "minLength": 5, "prefixLength": 2},
    }).encode('utf-8')
    req = urllib.request.Request(url,data=post_data,headers={'Content-Type':'application/json', 'Accept':'application/json'})
    with urllib.request.urlopen(req) as response:
        data = json.loads(response.read().decode("utf-8"))
    hits=data["hits"]
    if not hits:
        # Same exception type as indexing an empty list, but names the address.
        raise IndexError("GNAF search returned no hits for address: %r" % (address,))
    top_hit=hits[0]
    # The "json" field of a hit is itself a JSON-encoded string.
    hit=json.loads(top_hit["json"])
    row.extend([top_hit["score"], hit["location"]["lon"], hit["location"]["lat"]])
display(HTML(tabulate.tabulate(output, tablefmt='html')))

0,1,2,3,4,5,6,7
p_confidence,p_match_type,p_accuracy,p_lon,p_lat,d_score,d_lon,d_lat
1,exact,point,147.327321,-42.894435,0.6515591144561768,147.32732057,-42.89443454
0.6,fallback,centroid,151.242096,-33.911304,0.5142303705215454,151.24675295,-33.91375726
0.6,fallback,centroid,151.111022,-26.781431,0.5402733087539673,151.10802394,-26.77965686
0.74,,point,151.896663,-27.486232,0.28649017214775085,146.21810781,-41.47355859
0.672,,point,138.862205,-35.054718,0.3255309462547302,138.84656084,-35.05692593
1,exact,centroid,130.842984,-12.460477,0.3615313172340393,151.45618062,-33.37669323


# Reviewing
## Actions
* Review georeferencing result: look for discordant results from the two geocoders
 * Calculate the distance between the first geocoder's result and the second geocoder's result
 * Display on a map

In [59]:
# calculate distance from one to the other
import math
def haversine(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Calculate the great circle distance between two points
    on a sphere (specified in decimal degrees).
    Source: http://gis.stackexchange.com/a/56589/15183

    Parameters:
        lon1, lat1: longitude and latitude of the first point, in degrees.
        lon2, lat2: longitude and latitude of the second point, in degrees.
        radius_km: radius of the sphere; the default of 6367 keeps the
            value this notebook has always used for the Earth.

    Returns:
        The distance along the sphere's surface, in the units of radius_km.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))
    return radius_km * c
# Add a "distance" column: the haversine distance between the Pelias
# coordinates (columns 3, 4) and the GNAF coordinates (columns 6, 7) of each
# result row, rounded to two decimals. Row 0 is the header and only gets the
# column title.
for ii, entry in enumerate(output):
    if ii != 0:
        p_lon = float(entry[3])
        p_lat = float(entry[4])
        d_lon = float(entry[6])
        d_lat = float(entry[7])
        separation = haversine(p_lon, p_lat, d_lon, d_lat)
        entry.append(round(separation, 2))
    else:
        entry.append("distance")
display(HTML(tabulate.tabulate(output, tablefmt='html')))

0,1,2,3,4,5,6,7,8
p_confidence,p_match_type,p_accuracy,p_lon,p_lat,d_score,d_lon,d_lat,distance
1,exact,point,147.327321,-42.894435,0.6515591144561768,147.32732057,-42.89443454,0.0
0.6,fallback,centroid,151.242096,-33.911304,0.5142303705215454,151.24675295,-33.91375726,0.51
0.6,fallback,centroid,151.111022,-26.781431,0.5402733087539673,151.10802394,-26.77965686,0.36
0.74,,point,151.896663,-27.486232,0.28649017214775085,146.21810781,-41.47355859,1638.06
0.672,,point,138.862205,-35.054718,0.3255309462547302,138.84656084,-35.05692593,1.44
1,exact,centroid,130.842984,-12.460477,0.3615313172340393,151.45618062,-33.37669323,3125.68


In [60]:
from ipyleaflet import Map, Marker, CircleMarker, Polygon
# Plot both geocoder results for every address and join each pair, so that
# discordant results show up as long connecting lines.
center = (-28, 140)
m = Map(center=center, zoom=4)

# output[0] is the header row of column names, not a result -- skip it so no
# marker is created from the strings "p_lat"/"p_lon".
for result in output[1:]:
    # Locations are passed latitude first, matching `center` above.
    pelias_location = (result[4], result[3])
    gnaf_location = (result[7], result[6])
    m.add_layer(CircleMarker(location=pelias_location,radius=2,color="red",fill_color="red"))
    m.add_layer(CircleMarker(location=gnaf_location,radius=2,color="blue",fill_color="blue"))
    # A two-point polygon is used to draw the connecting line.
    m.add_layer(Polygon(locations=[pelias_location,gnaf_location],color="green",fill_color="green"))
m

Map(basemap={'url': 'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', 'max_zoom': 19, 'attribution': 'Map …

# Refinement
## Actions
* Drop fine-grained information at the beginning of the address (such as the unit number) and rerun the procedure