In [1]:
import argparse
import os
import sys
import glob
import codecs
import unicodecsv
import logging
from itertools import combinations 
from ast import literal_eval
from collections import defaultdict, namedtuple
import json
import requests
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Cap applied to the batch size that resolve_to_location reports in its debug log.
NOMINATIM_LIMIT = 2500

# Event identifiers, grouped by collection ("en" / "ar" presumably denote the
# tweet language -- TODO confirm).
en_events = [
    "california_wildfires_2018",
    "canada_wildfires_2016",
    "cyclone_idai_2019",
    "ecuador_earthquake_2016",
    "greece_wildfires_2018",
    "hurricane_dorian_2019",
    "hurricane_florence_2018",
    "hurricane_harvey_2017",
    "hurricane_irma_2017",
    "hurricane_maria_2017",
    "hurricane_matthew_2016",
    "italy_earthquake_aug_2016",
    "kaikoura_earthquake_2016",
    "kerala_floods_2018",
    "maryland_floods_2018",
    "midwestern_us_floods_2019",
    "pakistan_earthquake_2019",
    "puebla_mexico_earthquake_2017",
    "srilanka_floods_2017",
]

ar_events = [
    "beirut_explosion_2020",
    "cairo_bombing_2019",
    "covid_2019",
    "dragon_storms_2020",
    "hafr_albatin_floods_2019",
    "jordan_floods_2018",
    "kuwait_floods_2018",
]

# Names of the dataset splits.
parts = ["train", "dev", "test"]

# Long location-type name -> short type code.
types_dict = {
    "Island": "ISL",
    "State": "STAT",
    "Continent": "CONT",
    "City/town": "CITY",
    "Country": "CTRY",
    "County": "CNTY",
    "Neighborhood": "NBHD",
    "Road/street": "ST",
    "District": "DIST",
    "Other locations": "OTHR",
    "Natural Point-of-Interest": "NPOI",
    "Human-made Point-of-Interest": "HPOI",
}
types = [code for code in types_dict.values()]

# Evaluation granularity -> gold location types that are scored at that
# granularity (geoloceval keeps a mention only if its type is in this list).
TGs = {
    "country": [
        "island", "state", "city", "country", "county", "neighborhood",
        "street", "district", "other locations",
        "natural point-of-interest", "human-made point-of-interest",
    ],
    "state": [
        "island", "state", "city", "county", "neighborhood",
        "street", "district", "other locations",
        "natural point-of-interest", "human-made point-of-interest",
    ],
    "county": [
        "island", "city", "county", "neighborhood",
        "street", "district", "other locations",
        "natural point-of-interest", "human-made point-of-interest",
    ],
    "city": [
        "city", "neighborhood", "street", "other locations",
        "natural point-of-interest", "human-made point-of-interest",
    ],
    "district": [
        "neighborhood", "street", "district", "other locations",
        "natural point-of-interest", "human-made point-of-interest",
    ],
    # Disabled extras: "street", "natural point-of-interest", "human-made point-of-interest"
    "neighborhood": ["neighborhood"],
    "street": [
        "street",
        "natural point-of-interest", "human-made point-of-interest",
    ],
    "point-of-interest": [
        "natural point-of-interest", "human-made point-of-interest",
    ],
    # Disabled entry:
    # "other": ["island", "continent", "other locations"],
}


In [3]:
from geopy.geocoders import Nominatim
def geocode(coords):
    """Reverse-geocode (latitude, longitude) pairs through the Nominatim HTTP API.

    Args:
        coords: iterable of pairs whose first item is the latitude and whose
            second item is the longitude.

    Returns:
        A list with exactly one entry per input pair, in input order. Each
        entry is the decoded JSON reply for that pair, or an empty dict when
        the request failed or the reply was not valid JSON. Keeping one entry
        per input lets callers zip the result against their coordinates.
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    # Identify the client explicitly instead of sending the library default.
    headers = {"User-Agent": "LMDGeocoder"}
    # NOTE(review): the public Nominatim instance publishes a usage policy
    # (request rate, identification); no throttling is applied here -- confirm
    # this is acceptable for the batch sizes used.
    locs = []
    for coord in coords:
        params = {
            "format": "jsonv2",
            "extratags": 1,
            "addressdetails": 1,
            "lat": coord[0],
            "lon": coord[1],
        }
        try:
            response = requests.get(url, params=params, headers=headers, timeout=30)
            response.raise_for_status()
            locs.append(response.json())
        except (requests.RequestException, ValueError) as err:
            # One bad coordinate must not discard the rest of the batch.
            logging.warning('Reverse geocoding failed for {}: {}'.format(coord, err))
            locs.append({})
    return locs

In [4]:
def _first_address_value(address, keys):
    """Return the first value stored in `address` under one of `keys` (tried in order), else 'unresolved'."""
    for key in keys:
        value = address.get(key, 'unresolved')
        if value is not None and value != 'unresolved':
            return value
    return 'unresolved'


def resolve_to_location(coords_dict, online, cached_coords=None, multiprocessing=False):
    """Map coordinates to country / state / city / county names.

    Args:
        coords_dict: ``{doc_key: {system_name: [(latitude, longitude), ...]}}``.
        online: when truthy, coordinates missing from the cache are sent to
            ``geocode`` and successful replies are added to the cache.
        cached_coords: dict ``{(latitude, longitude): location}`` where a
            location carries its fields under the ``'address'`` key. It is
            updated in place; a fresh dict is used when omitted.
        multiprocessing: accepted for interface compatibility; not used.

    Returns:
        ``{doc_key: {system_name: [entry, ...]}}`` with one entry per input
        coordinate, each a dict with keys ``latitude``, ``longitude``,
        ``country``, ``state``, ``city`` and ``county``. Names are lower-cased;
        a field that cannot be determined (including every field of a
        coordinate that has no cached location) is the string 'unresolved'.
    """
    if cached_coords is None:
        cached_coords = {}

    y_resolved_dict = {}
    for k in coords_dict:
        for system in coords_dict[k]:
            coords = coords_dict[k][system]
            unresolved = set(coords).difference(cached_coords)

            if unresolved and online:
                unresolved = list(unresolved)
                size = min(len(unresolved), NOMINATIM_LIMIT)

                locations = list(geocode(unresolved))
                # Only replies that actually carry address data are worth caching.
                resolved = [(coord, loc) for coord, loc in zip(unresolved, locations)
                            if 'address' in loc or 'address_components' in loc]
                cached_coords.update(resolved)
                logging.debug('Number of resolved locations {}/{}'.format(len(resolved), size))
                logging.debug('Number of returned locations {}'.format(len(locations)))
            elif not online:
                logging.info('Offline mode')
                logging.info('Number of cached locations is {}, unresolved is {}'.format(len(cached_coords), len(unresolved)))

            y_resolved = []
            for latitude, longitude in coords:
                location = cached_coords.get((latitude, longitude), {})
                # A coordinate without a usable cache entry resolves to
                # 'unresolved' everywhere instead of raising KeyError.
                address = location.get('address') or {}

                # District, neighborhood, street and point-of-interest values
                # were derived here in the past but never emitted (the lookup
                # for 'subrub' was a typo of 'suburb'); only the four fields
                # below are part of the result.
                y_resolved.append({
                    "latitude": latitude,
                    "longitude": longitude,
                    "country": _first_address_value(address, ('country',)).lower(),
                    "state": _first_address_value(address, ('state', 'province')).lower(),
                    "city": _first_address_value(address, ('city', 'village')).lower(),
                    "county": _first_address_value(address, ('county',)).lower(),
                })

            y_resolved_dict.setdefault(k, {})[system] = y_resolved

    return y_resolved_dict


In [5]:
def geoloceval(docs):
    """Score every system against the gold labels with MRR@1/3/5 per granularity.

    Args:
        docs: ``{doc_key: {'gold': [gold_entry], system_name: [candidate, ...]}}``.
            ``gold_entry`` is a dict holding a ``'type'`` plus one label per
            granularity; each candidate is a dict with the same granularity
            keys (an empty dict or a ``None`` label counts as 'unresolved').
            Every document is expected to contain every system, so that the
            per-system lists line up with the gold list.

    Returns:
        A list of ``Results(System, Gran, MRR1, MRR3, MRR5)`` named tuples, one
        per (granularity, system) pair that was evaluated. The same figures
        are printed as tab-separated rows.
    """
    systems = defaultdict(list)
    for per_doc in docs.values():
        for name, ranked in per_doc.items():
            systems[name].append(ranked)

    target = systems.pop('gold')

    # Sorted so the report order is the same on every run (set order is not).
    granularities = sorted(set(target[0][0].keys()).difference(set(['tweet_id', 'latitude', 'longitude', 'type'])))
    print("granularities")
    print(granularities)
    results = []
    Result = namedtuple('Results', ['System', 'Gran', 'MRR1', 'MRR3', 'MRR5'])

    print("System\tGran\tMRR@1\tMRR@3\tMRR@5")
    # Evaluation per granularity
    for g in granularities:

        logging.info('Discrete Evaluation at the level of {} **************************************************'.format(g.upper()))
        print('{}'.format(g.upper()), end='\t')
        # Gold labels at this granularity, restricted to the mention types
        # that are meaningful at this granularity.
        l_target_all = [d[0][g] for d in target]
        l_type_all = [d[0]['type'] for d in target]
        tg = TGs[g]
        l_target = [d for i, d in enumerate(l_target_all) if l_type_all[i] in tg]

        lt_unresolved = len([p for p in l_target if p == 'unresolved'])

        if len(l_target) > 0:
            logging.info('Number of originally unresolved locations: {} ({:.3%})'.format(lt_unresolved, (float(lt_unresolved)/len(l_target))))
            print('Number of originally unresolved locations: {} ({:.3%})'.format(lt_unresolved, (float(lt_unresolved)/len(l_target))))
        else:
            logging.info('No sufficient gold LMs for evaluation at this granularity')
            print('No sufficient gold LMs for evaluation at this granularity')
            continue

        for s in systems:
            # Ranked labels per mention; missing or null labels become
            # 'unresolved' *before* scoring so the scorer never sees None.
            l_pred_all_ranked = [
                [d[g] if len(d) > 0 and d[g] is not None else 'unresolved' for d in ranked]
                for ranked in systems[s]
            ]
            l_pred_ranked = [ranked for i, ranked in enumerate(l_pred_all_ranked) if l_type_all[i] in tg]

            mrr1 = calculate_mrr(l_target, l_pred_ranked, 1)
            mrr3 = calculate_mrr(l_target, l_pred_ranked, 3)
            mrr5 = calculate_mrr(l_target, l_pred_ranked, 5)

            # Top-ranked label per mention, used only for the unresolved statistic.
            l_pred = [ranked[0] if len(ranked) > 0 else 'unresolved' for ranked in l_pred_ranked]

            lp_unresolved = len([p for p in l_pred if p == 'unresolved'])
            lp_ratio = float(lp_unresolved)/len(l_pred) if l_pred else 0.0
            logging.info('Number of unresolved locations for {}: {} ({:.3%})'.format(s, lp_unresolved, lp_ratio))

            results.append(Result._make((s, g, mrr1, mrr3, mrr5,)))

            print("{}\t{}\t{:.4f}\t{:.4f}\t{:.4f}".format(s, g, mrr1, mrr3, mrr5))
    return results



In [6]:
#TODO: match using the toponym id not toponym name
def calculate_mrr(gold, ranked_toponyms, cutoff):
    """Mean reciprocal rank of the gold labels within the top `cutoff` candidates.

    Args:
        gold: list of gold label strings, one per mention.
        ranked_toponyms: list parallel to `gold`; item ``i`` is the ranked list
            of candidate label strings for mention ``i``. Mentions without a
            corresponding item, and ``None`` candidates, never match.
        cutoff: number of top-ranked candidates to inspect per mention.

    Returns:
        The mean over all mentions of ``1 / rank`` of the first candidate that
        equals the gold label case-insensitively (0 when none does within the
        cutoff). Returns 0.0 when `gold` is empty.
    """
    if len(gold) == 0:
        return 0.0

    reciprocal_ranks = []
    for i, target in enumerate(gold):
        rr = 0.0
        candidates = ranked_toponyms[i] if i < len(ranked_toponyms) else []
        if target is not None:
            wanted = target.lower()
            for j in range(min(len(candidates), cutoff)):
                candidate = candidates[j]
                if candidate is not None and candidate.lower() == wanted:
                    rr = 1.0 / (j + 1)
                    break  # only the first match counts, even if the name repeats
        reciprocal_ranks.append(rr)
    return sum(reciprocal_ranks) / len(reciprocal_ranks)

In [7]:
def _prediction_coords(prediction):
    """Return the (latitude, longitude) floats of a prediction, or (0.0, 0.0) when missing or unparseable."""
    try:
        return (float(prediction['latitude']), float(prediction['longitude']))
    except (KeyError, TypeError, ValueError):
        return (0.0, 0.0)


def run(gold, systems, outdir, geoloc, g_online, p_online):
    """Load gold and system files, resolve coordinates to place names and evaluate.

    Args:
        gold: path to a JSON-lines file; every line is an object with
            ``tweet_id``, ``lm``, ``latitude``, ``longitude`` and ``type``.
        systems: list of paths to JSON-lines prediction files; every line is an
            object with ``tweet_id``, ``lm`` and ``ranked_toponyms`` (a list of
            objects). The file's base name up to the first dot is used as the
            system name.
        outdir: directory that receives ``eval.output.json`` (and the
            coordinate cache when `geoloc` is not given but an online mode is on).
        geoloc: path to a tab-separated cache file whose lines hold a
            coordinate tuple literal and a location dict literal, or a falsy
            value when there is no cache. The file is rewritten at the end.
        g_online: passed to ``resolve_to_location`` as the online flag for the
            gold coordinates.
        p_online: when truthy, predicted coordinates are resolved through
            ``resolve_to_location``; otherwise each prediction must already
            carry ``country``, ``state``, ``city`` and ``county``.

    Returns:
        The value returned by ``geoloceval(docs)``.

    Raises:
        KeyError: when a prediction refers to a mention with no gold entry, or
            a gold mention has no entry in a system file.
    """
    docs = defaultdict(dict)
    cached_coords = {}

    if geoloc:
        with codecs.open(geoloc, 'r', 'utf-8') as fcache:
            for line in fcache:
                if not line.strip():
                    continue  # tolerate blank lines in the cache file
                s = line.split("\t")
                cached_coords.update([(literal_eval(s[0]), literal_eval(s[1]))])

    # load the ground-truth
    with codecs.open(gold, 'r', 'utf-8') as fgold:
        logging.info('{}'.format(os.path.basename(gold)))

        for line in fgold:
            data = json.loads(line)
            base = data.pop('tweet_id') + "-" + data.pop('lm')
            # The same mention text may occur several times in one tweet; the
            # first free "-1".."-5" suffix keeps the occurrences apart.
            key = base
            for i in ["1", "2", "3", "4", "5"]:
                if base + "-" + i not in docs:
                    key = base + "-" + i
                    break
            docs[key]['gold'] = [data]

    # resolve gold coordinates to locations
    coords = {d: {'gold': [(float(docs[d]['gold'][0]['latitude']), float(docs[d]['gold'][0]['longitude']))]} for d in docs}
    resolved_gold = resolve_to_location(coords, g_online, cached_coords)

    # Pair each document with its own resolved entry by key (not by position),
    # carry the gold type over, and lower-case the whole record.
    for d in list(docs.keys()):
        entry = resolved_gold[d]['gold'][0]
        entry['type'] = docs[d]['gold'][0]['type']
        docs[d] = json.loads(json.dumps({'gold': [entry]}).lower())

    for s in systems:
        with codecs.open(s, 'r', 'utf-8') as fsystem:
            sysname = os.path.basename(s).split('.')[0]
            for line in fsystem:
                data = json.loads(line)
                base = data.pop('tweet_id') + "-" + data.pop('lm')
                # Take the first suffixed key this system has not filled yet.
                key = base
                for i in ["1", "2", "3", "4", "5"]:
                    candidate = base + "-" + i
                    if candidate in docs and sysname in docs[candidate]:
                        continue
                    key = candidate
                    break
                docs[key][sysname] = json.loads(json.dumps(data["ranked_toponyms"]).lower())

        # Fail early and explicitly when a prediction has no gold counterpart;
        # otherwise the per-system lists would no longer line up with the gold list.
        for k in docs:
            if 'gold' not in docs[k]:
                raise KeyError("prediction '{}' of system '{}' has no matching gold mention".format(k, sysname))

        if (p_online):
            # A prediction without usable coordinates falls back to (0.0, 0.0).
            coords = {}
            for k in docs.keys():
                coords[k] = {sysname: [_prediction_coords(res) for res in docs[k][sysname]]}
            resolved_coords = resolve_to_location(coords, p_online, cached_coords)
        else:
            resolved_coords = {}
            for k in docs.keys():
                d = docs[k]
                resolved_coords[k] = {}
                resolved_coords[k][sysname] = []
                for res in d[sysname]:
                    resolved_coords[k][sysname].append(
                        {"latitude": float(res['latitude'])
                         , "longitude": float(res['longitude'])
                         , "country": res['country'], "state": res['state']
                         , "city": res['city'], "county": res['county']})

        # Replace every raw prediction with its resolved, lower-cased record.
        for k in docs.keys():
            for si, v in enumerate(resolved_coords[k][sysname]):
                docs[k][sysname][si] = json.loads(json.dumps(v).lower())

    if not geoloc and (g_online or p_online):
        geoloc = os.path.join(outdir, '.coords_cache.txt')

    if geoloc:
        with codecs.open(geoloc, 'w', 'utf-8') as outf:
            for k, v in cached_coords.items():
                outf.write('{}\t{}\n'.format(k, v))

            logging.info('Updated the offline coordinates cache.')

    outfile = os.path.join(outdir, 'eval.output.json')
    with codecs.open(outfile, 'w', 'utf-8') as jfile:
        json.dump(docs, jfile, ensure_ascii=False, indent=4, separators=(',', ':'))
    logging.info('Saved formatted files to {}'.format(outfile))

    return geoloceval(docs)



In [None]:
# Paths are assembled with os.path.join so the cell works on any platform,
# not only where "\\" is the path separator.
lap = "msuwa"  # "QCRI-IMMRAN" #"msuwa"
g = os.path.join("demo", "lat-lon-mapping")  # path to the mapping of lms to their ground truth geo-coordinates
o = os.path.join("demo", "results")  # path where results will be logged
p = os.path.join("demo", "baselines")  # path to the predictions of all systems
c = os.path.join("demo", "nominatim-cache")  # path for cached geographic location mappings; set to None when unavailable

events = ["sample"]

g_online = False  # use an online geocoding API for the gold coordinates; default=False (offline)
p_online = True   # resolve predicted coordinates through resolve_to_location
ss = ["georeferencing", "geolocator2", "geolocator3", "geoparsepy", "nominatim"]
for e in events:
    # One prediction file per system, one gold file and one cache file per event.
    systems = [os.path.join(p, e, s + ".jsonl") for s in ss]
    geoloc = os.path.join(c, e, "test.txt") if c else None
    gold = os.path.join(g, e, "test.jsonl")
    outdir = o
    run(gold, systems, outdir, geoloc, g_online, p_online)

    