In [231]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from multiprocessing import Pool
from progressbar import ProgressBar
from unidecode import unidecode
import string
import pycountry
import jellyfish
import difflib
import csv
from collections import Counter
import re
import nltk

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the previously pickled DataFrame; a row of it is sampled further down and its
# `entities` and `raw_text` fields are fed to the country-detection functions.
# NOTE(review): unpickling can execute arbitrary code — only load 'entities_df' if it
# comes from a trusted source.
df = pd.read_pickle('entities_df')

In [6]:
def remove_non_ascii(s):
    """Return `s` with every character whose code point is 128 or above dropped."""
    ascii_only = [ch for ch in s if ord(ch) <= 127]
    return "".join(ascii_only)
 
def fuzzy_match(s1, s2, max_dist=.8):
    """Tell whether `s1` and `s2` score at least `max_dist` under jellyfish.jaro_distance.

    Despite its name, `max_dist` acts as a lower bound: the function returns
    True when the score is greater than or equal to it.
    """
    score = jellyfish.jaro_distance(s1, s2)
    return score >= max_dist

In [122]:
# Lower-cased names of every entry in pycountry.countries; get_countries and
# matching_countries compare lower-cased place strings against this list.
country_names = [i.name.lower() for i in pycountry.countries]


def title_except(s, exceptions=['a', 'an', 'of', 'the', 'is']):
    """Capitalise each space-separated word of `s`, except those listed in `exceptions`.

    The first word is always capitalised. Later words are left exactly as
    written when they appear in `exceptions` (a case-sensitive membership
    test) and are passed through str.capitalize() otherwise, which also
    lower-cases the rest of the word. Words are re-joined with single spaces.
    """
    pieces = re.split(' ', s)
    titled = [pieces[0].capitalize()]
    titled.extend(
        piece if piece in exceptions else piece.capitalize()
        for piece in pieces[1:]
    )
    return " ".join(titled)


def correct_country_mispelling(s):
    """Map a known country-name variant to the replacement listed in the error dictionary.

    Reads "ISO3166ErrorDictionary.csv" from the working directory (the file is
    re-opened on every call) and compares `s`, case-insensitively, against the
    first column of each row in three forms: the UTF-8 decoded text, its
    unidecode transliteration, and the text with non-ASCII characters removed.

    Returns the third column of the first matching row, or `s` unchanged when
    no row matches.
    """
    target = s.lower()
    with open("ISO3166ErrorDictionary.csv", "rb") as info:
        for row in csv.reader(info):
            if len(row) < 3:
                # Blank or truncated line: there is no replacement to return.
                continue
            # Decode once up front. unidecode expects text, not raw UTF-8
            # bytes; fed bytes it transliterates each byte separately and
            # the comparison below can never match an accented name.
            variant = unicode(row[0], 'utf8')
            candidates = (variant, unidecode(variant), remove_non_ascii(variant))
            if any(target == candidate.lower() for candidate in candidates):
                return row[2]
    return s


def matching_countries(entity):
    """Find the entry of `country_names` closest to `entity`.

    Returns a `(name, confidence)` tuple for the best candidate reported by
    difflib.get_close_matches with a cutoff of 0.8, where `confidence` is the
    SequenceMatcher ratio between that candidate and `entity`. Returns None
    when no candidate reaches the cutoff.
    """
    # further correction for misspellings
    candidates = difflib.get_close_matches(entity, country_names, cutoff=0.8)
    if not candidates:
        return None
    best = candidates[0]
    confidence = difflib.SequenceMatcher(None, best, entity).ratio()
    return (best, confidence)

    
def get_countries(places, spellcheck=False):
    """Collect the countries mentioned among tagged entities.

    Parameters:
        places: iterable of `(text, label)` pairs; only pairs whose label is
            'LOCATION' are considered.
        spellcheck: when True, accept fuzzy matches from matching_countries
            and record their confidence; when False, accept only names found
            verbatim (case-insensitively) in `country_names`, with
            probability 1.0.

    Returns a dict mapping each accepted place string to
    `{'probability': p, 'count': n}`, where `n` is how many times that exact
    string was accepted and `p` is the highest probability recorded for it.
    """
    # correcting spelling introduces some false positives
    # likelihood of official government documents being spelled incorrectly is low
    countries = []
    for place, label in places:
        if label != 'LOCATION':
            continue
        place = correct_country_mispelling(place)
        if spellcheck:
            match = matching_countries(place.lower())
            if match:
                # NOTE(review): the entry is keyed by the text as written, not
                # by the matched name in match[0] — confirm this is intended.
                countries.append((place, match[1]))
        elif place.lower() in country_names:
            countries.append((place, 1.0))

    counts = Counter(name for name, _ in countries)
    # Keep the highest probability seen for each exact name. Names are compared
    # for equality: a substring test would let e.g. "india" overwrite the
    # probability of "british indian ocean territory".
    best_probability = {}
    for name, probability in countries:
        if name not in best_probability or probability > best_probability[name]:
            best_probability[name] = probability
    return {name: {'probability': best_probability[name], 'count': count}
            for name, count in counts.items()}


In [279]:
# Load the GeoLite2 city/subdivision lookup table from the working directory.
# pd.read_csv replaces DataFrame.from_csv, which is deprecated and absent from
# current pandas; with no index column requested the two calls read the file
# the same way.
subdivision_df = pd.read_csv('GeoLite2-City-Locations.csv', encoding='utf8')
subdivision_df.head()

Unnamed: 0,geoname_id,continent_code,continent_name,country_iso_code,country_name,subdivision_iso_code,subdivision_name,city_name,metro_code,time_zone
0,1861060,AS,Asia,JP,Japan,,,,,Asia/Tokyo
1,1809858,AS,Asia,CN,China,44.0,Guangdong,Guangzhou,,Asia/Shanghai
2,1850147,AS,Asia,JP,Japan,13.0,Tōkyō,Tokyo,,Asia/Tokyo
3,1814991,AS,Asia,CN,China,,,,,
4,2077456,OC,Oceania,AU,Australia,,,,,


In [280]:
def _labelled_places(column, kind):
    """Pair `country_name` with one place column of subdivision_df, tagged with `kind`.

    Rows with a missing value in either column are dropped, the place column
    is renamed to 'subdivision', and a constant 'type' column is added.
    """
    pairs = subdivision_df[['country_name', column]].dropna()
    pairs = pairs.rename(columns={column: 'subdivision'})
    pairs['type'] = kind
    return pairs


# One frame per kind of place name, then a single de-duplicated lookup table.
s1 = _labelled_places('subdivision_name', 'subdivision')
s2 = _labelled_places('subdivision_iso_code', 'subdivision_code')
s3 = _labelled_places('city_name', 'city')
alles = pd.concat([s1, s2, s3], ignore_index=True).drop_duplicates()

In [281]:
# Preview the combined country / place-name lookup table.
alles.head()

Unnamed: 0,country_name,subdivision,type
0,China,Guangdong,subdivision
1,Japan,Tōkyō,subdivision
2,Australia,Victoria,subdivision
3,Thailand,Bangkok,subdivision
4,Thailand,Changwat Samut Songkhram,subdivision


In [289]:
# Quick check of unidecode's transliteration. The first call's value is discarded:
# it is not the cell's last expression, so only the printed string is shown.
unidecode(alles.subdivision.tolist()[1])
print unidecode(u'Ãland Islands')

Aland Islands


In [364]:
def adjust_probabilities(old_probability, possible_countries):
    """Re-weight candidate countries by how many context clues support each one.

    Parameters:
        old_probability: the probability initially shared by every candidate.
        possible_countries: iterable of `(country, count)` pairs, where
            `count` is the number of context clues pointing at that country.

    Returns a dict mapping each country to its adjusted probability:
      * if no candidate has any clue, every country keeps `old_probability`;
      * a candidate with zero clues is lowered by half of the gap between its
        probability and 1.0, but never below 0.0;
      * a candidate with clues has the gap to 1.0 halved once per clue.
    """
    # Materialise once: the pairs are traversed twice below and may arrive as
    # a one-shot iterator.
    possible_countries = list(possible_countries)
    if sum(count for _, count in possible_countries) == 0:
        # no change to probabilities when there are no contextual clues
        return {country: old_probability for country, _ in possible_countries}

    adjusted = {}
    for country, count in possible_countries:
        new_probability = old_probability
        if count == 0:
            # only decreases it by a single half if there is no nearby context for it
            decrease = (1.0 - new_probability) / 2
            # Clamp at zero: for starting values below 1/3 the subtraction
            # would otherwise produce a negative probability.
            # NOTE(review): halving the probability itself (p / 2) may have
            # been the intent; both agree for the common 0.5 case — confirm.
            new_probability = max(0.0, new_probability - decrease)
        for _ in range(count):
            # increase probability by half for each context clue in range
            new_probability += (1.0 - new_probability) / 2
        adjusted[country] = new_probability
    return adjusted


def context_adjustment(place, possible_countries, probability, text):
    """Disambiguate `place` between several countries using nearby place names.

    The text is tokenised with nltk.word_tokenize and reduced to alphabetic
    tokens. A window of up to 10 tokens before and 9 after the first
    occurrence of `place` is searched for names present in
    `alles.subdivision`; each hit votes for the countries it is listed under.
    The votes for `possible_countries` are then handed to
    adjust_probabilities together with `probability`.

    When `place` does not occur as a single alphabetic token (for example a
    multi-word name), the context is empty and the probabilities come back
    unchanged from adjust_probabilities.

    Prints a trace of each step and returns a dict mapping every entry of
    `possible_countries` to its adjusted probability.
    """
    print('{} could be in {} with a probability of {} each'.format(place, possible_countries, probability))
    tokens = [w for w in nltk.word_tokenize(text) if w.isalpha()]
    try:
        position = tokens.index(place)
    except ValueError:
        # The place never appears as a token: no context to draw on.
        context = []
    else:
        # Clamp the start at 0; a negative start would count from the end of
        # the token list and yield an empty or unrelated window.
        context = tokens[max(0, position - 10):position + 10]
        context.remove(place)
    print('{} has a surrounding context of {}'.format(place, context))

    # Build the membership set once instead of once per context word.
    known_places = set(alles.subdivision.tolist())
    recognized = [word for word in context if word in known_places]
    print('Recognized locations in the context are {}'.format(recognized))

    context_countries = []
    for word in context:
        rows = alles[alles.subdivision == word]
        if not rows.empty:
            referred = rows.country_name.tolist()
            context_countries.extend(referred)
            print('{} could refer to {}'.format(word, referred))
    context_count = Counter(context_countries)
    print('Counts for each context-country are {}'.format(context_count))

    ambiguous_country_counts = [(country, context_count[country]) for country in possible_countries]
    print('Counts for ambiguous countries are {}'.format(ambiguous_country_counts))
    new_probabilities = adjust_probabilities(probability, ambiguous_country_counts)
    print(new_probabilities)
    print('')
    return new_probabilities
    

def _combine_with_prior(countries, country, occurrences, probability):
    """Fold one piece of evidence for `country` into `countries`, in place.

    A country already present has `occurrences` added to its count and its
    probability combined as 1 - (1 - prior) * (1 - probability); a new
    country is stored with the given values as they are.
    """
    if country in countries:
        priors = countries[country]
        occurrences = priors['count'] + occurrences
        probability_non_occurrence = (1 - priors['probability']) * (1 - probability)
        probability = 1 - probability_non_occurrence
    countries[country] = {'count': occurrences, 'probability': probability}


def update_countries_with_regions(entities, countries, text):
    """Add evidence from city / subdivision names to the per-country tallies.

    Parameters:
        entities: iterable of `(text, label)` pairs; the label is ignored and
            every entity text is looked up in `alles.subdivision`.
        countries: dict of `{country: {'count': n, 'probability': p}}`; it is
            updated in place.
        text: the raw document, passed to context_adjustment to disambiguate
            names that exist in more than one country.

    A name found in exactly one country contributes a probability of 0.8 and
    one count per matching lookup row. A name found in several countries
    starts from an even split, is adjusted by context_adjustment, and adds
    one count to each candidate country.

    Returns the same `countries` dict.
    """
    # Gather every lookup hit first and concatenate once, rather than growing
    # a DataFrame inside the loop.
    matches = [alles[alles.subdivision == entity] for entity, _ in entities]
    matches = [match for match in matches if not match.empty]
    if not matches:
        return countries

    subs = pd.concat(matches, ignore_index=True)
    no_dupes = subs.drop_duplicates(['country_name', 'subdivision'])
    # Number of distinct countries each mentioned place name belongs to.
    countries_per_place = no_dupes.subdivision.value_counts()
    for place, n_countries in zip(countries_per_place.index.tolist(), countries_per_place.tolist()):
        if n_countries == 1:
            # only one country exists for this place
            probability = 0.8  # correcting for imperfect entity parsing
            mentions = subs[subs.subdivision == place].country_name.tolist()
            _combine_with_prior(countries, mentions[0], len(mentions), probability)
        else:
            # multiple countries exist for a single subdivision
            possible_countries = no_dupes[no_dupes.subdivision == place].country_name.tolist()
            new_probabilities = context_adjustment(place, possible_countries, 1.0 / n_countries, text)
            for country in possible_countries:
                _combine_with_prior(countries, country, 1, new_probabilities[country])
    return countries


In [190]:
# Pick a single row of `df` to experiment with.
# NOTE(review): `.ix` is deprecated and removed from later pandas releases. Whether 8 is
# meant as an index label or a position can't be told from here — switch to `.loc[8]`
# or `.iloc[8]` once confirmed.
sample = df.ix[8]

In [363]:
# Country-name matches first (spellcheck is left at its default, False), then fold in
# evidence from city / subdivision names. update_countries_with_regions updates
# `countries` in place and returns that same dict, which is what gets displayed.
countries = get_countries(sample.entities)
update_countries_with_regions(sample.entities, countries, sample.raw_text)

Lisbon could be in [u'Portugal', u'United States'] with a probability of 0.5 each
Lisbon has a surrounding context of [u'incarcerated', u'at', u'Inmate', u'Number', u'FCI', u'Elkton', u'Federal', u'Correctional', u'Institution', u'Box', u'OH', u'and', u'with', u'an', u'address', u'at', u'Courtland', u'Place', u'Laurel']
Recognized locations in the context are [u'Elkton', u'Federal', u'Box', u'OH', u'Courtland', u'Place', u'Laurel']
Elkton could refer to [u'United States']
Federal could refer to [u'Brazil', u'Argentina']
Box could refer to [u'United Kingdom', u'Finland']
OH could refer to [u'United States']
Courtland could refer to [u'United States']
Place could refer to [u'France']
Laurel could refer to [u'United States']
Counts for each context-country are Counter({u'United States': 4, u'Brazil': 1, u'United Kingdom': 1, u'Finland': 1, u'France': 1, u'Argentina': 1})
Counts for ambiguous countries are [(u'Portugal', 0), (u'United States', 4)]
{u'United States': 0.96875, u'Portugal': 0.25}

{u'Australia': {'count': 1, 'probability': 0.25},
 u'Nigeria': {'count': 2, 'probability': 1.0},
 u'Portugal': {'count': 1, 'probability': 0.25},
 u'United States': {'count': 16, 'probability': 1.0}}

In [None]:
# check whether any part of an entity string relates to a country