In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from multiprocessing import Pool
from progressbar import ProgressBar
from unidecode import unidecode
import string
import pycountry
import jellyfish
import difflib
import csv
from collections import Counter
import re
import nltk
from nltk import bigrams
from nltk import trigrams

In [2]:
# Load the pickled DataFrame used by the rest of the notebook; later cells read
# its `entities`, `title`, `toc_subject`, `topics`, `raw_text` and
# `raw_text_url` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted file.
df = pd.read_pickle('entities_df')

In [8]:
def remove_non_ascii(s):
    """Return `s` without the characters whose code point is 128 or higher."""
    kept = [ch for ch in s if ord(ch) < 128]
    return "".join(kept)
 
def fuzzy_match(s1, s2, max_dist=.8):
    """Tell whether the Jaro score of `s1` and `s2` reaches `max_dist`.

    Despite its name, `max_dist` is used as a lower bound: the result is True
    when `jellyfish.jaro_distance(s1, s2)` is greater than or equal to it.
    """
    score = jellyfish.jaro_distance(s1, s2)
    return max_dist <= score

In [34]:
def title_except(s, exceptions=('a', 'an', 'the', 'and', 'but', 'or', 'for', 'nor', 'on', 'at', 'to', 'from', 'by', 'of')):
    """Title-case a space-separated string, keeping minor words lower-case.

    Each space-separated piece is handled by the first rule that applies:

    * empty pieces (from repeated, leading or trailing spaces) are kept as is;
    * words with more than one '.' are treated as abbreviations and upper-cased;
    * hyphenated words are capitalized and the single character right after
      the first hyphen is upper-cased too;
    * words opening with '(' or '[' get the character after the bracket
      upper-cased;
    * the first word is capitalized;
    * words listed in `exceptions` (also matched case-insensitively) are
      lower-cased;
    * any word containing "d'ivoire" becomes "d'Ivoire";
    * everything else is capitalized.

    Args:
        s: the string to convert.
        exceptions: words that stay lower-case unless they come first.

    Returns:
        The converted pieces joined with a single space.
    """
    word_list = re.split(' ', s)
    final = []
    for ix, word in enumerate(word_list):
        if not word:
            # nothing to capitalize; keeping the empty piece preserves spacing
            pass
        elif word.count('.') > 1:
            # fix abbreviations correctly
            word = word.upper()
        elif '-' in word:
            word = word.capitalize()
            location = word.find('-')
            # upper-case only the one character after the hyphen; slicing also
            # copes with a hyphen at the very end of the word
            word = word[:location + 1] + word[location + 1:location + 2].upper() + word[location + 2:]
        elif word[0] in ['(', '[']:
            # same idea for the character following the opening bracket
            word = word[0] + word[1:2].upper() + word[2:]
        elif ix == 0:
            word = word.capitalize()
        elif word in exceptions or word.lower() in exceptions:
            word = word.lower()
        elif "d'ivoire" in word.lower():
            word = "d'Ivoire"
        else:
            word = word.capitalize()
        final.append(word)
    return " ".join(final)


def correct_country_mispelling(s):
    """Map `s` to its replacement from ISO3166ErrorDictionary.csv, if listed.

    The CSV is read row by row on every call. A row matches when its first
    column equals `s` case-insensitively, either directly, after both sides
    are passed through `unidecode`, or after non-ASCII characters are removed
    from the column. The third column of the first matching row is returned
    as unicode; `s` is returned unchanged when no row matches.
    """
    with open("ISO3166ErrorDictionary.csv", "rb") as handle:
        for record in csv.reader(handle):
            is_match = (
                s.lower() == unicode(record[0], 'utf8').lower()
                or unidecode(s).lower() == unidecode(unicode(record[0], 'utf8')).lower()
                or s.lower() == remove_non_ascii(record[0]).lower()
            )
            if is_match:
                return unicode(record[2], 'utf8')
    return s


def matching_countries(entity):
    """Find the entry of the global `country_names` closest to `entity`.

    Returns:
        A `(name, ratio)` tuple for the best `difflib.get_close_matches`
        candidate at cutoff 0.8, where `ratio` is the `SequenceMatcher`
        ratio between that candidate and `entity`; None when nothing matches.
    """
    # further correction for misspellings
    candidates = difflib.get_close_matches(entity, country_names, cutoff=0.8)
    if not candidates:
        return None
    best = candidates[0]
    score = difflib.SequenceMatcher(None, best, entity).ratio()
    return (best, score)

    
def get_countries(places, spellcheck=False):
    """Collect the countries named among tagged entities.

    Args:
        places: iterable of `(text, label)` pairs; only pairs labelled
            'LOCATION', 'PERSON' or 'ORGANIZATION' are considered.
        spellcheck: when True, accept close matches found by
            `matching_countries` and use their similarity ratio as the
            probability; when False, require the lower-cased text to be in
            the global `country_names` and use probability 1.0.

    Returns:
        A dict mapping each title-cased country name to
        `{'probability': p, 'count': n}`, where `n` is the number of mentions
        and `p` the highest probability recorded for that exact name.
    """
    # correcting spelling introduces some false positives
    # likelihood of official government documents being spelled incorrectly is low
    countries = []
    for place, label in places:
        if label not in ('LOCATION', 'PERSON', 'ORGANIZATION'):
            continue
        place = correct_country_mispelling(place)
        if spellcheck:
            match = matching_countries(place.lower())
            if match:
                # record the matched country, not the (possibly misspelled) input
                countries.append((title_except(match[0]), match[1]))
        elif place.lower() in country_names:
            countries.append((title_except(place), 1.0))

    # aggregate per exact name; a substring test would let e.g. "Guinea"
    # overwrite the entry for "Equatorial Guinea"
    c_dict = {}
    for name, probability in countries:
        entry = c_dict.setdefault(name, {'probability': probability, 'count': 0})
        entry['count'] += 1
        entry['probability'] = max(entry['probability'], probability)
    return c_dict

# Try the extractor on the document with index label 14.
# `.ix` has been removed from pandas; `.loc` is the label-based equivalent.
# NOTE(review): get_countries reads the global `country_names`, which is only
# assigned in a later cell -- run that cell first on a fresh kernel.
get_countries(df.loc[14, 'entities'])

{u'China': {'count': 1, 'probability': 1.0},
 u'United States': {'count': 5, 'probability': 1.0}}

In [21]:
# Collect the `name` attribute of every entry in pycountry.countries.
country_names = [i.name for i in pycountry.countries]

# fix country names 
def standardize_country_name(name):
    """Decode `name` from UTF-8 when possible, then apply the misspelling table.

    Returns:
        Whatever `correct_country_mispelling` returns for the (possibly
        decoded) name.
    """
    try:
        name = unicode(name, 'utf8')
    except (TypeError, UnicodeDecodeError):
        # TypeError: `name` is already unicode (or not a string at all);
        # UnicodeDecodeError: the bytes are not valid UTF-8.
        # Either way the value is passed on unchanged, as before.
        pass
    name = correct_country_mispelling(name)
    return name

# Replace the list with standardized, lower-cased names; get_countries and
# matching_countries compare against this lower-cased form.
# NOTE(review): this rebinds `country_names`, so re-running only this cell
# feeds the already lower-cased names through the standardizer again.
country_names = [standardize_country_name(i).lower() for i in country_names]

In [13]:
# Load the GeoLite2 locations table and keep only rows that name a country.
# `DataFrame.from_csv` has been removed from pandas; `read_csv` with the same
# `index_col=None` / `encoding` arguments reads the file the same way.
subdivision_df = pd.read_csv('GeoLite2-City-Locations.csv', index_col=None, encoding='utf8').dropna(subset=['country_name'])
subdivision_df.head()

Unnamed: 0,geoname_id,continent_code,continent_name,country_iso_code,country_name,subdivision_iso_code,subdivision_name,city_name,metro_code,time_zone
0,1861060,AS,Asia,JP,Japan,,,,,Asia/Tokyo
1,1809858,AS,Asia,CN,China,44.0,Guangdong,Guangzhou,,Asia/Shanghai
2,1850147,AS,Asia,JP,Japan,13.0,Tōkyō,Tokyo,,Asia/Tokyo
3,1814991,AS,Asia,CN,China,,,,,
4,2077456,OC,Oceania,AU,Australia,,,,,


In [14]:
def _labelled_pairs(value_column, label):
    """Pair `country_name` with `value_column` from subdivision_df.

    Rows with a missing value are dropped, the value column is renamed to
    'subdivision', and a constant 'type' column holding `label` is added.
    """
    pairs = subdivision_df[['country_name', value_column]].dropna()
    pairs = pairs.rename(columns={value_column: 'subdivision'})
    pairs['type'] = label
    return pairs


s1 = _labelled_pairs('subdivision_name', 'subdivision')
s2 = _labelled_pairs('subdivision_iso_code', 'subdivision_code')
s3 = _labelled_pairs('city_name', 'city')
s4 = _labelled_pairs('country_iso_code', 'country_code')

# add countries to 'everything'
# (each unique country name is paired with itself)
s5 = pd.DataFrame([subdivision_df.country_name.unique()]*2).T
s5.columns = ['country_name','subdivision']
s5['type'] = 'country'

# one lookup table: every known place string and the country it belongs to
almost_everything = pd.concat([s1,s2,s3,s4,s5], ignore_index=True).drop_duplicates()

In [49]:
# Preview the first rows of the combined place-to-country lookup table.
almost_everything.head()

Unnamed: 0,country_name,subdivision,type
0,China,Guangdong,subdivision
1,Japan,Tōkyō,subdivision
2,Australia,Victoria,subdivision
3,Thailand,Bangkok,subdivision
4,Thailand,Changwat Samut Songkhram,subdivision


In [38]:
def adjust_probabilities(old_probability, possible_countries):
    """Re-weight a shared probability by how much context supports each country.

    Args:
        old_probability: the probability currently assigned to every candidate.
        possible_countries: iterable of `(country, count)` pairs, where
            `count` is the number of context clues found for that country.

    Returns:
        A list of `(country, probability)` pairs in input order. When all
        counts are equal (or there are no pairs) the probabilities are left
        unchanged. Otherwise a count of 0 halves the probability, and each
        clue moves the probability half of the remaining way towards 1.0.
    """
    # materialize once: the pairs are walked more than once below, which would
    # silently exhaust a one-shot iterable such as a Python 3 `zip`
    possible_countries = list(possible_countries)

    if len(set(count for _, count in possible_countries)) <= 1:
        # no change to probabilities when there are no contextual clues
        return [(country, old_probability) for country, _ in possible_countries]

    list_ = []
    for country, count in possible_countries:
        new_probability = old_probability
        if count == 0:
            # only decreases it by a single half if there is no nearby context for it
            # (2.0 keeps this a true division even for an integer probability)
            new_probability -= new_probability / 2.0
        for _ in range(count):
            # increase probability by half for each context clue in range
            new_probability += (1.0 - new_probability) / 2.0
        list_.append((country, new_probability))
    return list_


def remove_word(s, word):
    """Delete every whole-word, case-insensitive occurrence of `word` from `s`.

    `word` is matched literally: it is escaped before being placed in the
    pattern, so place names containing characters such as '.', '(' or '+'
    neither change the meaning of the pattern nor make it fail to compile.

    Returns:
        `s` with the matches replaced by the empty string (surrounding
        whitespace is left in place).
    """
    regex = re.compile(r'\b(' + re.escape(word) + r')\b', flags=re.IGNORECASE)
    return regex.sub("", s)


def find_all(a_str, sub):
    """Yield the start index of each non-overlapping occurrence of `sub` in `a_str`.

    An empty `sub` yields nothing: `str.find` would report a match at every
    position without the search position ever advancing, so the generator
    would otherwise never terminate.
    """
    if not sub:
        return
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return
        yield start
        start += len(sub) # use start += 1 to find overlapping matches


def context_adjustment(place, possible_countries, probability, text):
    """Use the text around an ambiguous place name to re-weight its countries.

    For each occurrence of `place` in `text`, a snippet reaching up to 60
    characters before and after the start of the match is examined. The
    snippet's words, bigrams and trigrams are looked up in the global
    `almost_everything` table. The first snippet that yields any known place
    decides the outcome: its per-country hit counts are passed to
    `adjust_probabilities`, and later snippets are ignored.

    Args:
        place: the ambiguous place name.
        possible_countries: countries `place` may belong to.
        probability: the probability currently assigned to each of them.
        text: the document text to search.

    Returns:
        A dict mapping each country to `{'count': n, 'probability': p}`.
        When no snippet yields a clue, every country keeps `probability`
        with a count of 1.
    """
    # get contextual windows revolving around ambiguous place name
    window = 60
    snippets = []
    for position in find_all(text, place):
        lower_bound = max(position - window, 0)
        upper_bound = min(position + window, len(text))
        snippets.append(text[lower_bound:upper_bound])

    new_probabilities = []
    # stop at the first snippet that provides any clue rather than gathering
    # every clue, which can multiply false positives when the same error repeats
    for snippet in snippets:
        snippet = remove_word(snippet, place)
        raw_tokens = nltk.word_tokenize(snippet)
        codes = [tok for tok in raw_tokens if tok == tok.upper() and tok.isalpha()]

        # chop off first and last token which are likely not whole words
        words = [tok.lower() for tok in raw_tokens if tok.isalpha()][1:-1]
        two_word = [' '.join(gram) for gram in bigrams(words)]
        three_word = [' '.join(gram) for gram in trigrams(words)]

        # fix capitalization of state codes
        candidates = []
        for phrase in words + two_word + three_word:
            if phrase.upper() in codes:
                candidates.append(phrase.upper())
            else:
                candidates.append(title_except(phrase))

        # check whether contextual token is a country subdivision
        context_countries = []
        for candidate in candidates:
            hits = almost_everything[almost_everything.subdivision == candidate]
            if not hits.empty:
                context_countries.extend(hits.country_name.tolist())

        if not context_countries:
            continue

        # use the number of contextual countries that are the same as the
        # ambiguous countries to compute new probabilities
        context_count = Counter(context_countries)
        ambiguous_country_counts = zip(possible_countries, [context_count[name] for name in possible_countries])
        new_probabilities.extend(adjust_probabilities(probability, ambiguous_country_counts))
        break

    # combine multiple contexts into a single count and probability per country
    dict_ = {}
    if new_probabilities:
        for country in {pair[0] for pair in new_probabilities}:
            probs = [pair[1] for pair in new_probabilities if pair[0] == country]
            combined = probs[0]
            for extra in probs[1:]:
                combined = independent_either_probability(combined, extra)
            dict_[country] = {'count': len(probs), 'probability': combined}
    else:
        for country in possible_countries:
            dict_[country] = {'count': 1, 'probability': probability}
    return dict_


def independent_either_probability(oldp, newp):
    """Combine two probabilities as `1 - (1 - oldp) * (1 - newp)`.

    This is the chance that at least one of two independent events occurs.
    """
    neither = (1 - oldp) * (1 - newp)
    return 1 - neither


def _merge_country_evidence(countries, country, count, probability):
    """Fold one new (count, probability) observation for `country` into `countries` in place."""
    if country in countries:
        priors = countries[country]
        count = priors['count'] + count
        probability = independent_either_probability(priors['probability'], probability)
    countries[country] = {'count': count, 'probability': probability}


def update_countries_with_regions(entities, countries, text):
    """Add countries implied by sub-national place names to `countries`.

    Every entity labelled 'LOCATION' is looked up in the global
    `almost_everything` table. A place that maps to a single country
    contributes a probability of 0.8; a place that maps to several countries
    is resolved with `context_adjustment` and recorded as ambiguous.

    Args:
        entities: iterable of `(text, label)` pairs.
        countries: dict of `{country: {'count': n, 'probability': p}}`;
            it is updated in place and also returned.
        text: the document text, used for context around ambiguous places.

    Returns:
        A `(countries, ambiguous_locations)` tuple, where
        `ambiguous_locations` maps each ambiguous place to
        `{'possible_countries': [...]}`.
    """
    # adds countries derived from regions to country list
    ambiguous_locations = {}

    # gather the matching rows first and concatenate once, instead of growing
    # a DataFrame inside the loop
    matches = []
    for entity in {i[0] for i in entities if i[1] == 'LOCATION'}:
        a = almost_everything[almost_everything.subdivision == entity]
        if not a.empty:
            matches.append(a)

    if not matches:
        return countries, ambiguous_locations

    subs = pd.concat(matches, ignore_index=True)
    subs['country_name'] = subs.country_name.apply(standardize_country_name)
    no_dupes = subs.drop_duplicates(['country_name', 'subdivision'])
    place_counts = no_dupes.subdivision.value_counts()
    for place, count in zip(place_counts.index, place_counts):
        if count == 1:
            # only one country exists for a single subdivision
            probability = 0.8 # correcting for imperfect entity parsing
            # counted on `subs`, so the same place matched under several
            # types contributes one count per matching row
            possible_countries = subs[subs.subdivision == place].country_name.tolist()
            _merge_country_evidence(countries, possible_countries[0], len(possible_countries), probability)
        else:
            # multiple countries exist for a single subdivision
            probability = 1.0 / count
            possible_countries = no_dupes[no_dupes.subdivision == place].country_name.tolist()
            new_probabilities = context_adjustment(place, possible_countries, probability, text)
            ambiguous_locations[place] = {'possible_countries': possible_countries}
            for country in possible_countries:
                adjusted = new_probabilities[country]
                _merge_country_evidence(countries, country, adjusted['count'], adjusted['probability'])
    return countries, ambiguous_locations


In [39]:
def parse_countries(row):
    """Extract country evidence from one document record.

    Args:
        row: a record exposing `entities`, `title`, `toc_subject`, `topics`
            and `raw_text` attributes (for example one row Series of `df`).

    Returns:
        The `(countries, ambiguous_locations)` tuple produced by
        `update_countries_with_regions`.
    """
    countries = get_countries(row.entities)
    # `row` is the record itself (callers pass the Series, not the iterrows
    # tuple), so `topics` is read from it like the other fields
    text = row.title + '\n' + row.toc_subject + '\n' + ' '.join(row.topics) + '\n' + row.raw_text
    countries, ambiguous_locations = update_countries_with_regions(row.entities, countries, text)
    return (countries, ambiguous_locations)


In [522]:
# Spot-check the parser on the documents with index labels 0 through 20.
# `.ix` has been removed from pandas; `.loc` keeps the label-based, inclusive
# slice. Single-argument print(...) behaves the same on Python 2 and 3.
sample = df.loc[0:20]
for label, record in sample.iterrows():
    print('***')
    print(label)
    print(parse_countries(record))

***
0
{u'Canada': {'count': 1, 'probability': 0.07142857142857142}, u'United Kingdom': {'count': 1, 'probability': 0.25}, u'Argentina': {'count': 1, 'probability': 0.07142857142857142}, u'Georgia': {'count': 2, 'probability': 1.0}, u'Italy': {'count': 1, 'probability': 0.07142857142857142}, u'Uruguay': {'count': 1, 'probability': 0.07142857142857142}, u'Saint Lucia': {'count': 1, 'probability': 0.07142857142857142}, u'Mexico': {'count': 1, 'probability': 0.07142857142857142}, u'Costa Rica': {'count': 1, 'probability': 0.07142857142857142}, u'Venezuela, Bolivarian republic of': {'count': 1, 'probability': 0.07142857142857142}, u'United States': {'count': 10, 'probability': 1.0}, u'Colombia': {'count': 1, 'probability': 0.25}, u'Panama': {'count': 1, 'probability': 0.07142857142857142}, u'Philippines': {'count': 1, 'probability': 0.07142857142857142}, u'Spain': {'count': 1, 'probability': 0.07142857142857142}, u'Moldova, Republic of': {'count': 1, 'probability': 0.07142857142857142}}
***

In [641]:
# Parse every document in `df`, reporting progress as each row completes.
total_rows = len(df)
pbar = ProgressBar(maxval=total_rows).start()
countries = []
for ix, row in enumerate(df.iterrows()):
    record = row[1]  # iterrows yields (index label, row Series)
    countries.append(parse_countries(record))
    pbar.update(ix)
pbar.finish()

In [18]:
# Look up the document with this raw-text URL (the displayed row has index label 14).
df[df.raw_text_url == 'https://www.federalregister.gov/articles/text/raw_text/201/231/447.txt']

Unnamed: 0,raw_text_url,title,toc_subject,topics,raw_text,entities
14,https://www.federalregister.gov/articles/text/...,Fresh Garlic From the People's Republic of Chi...,"Antidumping Duty New Shipper Reviews; Results,...",[],\nSUMMARY: \nThe Department of Commerce (Depar...,"[(Department of Commerce ( Department, ORGANIZ..."


In [40]:
# Walk document 14 through both stages by hand.
# `.ix` has been removed from pandas; `.loc` is the label-based equivalent.
# NOTE(review): only `raw_text` is passed as context here, whereas
# parse_countries also prepends the title, subject and topics.
countries = get_countries(df.loc[14, 'entities'])
update_countries_with_regions(df.loc[14, 'entities'], countries, df.loc[14, 'raw_text'])

({u'China': {'count': 2, 'probability': 1.0},
  u'Colombia': {'count': 1, 'probability': 0.25},
  u'Russian Federation': {'count': 1, 'probability': 0.16666666666666666},
  u'United Kingdom': {'count': 1, 'probability': 0.25},
  u'United States': {'count': 9, 'probability': 1.0}},
 {u'China': {'possible_countries': [u'United States',
    u'Russian Federation',
    u'China']},
  u'DC': {'possible_countries': [u'United States', u'Colombia']},
  u'Washington': {'possible_countries': [u'United States',
    u'United Kingdom']}})

In [None]:
# This cell was never executed and only holds a dict literal.
# NOTE(review): presumably output pasted from an earlier run for comparison --
# confirm its origin or delete the cell.
{u'China': {'count': 1, 'probability': 0.16666666666666666},
 u'Colombia': {'count': 1, 'probability': 0.25},
 u'Russian Federation': {'count': 1, 'probability': 0.16666666666666666},
 u'United Kingdom': {'count': 1, 'probability': 0.25},
 u'United States': {'count': 4, 'probability': 0.9999674479166667}}