In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from multiprocessing import Pool
from progressbar import ProgressBar
from unidecode import unidecode
import string
import pycountry
import jellyfish
import difflib
import csv
from collections import Counter
import re
import nltk
from nltk import bigrams
from nltk import trigrams

In [2]:
df = pd.read_pickle('entities_df')

In [3]:
def remove_non_ascii(s): return "".join(i for i in s if ord(i)<128)
 
def fuzzy_match(s1, s2, max_dist=.8):
    return jellyfish.jaro_distance(s1, s2) >= max_dist

In [49]:
country_names = [i.name.lower() for i in pycountry.countries]


def title_except(s, exceptions=['a', 'an', 'of', 'the', 'is']):
    word_list = re.split(' ', s)       #re.split behaves as expected
    final = [word_list[0].capitalize()]
    for word in word_list[1:]:
        final.append(word in exceptions and word or word.capitalize())
    return " ".join(final)


def correct_country_mispelling(s):
    with open("ISO3166ErrorDictionary.csv", "rb") as info:
        reader = csv.reader(info)
        for row in reader:
            if s.lower() == unicode(row[0],'utf8').lower():
                return row[2]
            if s.lower() == unidecode(unicode(row[0],'utf8')).lower():
                return row[2]
            if s.lower() == remove_non_ascii(row[0]).lower():
                return row[2]
    return s


def matching_countries(entity):
    # further correction for misspellings
    matching_countries = difflib.get_close_matches(entity, country_names, cutoff=0.8,)
    if matching_countries:
        confidence = difflib.SequenceMatcher(None, matching_countries[0], entity).ratio()
        return (matching_countries[0], confidence)

    
def get_countries(places, spellcheck=False):
    # correcting selling introduces some false positives
    # likelihood of official government documents being spelled incorrectly is low
    countries = []
    for place, label in places:
        if label == 'LOCATION':
            place = correct_country_mispelling(place)
            if spellcheck:
                match = matching_countries(place.lower())
                if match:
                    countries.append((place, match[1]))
            else:
                if place.lower() in country_names:
                    countries.append((place, 1.0))
    c = set(Counter(name for name, _ in countries).iteritems())
    c_dict = {}
    for country, count in c:
        # gets the probability from before the counter
        c_dict.update({country: {'probability': probability, 'count': count} for name, probability in sorted(countries) if name in country})
    return c_dict


In [5]:
subdivision_df = pd.DataFrame.from_csv('GeoLite2-City-Locations.csv', index_col=None, encoding='utf8')
subdivision_df.head()

Unnamed: 0,geoname_id,continent_code,continent_name,country_iso_code,country_name,subdivision_iso_code,subdivision_name,city_name,metro_code,time_zone
0,1861060,AS,Asia,JP,Japan,,,,,Asia/Tokyo
1,1809858,AS,Asia,CN,China,44.0,Guangdong,Guangzhou,,Asia/Shanghai
2,1850147,AS,Asia,JP,Japan,13.0,Tōkyō,Tokyo,,Asia/Tokyo
3,1814991,AS,Asia,CN,China,,,,,
4,2077456,OC,Oceania,AU,Australia,,,,,


In [6]:
s1 = subdivision_df[['country_name', 'subdivision_name']].dropna().rename(columns={'subdivision_name':'subdivision'})
s1['type'] = 'subdivision'
s2 = subdivision_df[['country_name', 'subdivision_iso_code']].dropna().rename(columns={'subdivision_iso_code':'subdivision'})
s2['type'] = 'subdivision_code'
s3 = subdivision_df[['country_name', 'city_name']].dropna().rename(columns={'city_name':'subdivision'})
s3['type'] = 'city'
alles = pd.concat([s1,s2,s3], ignore_index=True).drop_duplicates()

In [7]:
alles.head()

Unnamed: 0,country_name,subdivision,type
0,China,Guangdong,subdivision
1,Japan,Tōkyō,subdivision
2,Australia,Victoria,subdivision
3,Thailand,Bangkok,subdivision
4,Thailand,Changwat Samut Songkhram,subdivision


In [120]:
def adjust_probabilities(old_probability, possible_countries):
    if sum(count for _, count in possible_countries) == 0:
        # no change to probabilities when there are no contextual clues
        return [(country, old_probability) for country, _ in possible_countries]
    
    list_ = []
    for country, count in possible_countries:
        new_probability = old_probability
        if count == 0:
            # only decreases it by a single half if there is no nearby context for it
            decrease = new_probability / 2
            new_probability -= decrease
        for i in range(count):
            # increase probability by half for each context clue in range
            increase = (1.0 - new_probability) / 2
            new_probability += increase
        list_.append((country, new_probability))
    return list_


def remove_word(s, word):
    remove = word
    regex = re.compile(r'\b('+remove+r')\b', flags=re.IGNORECASE)
    out = regex.sub("", s)
    return out


def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub) # use start += 1 to find overlapping matches


def context_adjustment(place, possible_countries, probability, text):
    # get contextual windows revolving around ambiguous place name
#     print('{} could be in {} with a probability of {} for each'.format(place, possible_countries, probability))
    window = 60
    bottom = lambda x: x-window if x-window > 0 else 0
    top = lambda x: x+window if x+window < len(text) else len(text)
#     indices = [n for n in xrange(len(text)) if text.lower().find(place.lower(), n) == n]
#     print indices
#     indices = [m.start() for m in re.finditer(place, text, flags=re.IGNORECASE)]
    indices = list(find_all(text, place))
    contexts = [text[bottom(i):top(i)] for i in indices]    
#     print('{} has surrounding contexts of {}'.format(place, contexts))
#     print
    new_probabilities = []
    while not new_probabilities:
        # waits until any contextual clues are acquired rather than getting every possible contextual clue which can lead to false positives when get multiple copies of same error
        for context in contexts:
            context = remove_word(context, place)
            tokens = nltk.word_tokenize(context)
            codes = [t.lower() for t in tokens if t==t.upper()]

            # chop off first and last token which are likely not whole words
            tokens = [token.lower() for token in tokens if token.isalpha()][1:-2]
            bi_tokens = bigrams(tokens)
            tri_tokens = trigrams(tokens)
            tokens = tokens + [' '.join(t) for t in bi_tokens] + [' '.join(t) for t in tri_tokens]

            # fix capitalization of state codes
            tokens = [(lambda x: x.upper() if x in codes else title_except(x))(t) for t in tokens]
    #         print('Recognized locations in the context are {}'.format(filter(lambda x: x in [i for i in alles.subdivision.tolist()], tokens)))
            context_countries = []

            # check whether contextual token is a country subdivision
            for i in tokens:
                a = alles[alles.subdivision == i]
                if not a.empty:
                    list_ = a.country_name.tolist()
                    context_countries.extend(list_)
    #                 print('{} could refer to {}'.format(i, list_))

            # use the number of contextual countries that are the same as the ambiguous countries to compute new probabilities
            if context_countries:
                context_count = Counter(context_countries)
    #             print('Counts for each context-country are {}'.format(context_count))
                ambiguous_country_counts = zip(possible_countries, map(lambda x: context_count[x], possible_countries))
    #             print('Counts for ambiguous countries are {}'.format(ambiguous_country_counts))
                new_probabilities.extend(adjust_probabilities(probability, ambiguous_country_counts))
                break
        
    # combine multiple contexts into a single count and probability per country
    dict_ = {}
    if new_probabilities:
        country_set = {i[0] for i in new_probabilities}
        for country in country_set:
            probs = [i[1] for i in new_probabilities if i[0] == country]
            count = len(probs)
            probabilitiy = probs.pop(0)
            if count > 1:
                for i in probs:
                    probability = independent_either_probability(probability, i)
            dict_[country] = {'count': count, 'probability': probability}
    else:
        for country in possible_countries:
            dict_[country] = {'count': 1, 'probability': probability}
    return dict_


def independent_either_probability(oldp, newp):
    probability_non_occurrence = (1-oldp) * (1-newp)
    new_probability = 1 - probability_non_occurrence
    return new_probability


def update_countries_with_regions(entities, countries, text):
    # adds countries derived from regions to country list
    subs = pd.DataFrame()
    for entity in {i[0] for i in entities if i[1]=='LOCATION'}:
        a = alles[alles.subdivision == entity]
        if not a.empty:
            subs = pd.concat([subs, a], ignore_index=True)

    if not subs.empty:
        no_dupes = subs.drop_duplicates(['country_name', 'subdivision'])
        for value_count in no_dupes.subdivision.value_counts().iteritems():
            count = value_count[1]
            place = value_count[0]
            probability = 1.0 / count
            if probability == 1.0:
                # only one country exists for any probability
                probability = 0.8 # correcting for imperfect entity parsing
                possible_countries = subs[subs.subdivision == place].country_name.tolist()
                country = possible_countries[0]
                if country in countries:
                    priors = countries[country]
                    new_count = priors['count'] + len(possible_countries)
                    new_probability = independent_either_probability(priors['probability'], probability)
                    countries[country] = {'count': new_count, 'probability': new_probability}
                else:
                    countries[country] = {'count': len(possible_countries), 'probability': probability}
            else:
                # multiple countries exist for a single subdivision
                possible_countries = no_dupes[no_dupes.subdivision == place].country_name.tolist()
                new_probabilities = context_adjustment(place, possible_countries, probability, text)
                for country in possible_countries:
                    if country in countries:
                        priors = countries[country]
                        new_count = priors['count'] + new_probabilities[country]['count']
                        new_probability = independent_either_probability(priors['probability'], new_probabilities[country]['probability'])
                        countries[country] = {'count': new_count, 'probability': new_probability}
                    else:
                        countries[country] = {'count': new_probabilities[country]['count'], 'probability': new_probabilities[country]['probability']}
    return countries


In [121]:
def parse_countries(row):
    countries = get_countries(row.entities)
    countries = update_countries_with_regions(row.entities, countries, row.raw_text)
    return countries

parse_countries(df.ix[9])

{u'Australia': {'count': 1, 'probability': 0.3333333333333333},
 u'Brazil': {'count': 1, 'probability': 0.3333333333333333},
 u'Costa Rica': {'count': 1, 'probability': 0.2},
 u'Czechia': {'count': 1, 'probability': 0.5},
 u'El Salvador': {'count': 1, 'probability': 0.5},
 u'Mexico': {'count': 1, 'probability': 0.2},
 u'New Zealand': {'count': 1, 'probability': 0.8},
 u'Panama': {'count': 1, 'probability': 0.2},
 u'Qatar': {'count': 1, 'probability': 0.3333333333333333},
 u'Spain': {'count': 1, 'probability': 0.2},
 u'Trinidad and Tobago': {'count': 1, 'probability': 0.3333333333333333},
 u'United States': {'count': 10, 'probability': 1.0}}

In [122]:
sample = df.ix[0:10]
for row in sample.iterrows():
    print '***'
    print row[0]
    print parse_countries(row[1])

***
0
{u'United Kingdom': {'count': 1, 'probability': 0.5}, u'Georgia': {'count': 1, 'probability': 1.0}, u'Italy': {'count': 1, 'probability': 0.16666666666666666}, u'Saint Lucia': {'count': 1, 'probability': 0.16666666666666666}, u'Mexico': {'count': 1, 'probability': 0.14285714285714285}, u'Philippines': {'count': 1, 'probability': 0.14285714285714285}, u'Uruguay': {'count': 1, 'probability': 0.16666666666666666}, u'Costa Rica': {'count': 1, 'probability': 0.14285714285714285}, u'Moldova': {'count': 1, 'probability': 0.16666666666666666}, u'United States': {'count': 9, 'probability': 1.0}, u'Colombia': {'count': 1, 'probability': 0.5}, u'Argentina': {'count': 1, 'probability': 0.14285714285714285}, u'Venezuela': {'count': 1, 'probability': 0.14285714285714285}, u'Panama': {'count': 1, 'probability': 0.14285714285714285}, u'Spain': {'count': 1, 'probability': 0.16666666666666666}}
***
1
{'United States': {'count': 2, 'probability': 1.0}}
***
2
{u'Canada': {'count': 1, 'probability': 

In [663]:




# create default for where if there is no ambiguity resolution
# make it no change to probabilities when count for each is EQUAL rather than just zero
# run alles country names through misspelling corrector so that all the country names are the same. fuzzy match?
# needs to take into account when subdivision name is made up of multiple words / chunk.
# take into account continent_name and country_iso_code (left as caps) in subdivision_df
# should probably convert back to string.lower() from token
# needs to take into account actual country names in context too. not just subdivisions
# does it take into account multiple locations or more than one count for each city?
# when countries are tagged should remove them from the list unless they could also be a city/division name?

In [400]:
sorted(alles.country_name.unique())

[u'Afghanistan',
 u'Albania',
 u'Algeria',
 u'American Samoa',
 u'Andorra',
 u'Angola',
 u'Anguilla',
 u'Antarctica',
 u'Antigua and Barbuda',
 u'Argentina',
 u'Armenia',
 u'Aruba',
 u'Australia',
 u'Austria',
 u'Azerbaijan',
 u'Bahamas',
 u'Bahrain',
 u'Bangladesh',
 u'Barbados',
 u'Belarus',
 u'Belgium',
 u'Belize',
 u'Benin',
 u'Bermuda',
 u'Bhutan',
 u'Bolivia',
 u'Bonaire',
 u'Bosnia and Herzegovina',
 u'Botswana',
 u'Brazil',
 u'British Virgin Islands',
 u'Brunei',
 u'Bulgaria',
 u'Burkina Faso',
 u'Burundi',
 u'Cambodia',
 u'Cameroon',
 u'Canada',
 u'Cape Verde',
 u'Cayman Islands',
 u'Central African Republic',
 u'Chad',
 u'Chile',
 u'China',
 u'Colombia',
 u'Comoros',
 u'Congo',
 u'Cook Islands',
 u'Costa Rica',
 u'Croatia',
 u'Cuba',
 u'Cura\xe7ao',
 u'Cyprus',
 u'Czechia',
 u'Denmark',
 u'Dominica',
 u'Dominican Republic',
 u'East Timor',
 u'Ecuador',
 u'Egypt',
 u'El Salvador',
 u'Equatorial Guinea',
 u'Estonia',
 u'Ethiopia',
 u'Falkland Islands',
 u'Faroe Islands',
 u'Fij

In [664]:
df.ix[0:3].apply(parse_countries, axis=1)

San Francisco could be in [u'United States', u'Philippines', u'Costa Rica', u'Panama', u'Mexico', u'Argentina', u'Venezuela'] with a probability of 0.142857142857 for each
[931, 946]
San Francisco has surrounding contexts of [u' display of the exhibit objects at the Fine Arts Museums of San Francisco, San Francisco, CA, from on or about January 2', u' exhibit objects at the Fine Arts Museums of San Francisco, San Francisco, CA, from on or about January 26, 2013, until ']

Recognized locations in the context are [u'CA', u'From']
CA could refer to [u'Italy', u'United States', u'Saint Lucia', u'Spain', u'Moldova', u'Uruguay']
From could refer to [u'Norway']
Counts for each context-country are Counter({u'Italy': 1, u'Uruguay': 1, u'Saint Lucia': 1, u'Moldova': 1, u'United States': 1, u'Norway': 1, u'Spain': 1})
Counts for ambiguous countries are [(u'United States', 1), (u'Philippines', 0), (u'Costa Rica', 0), (u'Panama', 0), (u'Mexico', 0), (u'Argentina', 0), (u'Venezuela', 0)]
Recognized 

0    <built-in method values of dict object at 0x11...
1    <built-in method values of dict object at 0x11...
2    <built-in method values of dict object at 0x11...
3    <built-in method values of dict object at 0x11...
dtype: object

In [None]:
# check whether any part of an entity string relates to a country
# apply probability adjustment like method to spellcheck for get_countries
# apply factual
# build model