In [283]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from multiprocessing import Pool
from progressbar import ProgressBar
from unidecode import unidecode
import string
import pycountry
import jellyfish
import difflib
import csv
from collections import Counter
import re
import nltk
from nltk import bigrams
from nltk import trigrams
import textual

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the working DataFrame from the local pickle file 'entities_df'.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle('entities_df')

In [306]:
def correct_country_mispelling(s):
    """Replace `s` with the corrected name listed in the error dictionary CSV.

    Scans data/ISO3166ErrorDictionary.csv row by row. A row matches when its
    first column equals `s` case-insensitively in any of three forms:
    decoded as UTF-8, transliterated with unidecode on both sides, or passed
    through textual.remove_non_ascii. The third column of the first matching
    row is returned (decoded as UTF-8); `s` itself is returned when no row
    matches.
    """
    with open("data/ISO3166ErrorDictionary.csv", "rb") as info:
        for row in csv.reader(info):
            variant = unicode(row[0], 'utf8')
            # `or` short-circuits, so the checks run in the same order as
            # three separate `if` statements would.
            is_match = (
                s.lower() == variant.lower()
                or unidecode(s).lower() == unidecode(variant).lower()
                or s.lower() == textual.remove_non_ascii(row[0]).lower()
            )
            if is_match:
                return unicode(row[2], 'utf8')
    return s


def matching_countries(entity):
    """Fuzzy-match `entity` against the module-level `country_names` list.

    Returns a `(best_match, ratio)` tuple for the closest name whose
    similarity is at least 0.8, where `ratio` is the difflib
    SequenceMatcher ratio between that name and `entity`. Returns None when
    nothing is close enough.
    """
    # further correction for misspellings
    candidates = difflib.get_close_matches(entity, country_names, cutoff=0.8)
    if not candidates:
        return None
    best = candidates[0]
    return (best, difflib.SequenceMatcher(None, best, entity).ratio())

    
def get_countries(places, spellcheck=False):
    """Collect the countries named directly among tagged entities.

    Parameters
    ----------
    places : iterable of (text, label) pairs
        Named entities; only those labelled LOCATION, PERSON or ORGANIZATION
        are considered.
    spellcheck : bool, default False
        When True, each entity is fuzzy-matched against `country_names` via
        matching_countries and the matched country is recorded with the match
        ratio as its probability. When False, only exact (case-insensitive)
        members of `country_names` are recorded, with probability 1.0.

    Returns
    -------
    dict
        ``{country: {'count': occurrences, 'probability': highest probability
        recorded for that country}}``.
    """
    # Correcting spelling introduces some false positives, and the likelihood
    # of official government documents being misspelled is low, so fuzzy
    # matching is opt-in.
    candidate_labels = ('LOCATION', 'PERSON', 'ORGANIZATION')
    countries = []
    for place, label in places:
        if label not in candidate_labels:
            continue
        place = correct_country_mispelling(place)
        if spellcheck:
            match = matching_countries(place.lower())
            if match:
                # Record the matched country name rather than the (possibly
                # misspelled) entity text, title-cased like the exact branch.
                countries.append((textual.titlecase(match[0]), match[1]))
        elif place.lower() in country_names:
            countries.append((textual.titlecase(place), 1.0))

    # Aggregate per exact country name. A substring test here would let e.g.
    # "Sudan" contribute its probability to "South Sudan".
    c_dict = {}
    for name, probability in countries:
        entry = c_dict.setdefault(name, {'probability': probability, 'count': 0})
        entry['count'] += 1
        entry['probability'] = max(entry['probability'], probability)
    return c_dict

# Try get_countries on the entities of the row labelled 8.
# `.loc` is the label-based accessor; `.ix` was removed in pandas 1.0.
# NOTE(review): get_countries reads the global `country_names`, which is
# defined in a later cell, so this cell fails on a fresh top-to-bottom run.
get_countries(df.loc[8, 'entities'])

{u'Nigeria': {'count': 2, 'probability': 1.0},
 u'United States': {'count': 12, 'probability': 1.0}}

In [293]:
# Names of every country known to pycountry. get_countries and
# matching_countries read this module-level list.
country_names = [i.name for i in pycountry.countries]

# fix country names 
def standardize_country_name(name):
    """Return `name` as text with known misspellings/variants corrected.

    Byte strings are decoded as UTF-8 first (best effort: bytes that are not
    valid UTF-8 are passed on undecoded); values that are already text are
    left as they are. The result is then passed through
    correct_country_mispelling.
    """
    if isinstance(name, bytes):
        try:
            name = name.decode('utf8')
        except UnicodeDecodeError:
            # Deliberate best effort: keep the raw bytes rather than fail.
            pass
    return correct_country_mispelling(name)

# Standardise and lower-case the country names; get_countries compares them
# against `place.lower()`.
# NOTE(review): this rebinds country_names from its own previous value, so
# re-running this cell alone re-processes the already-normalised list.
country_names = [standardize_country_name(i).lower() for i in country_names]

In [294]:
# Load the GeoLite2 city-locations table and drop rows with no country name.
# pd.read_csv replaces DataFrame.from_csv, which was deprecated in pandas 0.21
# and removed in 1.0. from_csv's parse_dates=True default only affected index
# columns, and index_col=None means there are none, so the result is unchanged.
subdivision_df = pd.read_csv('data/GeoLite2-City-Locations.csv', index_col=None, encoding='utf8').dropna(subset=['country_name'])
subdivision_df.head()

Unnamed: 0,geoname_id,continent_code,continent_name,country_iso_code,country_name,subdivision_iso_code,subdivision_name,city_name,metro_code,time_zone
0,1861060,AS,Asia,JP,Japan,,,,,Asia/Tokyo
1,1809858,AS,Asia,CN,China,44.0,Guangdong,Guangzhou,,Asia/Shanghai
2,1850147,AS,Asia,JP,Japan,13.0,Tōkyō,Tokyo,,Asia/Tokyo
3,1814991,AS,Asia,CN,China,,,,,
4,2077456,OC,Oceania,AU,Australia,,,,,


In [295]:
def _country_pairs(column, kind):
    """Pair each country_name with the non-null values of `column`, tagged with `kind`."""
    pairs = subdivision_df[['country_name', column]].dropna().rename(columns={column: 'subdivision'})
    pairs['type'] = kind
    return pairs


# One frame per kind of place name, all sharing the columns
# country_name / subdivision / type.
s1 = _country_pairs('subdivision_name', 'subdivision')
s2 = _country_pairs('subdivision_iso_code', 'subdivision_code')
s3 = _country_pairs('city_name', 'city')
s4 = _country_pairs('country_iso_code', 'country_code')

# Add the countries themselves: each unique country is paired with its own name.
s5 = pd.DataFrame([subdivision_df.country_name.unique()]*2).T
s5.columns = ['country_name','subdivision']
s5['type'] = 'country'

almost_everything = pd.concat([s1,s2,s3,s4,s5], ignore_index=True).drop_duplicates()

In [296]:
# Preview the first rows of the combined place-name lookup table.
almost_everything.head()

Unnamed: 0,country_name,subdivision,type
0,China,Guangdong,subdivision
1,Japan,Tōkyō,subdivision
2,Australia,Victoria,subdivision
3,Thailand,Bangkok,subdivision
4,Thailand,Changwat Samut Songkhram,subdivision


In [297]:
import textual
from nltk import word_tokenize, bigrams, trigrams
import string
from unidecode import unidecode

In [298]:
# Flatten the lookup table into (country_name, subdivision) tuples. This is
# the list that update_countries_with_regions and context_adjustment scan.
tupled_everything = [tuple(x) for x in almost_everything[['country_name', 'subdivision']].values]

In [299]:
# Raw text of the row labelled 14. `.loc` is the label-based accessor;
# `.ix` was removed in pandas 1.0.
sample = df.loc[14, 'raw_text']

In [301]:
def check_for_subdivision(list_of_tuples, term):
    """Return, as (country, subdivision) tuples, every pair whose subdivision equals `term`.

    Pairs are returned in their original order; the result is an empty list
    when nothing matches.
    """
    hits = []
    for pair in list_of_tuples:
        country, subdivision = pair
        if subdivision == term:
            hits.append((country, subdivision))
    return hits


def adjust_probabilities(old_probability, possible_countries):
    """Re-weight a shared probability using per-country context counts.

    `possible_countries` is a sequence of (country, count) pairs. When all
    counts are equal (or the sequence is empty) there is no contextual signal
    and every country keeps `old_probability`. Otherwise a country with a
    count of zero has its probability halved, and each context clue moves the
    probability halfway towards 1.0.

    Returns a list of (country, probability) pairs in input order.
    """
    distinct_counts = {count for _, count in possible_countries}
    if len(distinct_counts) < 2:
        # no change to probabilities when there are no contextual clues
        return [(country, old_probability) for country, _ in possible_countries]

    adjusted = []
    for country, count in possible_countries:
        if count == 0:
            # only a single halving when there is no nearby context
            probability = old_probability - old_probability / 2
        else:
            probability = old_probability
            for _ in range(count):
                # close half of the remaining gap to 1.0 per context clue
                probability += (1.0 - probability) / 2
        adjusted.append((country, probability))
    return adjusted


def context_adjustment(place, possible_countries, probability, text):
    """Adjust the probability of each candidate country for an ambiguous place
    using other place names found near it in `text`.

    place: the ambiguous place name.
    possible_countries: the candidate countries for `place` (the caller in
        this file passes a set).
    probability: the starting probability shared by every candidate.
    text: the document text searched for context windows.

    Returns a dict mapping each candidate country to
    {'count': ..., 'probability': ...}. When no context country is found,
    every candidate gets count 1 and the unchanged starting probability.
    """
    contexts = get_contexts(place, text)
    new_probabilities = []
    
    # all tokens and n-grams gathered from every context window
    lump = []
    for context in contexts:
        # remove the original place from tokens
        context = textual.remove_words(context, place)
        tokens = nltk.word_tokenize(context)
        # keep alphabetic tokens only (lower-cased), then chop off the first and
        # last of them, which are likely not whole words
        tokens = [token.lower() for token in tokens if token.isalpha()][1:-1]

        # add two- and three-word phrases so multi-word place names can match
        bi_tokens = bigrams(tokens)
        tri_tokens = trigrams(tokens)
        tokens = tokens + [' '.join(t) for t in bi_tokens] + [' '.join(t) for t in tri_tokens]

        # maintain capitalization of abbreviations and state codes
        # NOTE(review): tokens were already lower-cased above, so for cased
        # alphabetic tokens `x == x.upper()` cannot be true and every token goes
        # through textual.titlecase; abbreviations such as state codes are
        # therefore not preserved. Confirm whether the earlier .lower() is intended.
#             tokens = [(lambda x: x if x == x.upper() else x.lower())(t) for t in tokens]
        tokens = [(lambda x: x if x == x.upper() else textual.titlecase(x))(t) for t in tokens]
        lump.extend(tokens)

    # check whether contextual token is a country subdivision
    context_countries = []
    for token in lump:
        context_countries.extend([country for country, subdivision in tupled_everything if subdivision == token])

    # use the existence of contextual countries to clarify ambiguous countries
    # if you don't take the set then you end up with false positives from multiple copies of same wrong country
    # (because of the set, each context country's count is at most 1)
    if context_countries:
        context_count = Counter(set(context_countries))
#                 print('Counts for each context-country are {}'.format(context_count))
        ambiguous_country_counts = zip(possible_countries, map(lambda x: context_count[x], possible_countries))
#                 print('Counts for ambiguous countries are {}'.format(ambiguous_country_counts))
        new_probabilities.extend(adjust_probabilities(probability, ambiguous_country_counts))

    # combine multiple contexts into a single count and probability per country 
    # NOTE(review): new_probabilities is extended at most once above, so with
    # unique candidates each country appears once and `count` is 1.
    dict_ = {}
    if new_probabilities:
        country_set = {i[0] for i in new_probabilities}
        for country in country_set:
            probs = [i[1] for i in new_probabilities if i[0] == country]
            count = len(probs)
            # rebinds the `probability` parameter; the else branch below is not
            # reached on this path, so the original value is no longer needed
            probability = probs.pop(0)
            if probs:
                for i in probs:
                    probability = independent_either_probability(probability, i)
            dict_[country] = {'count': count, 'probability': probability}
    else:
        # no contextual evidence: every candidate keeps the starting probability
        for country in possible_countries:
            dict_[country] = {'count': 1, 'probability': probability}
    return dict_


def independent_either_probability(oldp, newp):
    """Probability that at least one of two independent events occurs.

    Computed as 1 - (1 - oldp) * (1 - newp).
    """
    return 1 - (1 - oldp) * (1 - newp)


def _merge_country_evidence(found_countries, country, count, probability):
    """Fold one new (count, probability) observation for `country` into `found_countries` in place."""
    if country in found_countries:
        priors = found_countries[country]
        count = priors['count'] + count
        probability = independent_either_probability(priors['probability'], probability)
    found_countries[country] = {'count': count, 'probability': probability}


def update_countries_with_regions(entities, found_countries, text):
    """Add countries implied by sub-national place names to `found_countries`.

    Parameters
    ----------
    entities : iterable of (text, label) pairs
        Named entities; only those labelled LOCATION are looked up in the
        module-level `tupled_everything` list.
    found_countries : dict
        ``{country: {'count': ..., 'probability': ...}}``; updated in place
        and also returned.
    text : str
        Document text, used to find context around ambiguous places.

    Returns
    -------
    (found_countries, ambiguous_locations)
        `ambiguous_locations` maps each place that belongs to more than one
        country to ``{'possible_countries': set_of_countries}``.
    """
    ambiguous_locations = {}

    matches = []
    for entity in {i[0] for i in entities if i[1] == 'LOCATION'}:
        matches.extend(check_for_subdivision(tupled_everything, entity))
    if not matches:
        return found_countries, ambiguous_locations

    # Standardise each distinct country once: every call re-scans the error
    # dictionary, and many matches share the same country.
    standardized = {
        country: standardize_country_name(country)
        for country in {country for country, _ in matches}
    }
    matches = [(standardized[country], subdivision) for country, subdivision in matches]

    # Number of distinct countries that each matched place can belong to.
    countries_per_place = Counter(subdivision for _, subdivision in set(matches))
    for place, count in countries_per_place.items():
        possible_countries = [country for country, subdivision in matches if subdivision == place]
        if count == 1:
            # Only one country has this place. 0.8 rather than 1.0 corrects
            # for imperfect entity parsing.
            _merge_country_evidence(found_countries, possible_countries[0], len(possible_countries), 0.8)
        else:
            # Several countries share this place: start from a uniform prior
            # and let nearby context shift it.
            possible_countries = set(possible_countries)
            new_probabilities = context_adjustment(place, possible_countries, 1.0 / count, text)
            ambiguous_locations[place] = {'possible_countries': possible_countries}
            for country in possible_countries:
                adjusted = new_probabilities[country]
                _merge_country_evidence(found_countries, country, adjusted['count'], adjusted['probability'])
    return found_countries, ambiguous_locations


def get_contexts(term, text, window=60):
    """Return the windows of `text` surrounding each occurrence of `term`.

    Parameters
    ----------
    term : str
        The (ambiguous) term to look for.
    text : str
        The text to search.
    window : int, default 60
        Number of characters kept on each side of every position yielded by
        textual.find_all(text, term) (presumably the start offset of each
        occurrence; confirm against `textual`).

    Returns
    -------
    list of str
        One slice of `text` per position, clipped to the bounds of `text`.
    """
    text_length = len(text)
    return [
        text[max(index - window, 0):min(index + window, text_length)]
        for index in textual.find_all(text, term)
    ]


In [305]:
# Parse the row labelled 8. `.loc` is the label-based accessor; `.ix` was
# removed in pandas 1.0.
# NOTE(review): parse_countries is defined in a later cell, so this cell fails
# on a fresh top-to-bottom run.
parse_countries(df.loc[8])

({u'Australia': {'count': 1, 'probability': 0.5},
  u'Nigeria': {'count': 3, 'probability': 1.0},
  u'Portugal': {'count': 1, 'probability': 0.25},
  u'United States': {'count': 16, 'probability': 1.0}},
 {u'Lisbon': {'possible_countries': {u'Portugal', u'United States'}},
  u'Maryland': {'possible_countries': {u'Australia', u'United States'}}})

In [287]:
def parse_countries(row):
    """Extract countries for one document row.

    Reads `row.entities`, `row.title`, `row.toc_subject`, `row.topics` and
    `row.raw_text`. Countries named directly among the entities are found
    first, then extended with countries implied by regions, using the
    title, subject, topics and raw text (joined by newlines) as context.

    Returns the (countries, ambiguous_locations) tuple produced by
    update_countries_with_regions.
    """
    found = get_countries(row.entities)
    context_text = '\n'.join([row.title, row.toc_subject, ' '.join(row.topics), row.raw_text])
    return update_countries_with_regions(row.entities, found, context_text)


In [281]:
# Smoke-test the parser on the rows labelled 0 through 20 (label slices with
# `.loc` include both ends; `.ix` was removed in pandas 1.0).
sample = df.loc[0:20]
for row in sample.iterrows():
    # iterrows yields (index label, row Series) pairs.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('***')
    print(row[0])
    print(parse_countries(row[1]))

***
0
[0.1377551020408163, 0.4375, 1.0, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 1.0, 0.4375, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163]
[0.1377551020408163, 0.4375, 1.0, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 1.0, 0.4375, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163, 0.1377551020408163]
***
1
[1.0]
[1.0]
***
2
[0.15972222222222232, 0.826388888888889, 0.15972222222222232, 0.9722222222222222, 0.18999999999999995, 0.15972222222222232, 1.0, 0.18999999999999995, 0.18999999999999995]
[0.15972222222222232, 0.826388888888889, 0.15972222222222232, 0.9722222222222222, 0.18999999999999995, 0.15972222222222232, 1.0, 0.18999999999999995, 0.18999999999999995]
***
3
[0.4375, 1.0, 1.0, 1.0, 0.4375, 1.0]
[0.4375, 1.0, 1.0, 1.0, 0.4375, 1.0]
***
4
[0.99999

In [641]:
# Run the parser over every row of df, updating a progress bar as it goes.
pbar = ProgressBar(maxval=df.shape[0]).start()
# one (countries, ambiguous_locations) tuple per row, in iteration order
countries = []
for ix, row in enumerate(df.iterrows()):
    # iterrows yields (index label, row Series); only the Series is parsed
    countries.append(parse_countries(row[1]))
    pbar.update(ix)
pbar.finish()

In [18]:
# Look up the row(s) whose raw_text_url equals this Federal Register URL.
df[df.raw_text_url == 'https://www.federalregister.gov/articles/text/raw_text/201/231/447.txt']

Unnamed: 0,raw_text_url,title,toc_subject,topics,raw_text,entities
14,https://www.federalregister.gov/articles/text/...,Fresh Garlic From the People's Republic of Chi...,"Antidumping Duty New Shipper Reviews; Results,...",[],\nSUMMARY: \nThe Department of Commerce (Depar...,"[(Department of Commerce ( Department, ORGANIZ..."


In [307]:
# Show the raw text of the row labelled 14. `.loc` is the label-based
# accessor; `.ix` was removed in pandas 1.0.
df.loc[14, 'raw_text']

u"\nSUMMARY: \nThe Department of Commerce (Department) has determined that a request for a new shipper review (NSR) under the antidumping duty order on fresh garlic from the People's Republic of China (PRC) meets the statutory and regulatory requirements for initiation. The period of review (POR) is November 1, 2011, through October 31, 2012. \nDATES: \nEffective Date: January 2, 2012. \nFOR FURTHER INFORMATION CONTACT: \nLingjun Wang, AD/CVD Operations, Office 6, Import Administration, International Trade Administration, U.S. Department of Commerce, 14th Street and Constitution Avenue NW., Washington, DC 20230; telephone: (202) 482-2316. \nSUPPLEMENTARY INFORMATION: \nBackground \nThe Department published the antidumping duty order on fresh garlic from the PRC in the Federal Register on November 16, 1994. 1 \nOn November 27, 2012, the Department received a timely NSR request from Shijiazhuang Goodman Trading Co., Ltd. (Goodman) in accordance with section 751(a)(2)(B)(i) of the Tariff 

In [40]:
# Run both extraction steps by hand on the row labelled 14, using only its
# raw text as context. `.loc` is the label-based accessor; `.ix` was removed
# in pandas 1.0.
# NOTE(review): this rebinds the module-level name `countries`.
countries = get_countries(df.loc[14, 'entities'])
update_countries_with_regions(df.loc[14, 'entities'], countries, df.loc[14, 'raw_text'])

({u'China': {'count': 2, 'probability': 1.0},
  u'Colombia': {'count': 1, 'probability': 0.25},
  u'Russian Federation': {'count': 1, 'probability': 0.16666666666666666},
  u'United Kingdom': {'count': 1, 'probability': 0.25},
  u'United States': {'count': 9, 'probability': 1.0}},
 {u'China': {'possible_countries': [u'United States',
    u'Russian Federation',
    u'China']},
  u'DC': {'possible_countries': [u'United States', u'Colombia']},
  u'Washington': {'possible_countries': [u'United States',
    u'United Kingdom']}})

In [None]:
# NOTE(review): unexecuted cell holding a literal dict. It has the same shape
# as the country results shown above and is presumably a result pasted in for
# comparison; confirm, or delete the cell.
{u'China': {'count': 1, 'probability': 0.16666666666666666},
 u'Colombia': {'count': 1, 'probability': 0.25},
 u'Russian Federation': {'count': 1, 'probability': 0.16666666666666666},
 u'United Kingdom': {'count': 1, 'probability': 0.25},
 u'United States': {'count': 4, 'probability': 0.9999674479166667}}