In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from multiprocessing import Pool
from progressbar import ProgressBar
from unidecode import unidecode
import nltk
from nltk.tag.stanford import StanfordNERTagger
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree
import string
import pycountry
import jellyfish
import csv
from collections import Counter
import re

In [2]:
df = pd.read_pickle('docs_df')

In [3]:
sample = df.raw_text.ix[8]

In [5]:
st = StanfordNERTagger('/Users/amangum/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz', '/Users/amangum/stanford-ner/stanford-ner.jar')

def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag

    return bio_tagged_sent


def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree

In [132]:
def find_entities(text):
    # stanford tagger takes 2.45 seconds vs 2.38 seconds for standard tagger
    
#     tokens = [w for w in nltk.word_tokenize(text) if w.isalpha()]
#     tokens = nltk.word_tokenize(unidecode(text).translate(None, string.punctuation))
#     tokens = text.split()
    tokens = nltk.word_tokenize(text)
    
    places = []

    # stanford tagger
    ne_tagged_sent = st.tag(tokens)
    ne_tree = stanfordNE2tree(ne_tagged_sent)  
    for ne in ne_tree:
        if isinstance(ne, Tree): # If subtree is a noun chunk, i.e. NE != "O"
            if ne.label() in ['LOCATION', 'PERSON', 'ORGANIZATION']:
#             if ne.label() in ['LOCATION']:
                ne_label = ne.label()
                ne_string = u' '.join([token for token, pos in ne.leaves()])
                places.append([ne_string, ne_label])
                
    c = Counter((entity, label) for entity, label in places)
    return {(entity, label, count) for (entity, label), count in c.items()}

In [133]:
find_entities(sample)

{(u'AECA', u'ORGANIZATION', 1),
 (u'BIS', u'ORGANIZATION', 3),
 (u'Bernard Kritzer', u'PERSON', 1),
 (u'Bureau of Industry and Security', u'ORGANIZATION', 1),
 (u"Bureau of Industry and Security 's Office of Exporter Services",
  u'ORGANIZATION',
  1),
 (u'Commerce for Industry', u'ORGANIZATION', 1),
 (u'Courtland Place', u'LOCATION', 1),
 (u'EAA', u'ORGANIZATION', 1),
 (u'Elkton', u'LOCATION', 1),
 (u'Emenike Charles Nwankwoala', u'PERSON', 2),
 (u'FCI', u'ORGANIZATION', 1),
 (u'Federal Correctional Institution', u'ORGANIZATION', 1),
 (u'International Emergency Economic Powers Act', u'ORGANIZATION', 1),
 (u'Laurel', u'ORGANIZATION', 1),
 (u'Lisbon', u'LOCATION', 1),
 (u'Maryland', u'LOCATION', 1),
 (u'Nigeria', u'LOCATION', 2),
 (u'Nwankwoala', u'LOCATION', 5),
 (u'Nwankwoala', u'PERSON', 8),
 (u'Office of Exporter Services', u'ORGANIZATION', 1),
 (u'U.S. Department of State', u'ORGANIZATION', 1),
 (u'U.S. District Court', u'ORGANIZATION', 1),
 (u'United States', u'LOCATION', 12),
 (u

In [125]:
c = Counter([(entity, label) for entity, label in t])
set((entity, label, count) for (entity, label), count in c.items())

{(u'AECA', u'ORGANIZATION', 1),
 (u'BIS', u'ORGANIZATION', 3),
 (u'Bernard Kritzer', u'PERSON', 1),
 (u'Bureau of Industry and Security', u'ORGANIZATION', 1),
 (u"Bureau of Industry and Security 's Office of Exporter Services",
  u'ORGANIZATION',
  1),
 (u'Commerce for Industry', u'ORGANIZATION', 1),
 (u'Courtland Place', u'LOCATION', 1),
 (u'EAA', u'ORGANIZATION', 1),
 (u'Elkton', u'LOCATION', 1),
 (u'Emenike Charles Nwankwoala', u'PERSON', 2),
 (u'FCI', u'ORGANIZATION', 1),
 (u'Federal Correctional Institution', u'ORGANIZATION', 1),
 (u'International Emergency Economic Powers Act', u'ORGANIZATION', 1),
 (u'Laurel', u'ORGANIZATION', 1),
 (u'Lisbon', u'LOCATION', 1),
 (u'Maryland', u'LOCATION', 1),
 (u'Nigeria', u'LOCATION', 2),
 (u'Nwankwoala', u'LOCATION', 5),
 (u'Nwankwoala', u'PERSON', 8),
 (u'Office of Exporter Services', u'ORGANIZATION', 1),
 (u'U.S. Department of State', u'ORGANIZATION', 1),
 (u'U.S. District Court', u'ORGANIZATION', 1),
 (u'United States', u'LOCATION', 12),
 (u

In [None]:
# check whether any part of an entity string relates to a country

In [4]:
def remove_non_ascii(s): return "".join(i for i in s if ord(i)<128)
 
def fuzzy_match(s1, s2, max_dist=.8):
    return jellyfish.jaro_distance(s1, s2) >= max_dist

In [7]:
def title_except(s, exceptions=['a', 'an', 'of', 'the', 'is']):
    word_list = re.split(' ', s)       #re.split behaves as expected
    final = [word_list[0].capitalize()]
    for word in word_list[1:]:
        final.append(word in exceptions and word or word.capitalize())
    return " ".join(final)


def correct_country_mispelling(s):
    with open("ISO3166ErrorDictionary.csv", "rb") as info:
        reader = csv.reader(info)
        for row in reader:
            try:
                if unidecode(s.decode('utf8')).lower() == unidecode(row[0].decode('utf8')).lower():
                    return row[2]
            except:
                #fails on unicode string
                try:
                    if unidecode(s).lower() == row[0].lower():
                        return row[2]
                except:
                    pass
            try:
                if s.lower == unicode(row[0], 'utf8'):
                    return row[2]
            except:
                try:
                    # error on string
                    if s.lower() == row[0].lower():
                        return row[2]
                except:
                    pass
            if s.lower() == remove_non_ascii(row[0]).lower():
                return row[2]
    return s


def is_a_country(s):
    s = correct_country_mispelling(s)
    try:
        pycountry.countries.get(name=s)
        return True
    except:
        try:
            pycountry.countries.get(name=title_except(s))
            return True
        except KeyError, e:
            return False

    
def set_countries(places):
    countries = [correct_country_mispelling(place) for place in places if is_a_country(place)]
    country_mentions = Counter(countries).most_common()
    return country_mentions

In [8]:
cities = pd.DataFrame.from_csv('GeoLite2-City-Locations.csv', index_col=None)
cities.head()

Unnamed: 0,geoname_id,continent_code,continent_name,country_iso_code,country_name,subdivision_iso_code,subdivision_name,city_name,metro_code,time_zone
0,1861060,AS,Asia,JP,Japan,,,,,Asia/Tokyo
1,1809858,AS,Asia,CN,China,44.0,Guangdong,Guangzhou,,Asia/Shanghai
2,1850147,AS,Asia,JP,Japan,13.0,Tōkyō,Tokyo,,Asia/Tokyo
3,1814991,AS,Asia,CN,China,,,,,
4,2077456,OC,Oceania,AU,Australia,,,,,


In [9]:
s1 = cities[['country_name', 'subdivision_name']].dropna().rename(columns={'subdivision_name':'subdivision'})
s1['type'] = 'subdivision'
s2 = cities[['country_name', 'subdivision_iso_code']].dropna().rename(columns={'subdivision_iso_code':'subdivision'})
s2['type'] = 'subdivision_code'
s3 = cities[['country_name', 'city_name']].dropna().rename(columns={'city_name':'subdivision'})
s3['type'] = 'city'
alles = pd.concat([s1,s2,s3], ignore_index=True).drop_duplicates()


def update_labels_with_regions(places, labels):
    subs = pd.DataFrame()
    for place in places:
        a = alles[alles.subdivision == place]
        if not a.empty:
            subs = pd.concat([subs, a], ignore_index=True)
    
    if not subs.empty:
        no_dupes = subs.drop_duplicates(['country_name', 'subdivision'])
        for value_count in no_dupes.subdivision.value_counts().iteritems():
            probability = 1.0 / value_count[1]
            if probability == 1.0:
                # only one country exists for any probability
                probability = 0.8 # correcting for imperfect entity parsing
                possible_countries = subs[subs.subdivision == value_count[0]].country_name.tolist()
                country = possible_countries[0]
                if country in labels:
                    priors = labels[country]
                    new_count = priors['count'] + len(possible_countries)
                    probability_non_occurrence = (1-priors['probability']) * (1-probability)
                    new_probability = 1 - probability_non_occurrence
                    labels.update({unicode(country, 'utf8'): {'count': new_count, 'probability': new_probability}})
                else:
                    labels.update({unicode(country, 'utf8'): {'count': len(possible_countries), 'probability': probability}})
            else:
                # multiple countries exist for a single subdivision
                possible_countries = no_dupes[no_dupes.subdivision == value_count[0]].country_name.tolist()
                for country in possible_countries:
                    if country in labels:
                        priors = labels[country]
                        new_count = priors['count'] + 1
                        probability_non_occurrence = (1-priors['probability']) * (1-probability)
                        new_probability = 1 - probability_non_occurrence
                        labels.update({unicode(country, 'utf8'): {'count': new_count, 'probability': new_probability}})
                    else:
                        labels.update({unicode(country, 'utf8'): {'count': 1, 'probability': probability}})
    return labels

In [10]:
def get_labels(text):
    places = find_entities(text)
    country_mentions = set_countries(places)
    labels = {i[0]: {'probability': 1.0, 'count': i[1]} for i in country_mentions}
    labels = update_labels_with_regions(places, labels)
    return labels

get_labels(sample)

  result = lib.scalar_compare(x, y, op)


{u'Nigeria': {'count': 2, 'probability': 1.0},
 u'United States': {'count': 12, 'probability': 1.0}}

In [12]:
pool = Pool()
result = pool.map(get_labels, df.raw_text.tolist())
pool.close()

# countries = []
# progress = ProgressBar()
# for i in progress(df.raw_text.tolist()):
#     places = get_labels(i)
#     countries.append(places)

In [13]:
df['countries_stanford_alpha'] = result

In [14]:
df.to_pickle('parsed_df')

In [36]:
df.countries_stanford_alpha.tolist()

[{u'United States': {'count': 3, 'probability': 1.0}},
 {'United States': {'count': 1, 'probability': 1.0}},
 {u'United States': {'count': 5, 'probability': 1.0}},
 {u'Georgia': {'count': 1, 'probability': 1.0},
  u'Kyrgyzstan': {'count': 1, 'probability': 1.0},
  'Russian Federation': {'count': 1, 'probability': 1.0},
  u'United States': {'count': 6, 'probability': 1.0}},
 {u'Australia': {'count': 1, 'probability': 0.2},
  u'Brazil': {'count': 1, 'probability': 0.2},
  u'Ireland': {'count': 1, 'probability': 0.2},
  u'South Africa': {'count': 1, 'probability': 0.2},
  u'United States': {'count': 22, 'probability': 0.999744}},
 {},
 {},
 {u'Brazil': {'count': 1, 'probability': 0.8},
  u'Montenegro': {'count': 1, 'probability': 1.0},
  u'South Sudan': {'count': 1, 'probability': 1.0}},
 {u'Nigeria': {'count': 2, 'probability': 1.0},
  u'United States': {'count': 12, 'probability': 1.0}},
 {u'Brazil': {'count': 1, 'probability': 0.3333333333333333},
  u'Costa Rica': {'count': 1, 'probabi

In [11]:
# take care of city followed by state

In [381]:
def add_subdivision_tags(tagged_token):
    state_list = ['OH']
    if tagged_token[0] in state_list:
#         return (tagged_token[0], u'LOCATION') # for stanford
        return '(GPE ' + tagged_token[0] + '/NNP)' # for regular
    else:
        return tagged_token

text = nltk.word_tokenize(sample)
pos = nltk.pos_tag(text)
nes = nltk.ne_chunk(pos)
nes = map(add_subdivision_tags, nes)

In [373]:
for index, subtree in enumerate(nes):
    pass
#     if type(subtree) == nltk.tree.Tree:
#         traverse(subtree)
#     elif type(subtree) == tuple:
#         newVal = (subtree[0], subtree[1].lower())
#         subtree = newVal
#         tree[index] = subtree

state_list = ['OH']
for i in nes:
    print i

(PERSON Order/NNP)
(u'Denying', 'NNP')
(u'Export', 'NNP')
(u'Privileges', 'NNP')
(u'On', 'NNP')
(u'January', 'NNP')
(u'3', 'CD')
(u',', ',')
(u'2011', 'CD')
(u',', ',')
(u'in', 'IN')
(u'the', 'DT')
(GPE U.S./NNP)
(ORGANIZATION District/NNP Court/NNP)
(u',', ',')
(GPE District/NNP)
(u'of', 'IN')
(GPE Maryland/NNP)
(u',', ',')
(PERSON Emenike/NNP Charles/NNP Nwankwoala/NNP)
(u'(', 'NNP')
(u'\u201cNwankwoala\u201d', 'NNP')
(u')', 'NNP')
(u'was', 'VBD')
(u'convicted', 'VBN')
(u'of', 'IN')
(u'violating', 'NN')
(u'Section', 'NN')
(u'38', 'CD')
(u'of', 'IN')
(u'the', 'DT')
(ORGANIZATION Arms/NNP Export/NNP Control/NNP Act/NNP)
(u'(', 'NNP')
(u'22', 'CD')
(u'U.S.C', 'NNP')
(u'.', '.')
(u'2778', 'CD')
(u'(', 'CD')
(u'2000', 'CD')
(u')', 'CD')
(u')', 'CD')
(u'(', 'CD')
(u'\u201cAECA\u201d', 'JJ')
(u')', 'NN')
(u'and', 'CC')
(u'the', 'DT')
(ORGANIZATION
  International/NNP
  Emergency/NNP
  Economic/NNP
  Powers/NNP
  Act/NNP)
(u'(', 'NNP')
(u'50', 'CD')
(u'U.S.C', 'NNP')
(u'.', '.')
(u'1701', 'C