In [1]:
%load_ext autoreload
%autoreload 2

import csv
import os
import re
import string
from collections import Counter
from multiprocessing import Pool

import jellyfish
import nltk
import pandas as pd
import pycountry
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tag.stanford import StanfordNERTagger
from nltk.tree import Tree
from progressbar import ProgressBar
from unidecode import unidecode

In [2]:
# Load the pickled DataFrame of documents; later cells read its `raw_text` column.
# NOTE(review): unpickling can execute arbitrary code -- only load 'docs_df' from a trusted source.
df = pd.read_pickle('docs_df')

In [3]:
# Use the document at position 8 as the running example for the cells below.
# `.ix` is deprecated (and removed in current pandas); `.iloc` selects by position,
# which matches entry 8 of the per-document results shown later in the notebook.
sample = df.raw_text.iloc[8]

In [4]:
def remove_non_ascii(s):
    """Return ``s`` with every character whose code point is 128 or above removed."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)
 
def fuzzy_match(s1, s2, max_dist=.8):
    """Return True when jellyfish's Jaro score for ``s1`` and ``s2`` reaches ``max_dist``.

    The comparison is ``score >= max_dist``, so despite its name ``max_dist``
    acts as a lower bound on the score. The parameter name is kept for
    backward compatibility with existing callers.
    """
    # jellyfish renamed ``jaro_distance`` to ``jaro_similarity``; use whichever
    # name the installed version provides.
    jaro = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance
    return jaro(s1, s2) >= max_dist

In [5]:
# Root of the local Stanford NER distribution. Set the STANFORD_NER_HOME environment
# variable to run the notebook on another machine; the default is the original path.
STANFORD_NER_HOME = os.environ.get('STANFORD_NER_HOME', '/Users/amangum/stanford-ner')

# Tagger built from the 3-class model file and the NER jar found under STANFORD_NER_HOME.
st = StanfordNERTagger(
    os.path.join(STANFORD_NER_HOME, 'classifiers', 'english.all.3class.distsim.crf.ser.gz'),
    os.path.join(STANFORD_NER_HOME, 'stanford-ner.jar'),
)

def stanfordNE2BIO(tagged_sent):
    """Convert ``(token, tag)`` pairs into BIO-prefixed ``(token, tag)`` pairs.

    Tokens tagged ``"O"`` are passed through unchanged. Any other tag gets an
    ``"I-"`` prefix when it repeats the tag of the previous token, and a
    ``"B-"`` prefix otherwise (start of a sentence, after an ``"O"`` token, or
    directly after an entity with a different tag).
    """
    bio = []
    previous = "O"
    for word, label in tagged_sent:
        if label != "O":
            # Same tag as the previous token continues the entity; anything else starts one.
            prefix = "I-" if label == previous else "B-"
            bio.append((word, prefix + label))
        else:
            bio.append((word, label))
        previous = label
    return bio


def stanfordNE2tree(ne_tagged_sent):
    """Build a chunk tree from ``(token, tag)`` pairs produced by the NER tagger.

    The pairs are converted to BIO tags with ``stanfordNE2BIO``, the tokens are
    POS-tagged with ``pos_tag``, and the resulting ``(token, pos, ne)`` triples
    are handed to ``conlltags2tree``.

    An empty input yields the tree built from an empty triple list instead of
    failing while unpacking ``zip(*[])``.
    """
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    if not bio_tagged_sent:
        # Nothing to unzip: zip(*[]) produces no sequences to unpack.
        return conlltags2tree([])

    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for _, pos in pos_tag(sent_tokens)]

    sent_conlltags = list(zip(sent_tokens, sent_pos_tags, sent_ne_tags))
    return conlltags2tree(sent_conlltags)

In [75]:
def find_entities(text):
    """Return the named entities found in ``text`` as ``[entity_text, label]`` pairs.

    Only entities labelled LOCATION, PERSON or ORGANIZATION are returned, in
    document order; repeated mentions are kept.

    Tokenisation keeps only purely alphabetic tokens from ``nltk.word_tokenize``.
    Other strategies that were tried: keeping tokens with ``len(w) > 1``,
    stripping punctuation before tokenising, unfiltered ``word_tokenize``, and
    ``text.split()``.
    """
    # Timing note: stanford tagger takes 2.45 seconds vs 2.38 seconds for standard tagger.
    words = [tok for tok in nltk.word_tokenize(text) if tok.isalpha()]

    # Tag with the Stanford tagger, then group consecutive tagged tokens into subtrees.
    tree = stanfordNE2tree(st.tag(words))

    wanted_labels = ['LOCATION', 'PERSON', 'ORGANIZATION']
    entities = []
    for node in tree:
        # Tokens outside any entity are not Tree instances; only subtrees are entities.
        if not isinstance(node, Tree):
            continue
        if node.label() not in wanted_labels:
            continue
        surface = u' '.join([tok for tok, _ in node.leaves()])
        entities.append([surface, node.label()])

    return entities

In [74]:
# Smoke-test the entity extractor on the sample document; the output lists [entity_text, label] pairs.
find_entities(sample)

[[u'U.S. District Court', u'ORGANIZATION'],
 [u'Maryland', u'LOCATION'],
 [u'Emenike Charles Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'PERSON'],
 [u'United States', u'LOCATION'],
 [u'Nigeria', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'United States', u'LOCATION'],
 [u'Nigeria', u'LOCATION'],
 [u'United States Department of Commerce', u'ORGANIZATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'LOCATION'],
 [u'U.S. Department of State', u'ORGANIZATION'],
 [u'Office of Exporter Services', u'ORGANIZATION'],
 [u'International Emergency Economic Powers Act', u'ORGANIZATION'],
 [u'EAA', u'ORGANIZATION'],
 [u"Bureau of Industry and Security 's Office of Exporter Services",
  u'ORGANIZATION'],
 [u'Bureau of Industry and Security', u'ORGANIZATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'AECA', u'ORGANIZATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'BIS', u'ORGANIZATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'BIS', u'ORGANIZATION'],
 [u'BIS', u'ORGANIZATION'],

In [None]:
alpha -- tokens from nltk.word_tokenize(text), keeping only those where w.isalpha()


word_tokenizer -- all tokens from nltk.word_tokenize(text), unfiltered
[[u'U.S. District Court', u'ORGANIZATION'],
 [u'Maryland', u'LOCATION'],
 [u'Emenike Charles Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'PERSON'],
 [u'United States', u'LOCATION'],
 [u'Nigeria', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'United States', u'LOCATION'],
 [u'Nigeria', u'LOCATION'],
 [u'United States Department of Commerce', u'ORGANIZATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'LOCATION'],
 [u'U.S. Department of State', u'ORGANIZATION'],
 [u'Office of Exporter Services', u'ORGANIZATION'],
 [u'International Emergency Economic Powers Act', u'ORGANIZATION'],
 [u'EAA', u'ORGANIZATION'],
 [u"Bureau of Industry and Security 's Office of Exporter Services",
  u'ORGANIZATION'],
 [u'Bureau of Industry and Security', u'ORGANIZATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'AECA', u'ORGANIZATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'BIS', u'ORGANIZATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'BIS', u'ORGANIZATION'],
 [u'BIS', u'ORGANIZATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'PERSON'],
 [u'Emenike Charles Nwankwoala', u'PERSON'],
 [u'FCI', u'ORGANIZATION'],
 [u'Elkton', u'LOCATION'],
 [u'Federal Correctional Institution', u'ORGANIZATION'],
 [u'Lisbon', u'LOCATION'],
 [u'Courtland Place', u'LOCATION'],
 [u'Laurel', u'ORGANIZATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'Commerce for Industry', u'ORGANIZATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'Bernard Kritzer', u'PERSON']]

split -- tokens from text.split()
Out[69]:
[[u'U.S.', u'LOCATION'],
 [u'Charles Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'PERSON'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'Department of State', u'ORGANIZATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'United States', u'LOCATION'],
 [u'Nigeria', u'LOCATION'],
 [u'United States Department of Commerce,', u'ORGANIZATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'Nwankwoala', u'ORGANIZATION'],
 [u'U.S. Department of State Debarred List. Section', u'ORGANIZATION'],
 [u'International Emergency Economic Powers Act (50 U.S.C.', u'ORGANIZATION'],
 [u'U.S.C.', u'ORGANIZATION'],
 [u'U.S.C.', u'LOCATION'],
 [u'U.S.C.', u'ORGANIZATION'],
 [u"Bureau of Industry and Security's Office of Exporter Services",
  u'ORGANIZATION'],
 [u'Bureau of Industry and Security', u'ORGANIZATION'],
 [u'AECA', u'ORGANIZATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'Nwankwoala', u'PERSON'],
 [u'Charles Nwankwoala,', u'PERSON'],
 [u'Courtland', u'PERSON'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States,', u'LOCATION'],
 [u'United States;', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States;', u'LOCATION'],
 [u'United States', u'LOCATION'],
 [u'United States.', u'LOCATION'],
 [u'Nwankwoala', u'LOCATION'],
 [u'Commerce for Industry', u'ORGANIZATION'],
 [u'Bernard', u'PERSON']]

stan split -- Stanford tagger, tokens from text.split()
{u'Nigeria': {'count': 1, 'probability': 1.0},
 'United States': {'count': 11, 'probability': 1.0}}
 
stan word_toke -- Stanford tagger, unfiltered tokens from nltk.word_tokenize(text)
{'Australia': {'count': 1, 'probability': 0.5},
 u'Nigeria': {'count': 2, 'probability': 1.0},
 'Portugal': {'count': 1, 'probability': 0.5},
 u'United States': {'count': 16, 'probability': 1.0}}

stan translate -- Stanford tagger, punctuation stripped with translate() before nltk.word_tokenize
 {u'Nigeria': {'count': 2, 'probability': 1.0},
 'Portugal': {'count': 1, 'probability': 0.5},
 u'United States': {'count': 13, 'probability': 1.0}}
 
 stan len -- Stanford tagger, word_tokenize tokens with len(w) > 1
 {u'Nigeria': {'count': 2, 'probability': 1.0},
 'Portugal': {'count': 1, 'probability': 0.5},
 u'United States': {'count': 13, 'probability': 1.0}}
 
 stan alpha -- Stanford tagger, word_tokenize tokens where w.isalpha()
 {u'Nigeria': {'count': 2, 'probability': 1.0},
 u'United States': {'count': 12, 'probability': 1.0}}
 
 reg split -- regular (NLTK) tagger, tokens from text.split()
 {'Hungary': {'count': 2, 'probability': 1.0},
 u'Nigeria': {'count': 1, 'probability': 1.0},
 'United States': {'count': 17, 'probability': 1.0}}
 
 reg word toke -- regular (NLTK) tagger, unfiltered tokens from nltk.word_tokenize(text)
 {'Australia': {'count': 1, 'probability': 0.5},
 'Hungary': {'count': 2, 'probability': 0.8},
 u'Nigeria': {'count': 2, 'probability': 1.0},
 'Portugal': {'count': 1, 'probability': 0.5},
 'United States': {'count': 23, 'probability': 1.0}}
 
 reg trans -- regular (NLTK) tagger, punctuation stripped with translate() before nltk.word_tokenize
 {'Hungary': {'count': 2, 'probability': 0.8},
 u'Nigeria': {'count': 2, 'probability': 1.0},
 'Portugal': {'count': 1, 'probability': 0.5},
 u'United States': {'count': 22, 'probability': 1.0}}
 
 reg len -- regular (NLTK) tagger, word_tokenize tokens with len(w) > 1
 {'Hungary': {'count': 2, 'probability': 0.8},
 u'Nigeria': {'count': 2, 'probability': 1.0},
 'Portugal': {'count': 1, 'probability': 0.5},
 'United States': {'count': 23, 'probability': 1.0}}
 
 reg alpha -- regular (NLTK) tagger, word_tokenize tokens where w.isalpha()
 {'Hungary': {'count': 2, 'probability': 0.8},
 u'Nigeria': {'count': 2, 'probability': 1.0},
 u'United States': {'count': 20, 'probability': 1.0}}

In [7]:
def title_except(s, exceptions=['a', 'an', 'of', 'the', 'is']):
    """Title-case ``s`` word by word, leaving the ``exceptions`` words in lower case.

    The string is split on single spaces. The first word is always
    capitalised. Every later word is emitted unchanged if it appears verbatim
    in ``exceptions``, lower-cased if its lower-case form appears in
    ``exceptions`` (so "OF" and "Of" become "of"), and capitalised otherwise.

    ``exceptions`` is only read, never mutated, so the shared list default is safe.
    NOTE(review): the default list has no 'and', so "x and y" becomes "X And Y" --
    confirm whether that is intended for names such as "Trinidad and Tobago".
    """
    word_list = s.split(' ')
    final = [word_list[0].capitalize()]
    for word in word_list[1:]:
        lowered = word.lower()
        if word in exceptions:
            final.append(word)
        elif lowered in exceptions:
            # Match small words regardless of the case they arrive in.
            final.append(lowered)
        else:
            final.append(word.capitalize())
    return " ".join(final)


def _decode_utf8(value):
    """Return ``value`` as text, decoding byte strings as UTF-8 (invalid bytes are replaced)."""
    if isinstance(value, bytes):
        return value.decode('utf8', 'replace')
    return value


def correct_country_mispelling(s):
    """Map a country-name variant to the replacement listed in the error dictionary.

    Each row of ``ISO3166ErrorDictionary.csv`` is compared against ``s``
    case-insensitively in three ways; on the first matching row the function
    returns that row's third column:

    * both sides transliterated to ASCII with ``unidecode``;
    * both sides as they are;
    * ``s`` against the row's first column with its non-ASCII characters dropped.

    If no row matches, ``s`` is returned unchanged. Rows with fewer than three
    columns (for example blank lines) are skipped.

    NOTE(review): presumably column 0 holds the variant spelling and column 2
    the canonical name -- confirm against the CSV.
    """
    wanted = _decode_utf8(s)
    wanted_lower = wanted.lower()
    wanted_plain = unidecode(wanted).lower()

    # The Python 2 csv module needs the file in binary mode and yields byte strings.
    with open("ISO3166ErrorDictionary.csv", "rb") as info:
        for row in csv.reader(info):
            if len(row) < 3:
                continue
            known = _decode_utf8(row[0])
            if (wanted_plain == unidecode(known).lower()
                    or wanted_lower == known.lower()
                    or wanted_lower == remove_non_ascii(known).lower()):
                return row[2]
    return s


def _country_exists(name):
    """Return True when ``pycountry`` has a country with exactly this ``name``.

    Handles both lookup styles: a miss that raises ``KeyError`` and a miss
    that returns ``None``.
    """
    try:
        return pycountry.countries.get(name=name) is not None
    except KeyError:
        return False


def is_a_country(s):
    """Return True if ``s`` names a country known to ``pycountry``.

    ``s`` is first passed through ``correct_country_mispelling``. The result is
    looked up as-is and, if that misses, again after ``title_except``.
    """
    s = correct_country_mispelling(s)
    return _country_exists(s) or _country_exists(title_except(s))

    
def set_countries(places):
    """Count country mentions among ``places``.

    Every place accepted by ``is_a_country`` is normalised with
    ``correct_country_mispelling`` and tallied. Returns a list of
    ``(country, count)`` tuples as produced by ``Counter.most_common()``.
    """
    tally = Counter()
    for place in places:
        if is_a_country(place):
            tally[correct_country_mispelling(place)] += 1
    return tally.most_common()

In [8]:
# GeoLite2 location table (country, subdivision and city names per geoname id).
# `DataFrame.from_csv` is deprecated and removed in current pandas; `read_csv` is the replacement.
cities = pd.read_csv('GeoLite2-City-Locations.csv', index_col=None)
cities.head()

Unnamed: 0,geoname_id,continent_code,continent_name,country_iso_code,country_name,subdivision_iso_code,subdivision_name,city_name,metro_code,time_zone
0,1861060,AS,Asia,JP,Japan,,,,,Asia/Tokyo
1,1809858,AS,Asia,CN,China,44.0,Guangdong,Guangzhou,,Asia/Shanghai
2,1850147,AS,Asia,JP,Japan,13.0,Tōkyō,Tokyo,,Asia/Tokyo
3,1814991,AS,Asia,CN,China,,,,,
4,2077456,OC,Oceania,AU,Australia,,,,,


In [9]:
# Reshape the GeoLite2 table into one lookup of (country_name, subdivision, type) rows.
# The 'subdivision' column holds, depending on 'type', a subdivision name, a subdivision
# ISO code or a city name; rows lacking the relevant value are dropped.
s1 = (cities[['country_name', 'subdivision_name']]
      .dropna()
      .rename(columns={'subdivision_name': 'subdivision'})
      .assign(type='subdivision'))
s2 = (cities[['country_name', 'subdivision_iso_code']]
      .dropna()
      .rename(columns={'subdivision_iso_code': 'subdivision'})
      .assign(type='subdivision_code'))
s3 = (cities[['country_name', 'city_name']]
      .dropna()
      .rename(columns={'city_name': 'subdivision'})
      .assign(type='city'))

# Stack the three views and remove fully identical rows.
alles = pd.concat([s1, s2, s3], ignore_index=True).drop_duplicates()


def _merge_country_evidence(labels, country, mentions, probability):
    """Fold one piece of regional evidence for ``country`` into ``labels`` in place.

    A new country is stored with the given ``mentions`` and ``probability``.
    For a country already present, ``mentions`` is added to its count and the
    probabilities are combined as ``1 - (1 - prior) * (1 - probability)``.
    """
    # Labels are keyed by text; decode byte-string country names as UTF-8.
    key = country.decode('utf8') if isinstance(country, bytes) else country
    priors = labels.get(key, labels.get(country))
    if priors is None:
        labels[key] = {'count': mentions, 'probability': probability}
    else:
        labels[key] = {
            'count': priors['count'] + mentions,
            'probability': 1 - (1 - priors['probability']) * (1 - probability),
        }


def update_labels_with_regions(places, labels, single_country_probability=0.8):
    """Add country evidence inferred from subdivision and city names to ``labels``.

    Each entry of ``places`` is matched exactly against ``alles.subdivision``.

    * A name belonging to a single country adds that country with
      ``single_country_probability`` and a count equal to the number of
      matching rows collected for the name (repeated mentions count repeatedly).
    * A name shared by ``n`` countries adds each of them with probability
      ``1 / n`` and a count of 1.

    Parameters
    ----------
    places : iterable of place-name strings.
    labels : dict mapping country -> {'count': int, 'probability': float};
        updated in place.
    single_country_probability : probability assigned to an unambiguous match;
        below 1.0 to correct for imperfect entity parsing.

    Returns
    -------
    The same ``labels`` dict.
    """
    # Collect the matches first and concatenate once, rather than growing a frame per place.
    matches = []
    for place in places:
        hits = alles[alles.subdivision == place]
        if not hits.empty:
            matches.append(hits)
    if not matches:
        return labels

    subs = pd.concat(matches, ignore_index=True)
    no_dupes = subs.drop_duplicates(subset=['country_name', 'subdivision'])

    # Number of distinct countries each matched name belongs to.
    country_counts = no_dupes.subdivision.value_counts()
    for subdivision, n_countries in zip(country_counts.index, country_counts.tolist()):
        if n_countries == 1:
            # Unambiguous name: count every collected row, duplicates included.
            mentioned = subs[subs.subdivision == subdivision].country_name.tolist()
            _merge_country_evidence(labels, mentioned[0], len(mentioned), single_country_probability)
        else:
            # Ambiguous name: split the probability evenly between the candidate countries.
            probability = 1.0 / n_countries
            candidates = no_dupes[no_dupes.subdivision == subdivision].country_name.tolist()
            for country in candidates:
                _merge_country_evidence(labels, country, 1, probability)
    return labels

In [10]:
def get_labels(text):
    """Return ``{country: {'probability': float, 'count': int}}`` for ``text``.

    Entities tagged LOCATION are extracted from ``text``. Names that are
    countries are counted with probability 1.0, and the remaining evidence
    from subdivision and city names is merged in by
    ``update_labels_with_regions``.
    """
    # find_entities returns [entity_text, label] pairs, while the country and
    # region helpers expect plain place-name strings.
    places = [name for name, label in find_entities(text) if label == 'LOCATION']
    country_mentions = set_countries(places)
    labels = {country: {'probability': 1.0, 'count': count}
              for country, count in country_mentions}
    return update_labels_with_regions(places, labels)

# Run the full labelling pipeline on the sample document.
get_labels(sample)

  result = lib.scalar_compare(x, y, op)


{u'Nigeria': {'count': 2, 'probability': 1.0},
 u'United States': {'count': 12, 'probability': 1.0}}

In [12]:
# Label every document in parallel, one get_labels call per raw text.
pool = Pool()
try:
    result = pool.map(get_labels, df.raw_text.tolist())
finally:
    # Always release the worker processes, even when a worker raised.
    pool.close()
    pool.join()

# countries = []
# progress = ProgressBar()
# for i in progress(df.raw_text.tolist()):
#     places = get_labels(i)
#     countries.append(places)

In [13]:
# Attach the per-document label dicts computed by the pool as a new column.
df['countries_stanford_alpha'] = result

In [14]:
# Persist the annotated DataFrame to the pickle file 'parsed_df'.
df.to_pickle('parsed_df')

In [36]:
# Preview the first few label dicts instead of dumping the whole column into the output.
df.countries_stanford_alpha.head(10).tolist()

[{u'United States': {'count': 3, 'probability': 1.0}},
 {'United States': {'count': 1, 'probability': 1.0}},
 {u'United States': {'count': 5, 'probability': 1.0}},
 {u'Georgia': {'count': 1, 'probability': 1.0},
  u'Kyrgyzstan': {'count': 1, 'probability': 1.0},
  'Russian Federation': {'count': 1, 'probability': 1.0},
  u'United States': {'count': 6, 'probability': 1.0}},
 {u'Australia': {'count': 1, 'probability': 0.2},
  u'Brazil': {'count': 1, 'probability': 0.2},
  u'Ireland': {'count': 1, 'probability': 0.2},
  u'South Africa': {'count': 1, 'probability': 0.2},
  u'United States': {'count': 22, 'probability': 0.999744}},
 {},
 {},
 {u'Brazil': {'count': 1, 'probability': 0.8},
  u'Montenegro': {'count': 1, 'probability': 1.0},
  u'South Sudan': {'count': 1, 'probability': 1.0}},
 {u'Nigeria': {'count': 2, 'probability': 1.0},
  u'United States': {'count': 12, 'probability': 1.0}},
 {u'Brazil': {'count': 1, 'probability': 0.3333333333333333},
  u'Costa Rica': {'count': 1, 'probabi

In [77]:
# Build a custom Punkt sentence tokenizer trained on the raw text of every document.

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
import pickle

docs = df.raw_text.tolist()

trainer = PunktTrainer()

# Cut-off value whether a 'token' is an abbreviation.
trainer.ABBREV = 0.3

# Allows the disabling of the abbreviation penalty heuristic, which
# exponentially disadvantages words that are found at times without a
# final period.
trainer.IGNORE_ABBREV_PENALTY = False

# Upper cut-off for Mikheev's (2002) abbreviation detection algorithm.
trainer.ABBREV_BACKOFF = 5

# Minimal log-likelihood value that two tokens need to be considered
# as a collocation.
trainer.COLLOCATION = 7.88

# Minimal log-likelihood value that a token requires to be considered
# as a frequent sentence starter.
trainer.SENT_STARTER = 30

# This includes as potential collocations all word pairs where the first
# word ends in a period. It may be useful in corpora where there is a lot
# of variation that makes abbreviations like Mr difficult to identify.
trainer.INCLUDE_ALL_COLLOCS = False

# This includes as potential collocations all word pairs where the first
# word is an abbreviation. Such collocations override the orthographic
# heuristic, but not the sentence starter heuristic. This is overridden by
# INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
# and ordinals are considered.
trainer.INCLUDE_ABBREV_COLLOCS = False

# This sets a minimum bound on the number of times a bigram needs to
# appear before it can be considered a collocation, in addition to log
# likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True.
trainer.MIN_COLLOC_FREQ = 1

# Feed the documents one at a time; statistics are finalized once after the loop.
progress = ProgressBar()
for doc in progress(docs):
    trainer.train(doc, finalize=False, verbose=False)

print("Finalizing training...")
trainer.finalize_training(verbose=True)
print("Training done.")

# Save the learned parameters so a tokenizer can be rebuilt without retraining.
params = trainer.get_params()
with open('sentence_tokenizer_params.pickle', 'wb') as f:
    pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
print("Params: %s" % repr(params))

# # set custom parameters
# extra_collocations = {(u'sec', u'##number##')}
# extra_sentence_starters = {u'(##number##)'}
# extra_abbreviations = {u'U.S.C', u'usc'}

# # add in custom collocations etc
# params.collocations = params.collocations | extra_collocations
# params.sent_starters = params.sent_starters | extra_sentence_starters

tokenizer = PunktSentenceTokenizer(params)

# Save the ready-to-use tokenizer as well.
with open("sentence_tokenizer.pickle", mode='wb') as f:
    pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)


 99% (9998 of 10000) |################### | Elapsed Time: 0:17:00 ETA:  0:00:00

Finalizing training...
  Sent Starter: [40.3095] u'middleman'
  Sent Starter: [35.5613] u'treasuries'
  Sent Starter: [45.8904] u'oceana'
  Sent Starter: [33.5433] u'caryl'
  Sent Starter: [711.2582] u'second'
  Sent Starter: [73.8597] u'porites'
  Sent Starter: [92.2556] u'admiral'
  Sent Starter: [43.3309] u'reporter'
  Sent Starter: [43.8525] u'topography'
  Sent Starter: [2191.4605] u'classification'
  Sent Starter: [25291.8065] u'therefore'
  Sent Starter: [135.8454] u'passport'
  Sent Starter: [39.4044] u'jiaxing'
  Sent Starter: [1221.8135] u'example'
  Sent Starter: [42.8055] u'nedwell'
  Sent Starter: [375.5407] u'methodology'
  Sent Starter: [39.1326] u'danyang'
  Sent Starter: [530.0968] u'tianjin'
  Sent Starter: [46.4013] u'odontocete'
  Sent Starter: [49.7944] u'atf'
  Sent Starter: [39.6142] u'euphyllia'
  Sent Starter: [33.7111] u'roni'
  Sent Starter: [3826.3374] u'similarly'
  Sent Starter: [100.9561] u'ntis'
  Sent Starter: [41.6197] u'ntia'
  Sent Starter: [34.9665]

100% (10000 of 10000) |###################| Elapsed Time: 0:17:00 Time: 0:17:00


In [11]:
# TODO: handle a city name that is followed by its state abbreviation (e.g. a US city followed by "OH")

In [381]:
def add_subdivision_tags(tagged_token, state_list=('OH',)):
    """Mark a token as a GPE when its first element is a known state abbreviation.

    Parameters
    ----------
    tagged_token : an item whose first element is compared against
        ``state_list``, e.g. a ``(token, pos)`` tuple.
    state_list : abbreviations to recognise; defaults to ``('OH',)``.

    Returns
    -------
    The string ``'(GPE <token>/NNP)'`` for a recognised abbreviation, otherwise
    ``tagged_token`` unchanged.

    NOTE(review): a match returns a plain string rather than a tree node --
    confirm downstream code only prints these items.
    """
    if tagged_token[0] in state_list:
        # For the Stanford tagger the equivalent would be (tagged_token[0], u'LOCATION').
        return '(GPE ' + tagged_token[0] + '/NNP)'  # for regular
    return tagged_token

# Tokenize, POS-tag and chunk the sample with NLTK's built-in NE chunker, then
# post-process every top-level node with add_subdivision_tags.
text = nltk.word_tokenize(sample)
pos = nltk.pos_tag(text)
# Build a real list (not a lazy map object) so `nes` can be iterated more than once.
nes = [add_subdivision_tags(node) for node in nltk.ne_chunk(pos)]

In [373]:
# TODO: an unfinished draft here walked `nes`, recursing into subtrees and
# lower-casing the POS tag of plain (token, pos) tuples; it never ran any code.

state_list = ['OH']

# Show every top-level node of the post-processed chunk list.
for node in nes:
    print(node)

(PERSON Order/NNP)
(u'Denying', 'NNP')
(u'Export', 'NNP')
(u'Privileges', 'NNP')
(u'On', 'NNP')
(u'January', 'NNP')
(u'3', 'CD')
(u',', ',')
(u'2011', 'CD')
(u',', ',')
(u'in', 'IN')
(u'the', 'DT')
(GPE U.S./NNP)
(ORGANIZATION District/NNP Court/NNP)
(u',', ',')
(GPE District/NNP)
(u'of', 'IN')
(GPE Maryland/NNP)
(u',', ',')
(PERSON Emenike/NNP Charles/NNP Nwankwoala/NNP)
(u'(', 'NNP')
(u'\u201cNwankwoala\u201d', 'NNP')
(u')', 'NNP')
(u'was', 'VBD')
(u'convicted', 'VBN')
(u'of', 'IN')
(u'violating', 'NN')
(u'Section', 'NN')
(u'38', 'CD')
(u'of', 'IN')
(u'the', 'DT')
(ORGANIZATION Arms/NNP Export/NNP Control/NNP Act/NNP)
(u'(', 'NNP')
(u'22', 'CD')
(u'U.S.C', 'NNP')
(u'.', '.')
(u'2778', 'CD')
(u'(', 'CD')
(u'2000', 'CD')
(u')', 'CD')
(u')', 'CD')
(u'(', 'CD')
(u'\u201cAECA\u201d', 'JJ')
(u')', 'NN')
(u'and', 'CC')
(u'the', 'DT')
(ORGANIZATION
  International/NNP
  Emergency/NNP
  Economic/NNP
  Powers/NNP
  Act/NNP)
(u'(', 'NNP')
(u'50', 'CD')
(u'U.S.C', 'NNP')
(u'.', '.')
(u'1701', 'C