In [63]:
import pandas as pd
import numpy as np
import spacy
import re

In [64]:
data_path = '../data/'
df = pd.read_csv(data_path + 'ner_dataset.zip', encoding="latin1")
df.shape

(1048575, 4)

In [65]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


## Preprocess text

In [66]:
# Keep all numbers, all letters and hyphens
df['tidy_word'] = df['Word'].apply(lambda x: re.sub("[^a-zA-Z0-9\-]", "", x))

In [67]:
# Remove observations that are now empty
df = df[df['tidy_word'] != '']

# Hyphenated words are kept as a single token
# in this dataset. Remove observations that
# are a between-word dash. This causes problems
# with the algorithm
df = df[df['tidy_word'] != '-']

In [68]:
# Later steps require a consecutive index
df = df.reset_index()

### Create Sentence Form of Text

We create a new Series where each row is a list with the contents of a sentence. We do this so that it can be more easily sent to Spacy NER.

In [69]:
def toSentenceForm(df, var):
    """
    From a Series:
    
        Sentence  #      var
        -----------  -------
        Sentence: 1     Mary
                NaN      had
                NaN        a
                NaN   little
                NaN     lamb
                NaN        .
        Sentence: 2       He
                NaN followed
                NaN      her
                
    to a list of lists of strings by sentence:
    
        [['Mary', 'had', 'a', 'little', 'lamb' '.'],
         ['He', 'followed', 'her', ...]]
                                
    """
    sent_out = []

    # Find the index of each sentence start
    idx = df['Sentence #'].isna()
    sent_breaks = df[idx == False].index
    
    
    for i in range(len(sent_breaks)-1):
        
        # Create a list containing all strings
        # from the current sentence
        out = []
        for j in range(sent_breaks[i], sent_breaks[i+1]):
            out.append(df[var].iloc[j])
        
        # Append ith sentence list to the
        # list of all sentences
        sent_out.append(out)

    return sent_out

In [70]:
sentences = toSentenceForm(df, 'tidy_word')

In [71]:
tags = toSentenceForm(df, 'Tag')

In [72]:
se = pd.DataFrame({'sentence_list': sentences, 'tags': tags})

In [73]:
se.head()

Unnamed: 0,sentence_list,tags
0,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,"[Families, of, soldiers, killed, in, the, conf...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[They, marched, from, the, Houses, of, Parliam...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo]"
3,"[Police, put, the, number, of, marchers, at, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[The, protest, comes, on, the, eve, of, the, a...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."


In [74]:
se.to_csv('../data/ner_sentences.csv')

### Perform NER

Spacy will automatically perform NER when we create a Doc object from a sentence.

Iterate over all words and tags in the Doc object and create lists of tokens and tags

In [75]:
nlp = spacy.load("en_core_web_sm")

In [14]:
words = []
spacy_tags = []
ctr = 0

# number of iterations between progress report
report_freq = 2500

for row, sentence in enumerate(se['sentence_list']):
    doc = nlp(' '.join(sentence))

    # Print progress report
    if row == int(row / report_freq) * report_freq:
        print(f'{row:10.9g}:\t{se.shape[0]} ({100*row/se.shape[0]:.2f}%)')
    
    # Iterate over tokens in doc
    # and create lists of tokens and tags
    for i, t in enumerate(doc):
        words.append(t)
        if t.ent_iob_ == 'O':
            spacy_tags.append('O')
        else:
            spacy_tags.append(t.ent_iob_+'-'+t.ent_type_)


         0:	47839 (0.00%)
      2500:	47839 (5.23%)
      5000:	47839 (10.45%)
      7500:	47839 (15.68%)
     10000:	47839 (20.90%)
     12500:	47839 (26.13%)
     15000:	47839 (31.36%)
     17500:	47839 (36.58%)
     20000:	47839 (41.81%)
     22500:	47839 (47.03%)
     25000:	47839 (52.26%)
     27500:	47839 (57.48%)
     30000:	47839 (62.71%)
     32500:	47839 (67.94%)
     35000:	47839 (73.16%)
     37500:	47839 (78.39%)
     40000:	47839 (83.61%)
     42500:	47839 (88.84%)
     45000:	47839 (94.07%)
     47500:	47839 (99.29%)


In [15]:
df_spacy = pd.DataFrame({'Words': words, 'Tags': spacy_tags})

In [16]:
df_spacy.shape

(981669, 2)

In [17]:
df_spacy.to_csv('../data/ner_spacy_raw.csv')

In [18]:
df_spacy.head(7)

Unnamed: 0,Words,Tags
0,Thousands,B-CARDINAL
1,of,O
2,demonstrators,O
3,have,O
4,marched,O
5,through,O
6,London,B-GPE


In [19]:
def containsHyphen(s):
    return len(s) > 1 and (s.find('-') > -1)

def findHyphenatedWord(s, s_list, idx):
    return s + s_list[idx+1] + s_list[idx+2]

def findAdditionalHyphens(s, s_list, idx, inc):
        finished = 1
        idx += inc
        if idx < len(s_list) and s_list[idx] == '-':
            s = findHyphenatedWord(s, s_list, idx-1)
            finished = 0
            return (idx, s, finished)
        else:
            return (idx, s, finished)

def findHyphenatedWords(s_list, idx):
    s = s_list[idx]
    s = findHyphenatedWord(s, s_list, idx)
    
    finished = 0  # have we found all connected hyphens?
    increment = 3 # the first additional hyphen is further than
                  # subsequent ones, because the original 'idx'
                  # is pointing at a word, but the after calling
                  # findAdditionalHyphens() 'idx' will point at a
                  # hyphen after the word
                  
    
    while finished == 0:
        idx, s, finished = findAdditionalHyphens(s, s_list, idx, increment)
        increment = 2
                

    return (idx, s)

In [20]:
a = [ 'apple', 'apple-jacks', 'chocolate', 'coco-puffs-again', 
     'old-mcdonald-had-a-farm']
b = [ 'apple', 'apple', '-', 'jacks', 'chocolate', 'coco', 
     '-', 'puffs', '-', 'again', 'old', '-', 'mcdonald', '-', 'had', 
     '-', 'a', '-', 'farm']

def test_findHyphenatedWords(a, b):
    c = []
    i = 0
    for ai in (a):
        if containsHyphen(ai):
            i, ss = findHyphenatedWords(b, i)
            c.append(ss)
        else:
            c.append(b[i])
            i += 1

    c_cmp = ['apple',
             'apple-jacks',
             'chocolate',
             'coco-puffs-again',
             'old-mcdonald-had-a-farm']

    return c == c_cmp

In [21]:
assert test_findHyphenatedWords(a, b)

### Align tokens and tags

Spacy breaks up some words differently than our dataset. Below, we create a list of the tokens from our original dataset and their Spacy tags.

In [22]:
c = []
s_tags = []
i = 0
j = 0
supress_til = 1780
for word in df['tidy_word']:
    if containsHyphen(word):
        # grabbing a bunch of words in case we have to search
        # and grab more tokens from hyphenated words
        sp_word = [w.text for w in df_spacy['Words'].iloc[j:j+9]]
        if sp_word[0] == word:
            sp_tag_tmp = df_spacy['Tags'].iloc[j]
            s_tags.append(sp_tag_tmp)
            c.append(word)
            
            j += 1
        else:
            k = 0
            k, ss = findHyphenatedWords(sp_word, k)
            
            sp_tag_tmp = df_spacy['Tags'].iloc[j]
            s_tags.append(sp_tag_tmp)
            
            j += k
            
            c.append(ss)
    else:
        next_sp_word_ = df_spacy["Words"].iloc[j]
        if len(word) > 1 and word.endswith('.'):
            next_sp_word = df_spacy["Words"].iloc[j+1]
            if next_sp_word.text == '.':
                j += 1
                
        sp_tag_tmp = df_spacy['Tags'].iloc[j]
        s_tags.append(sp_tag_tmp)
        c.append(next_sp_word_.text)
        j += 1
        
    i += 1
    if len(c) == 500000:
        break
        
    if c[-1] != word:
        # try and take care of times where Spacy splits 'wont', 'wed'
        tmpa = df_spacy["Words"].iloc[j]
        tmp = c[-1] + tmpa.text
        if tmp == word:
            c[-1] = tmp
            
        j += 1
            
        print('done')
        print(f'i: {i} j: {j} c: {c[-1:]} word: "{word}", sp_word[0]: "{list(df_spacy["Words"].iloc[j-2:j])}"')
#         break
    last_word = word

done
i: 168074 j: 171909 c: ['wont'] word: "wont", sp_word[0]: "[wo, nt]"
done
i: 189964 j: 194298 c: ['wed'] word: "wed", sp_word[0]: "[we, d]"


In [23]:
all_spacy_tags = np.unique(s_tags)
print(all_spacy_tags)

['B-CARDINAL' 'B-DATE' 'B-EVENT' 'B-FAC' 'B-GPE' 'B-LANGUAGE' 'B-LAW'
 'B-LOC' 'B-MONEY' 'B-NORP' 'B-ORDINAL' 'B-ORG' 'B-PERCENT' 'B-PERSON'
 'B-PRODUCT' 'B-QUANTITY' 'B-TIME' 'B-WORK_OF_ART' 'I-CARDINAL' 'I-DATE'
 'I-EVENT' 'I-FAC' 'I-GPE' 'I-LAW' 'I-LOC' 'I-MONEY' 'I-NORP' 'I-ORG'
 'I-PERCENT' 'I-PERSON' 'I-PRODUCT' 'I-QUANTITY' 'I-TIME' 'I-WORK_OF_ART'
 'O']


In [24]:
spacy_word_tag = pd.DataFrame({'token': c, 'tag': s_tags})
spacy_word_tag.head()

Unnamed: 0,token,tag
0,Thousands,B-CARDINAL
1,of,O
2,demonstrators,O
3,have,O
4,marched,O


In [25]:
spacy_word_tag.to_csv('../data/ner_spacy_aligned.csv')

## Examine performance

In [26]:
swt = spacy_word_tag

In [27]:
swt.head()

Unnamed: 0,token,tag
0,Thousands,B-CARDINAL
1,of,O
2,demonstrators,O
3,have,O
4,marched,O


### Make sure all tokens match

Here we verify that the tokens from our training data match the tokens we created for the Spacy predictions.

In [28]:
def checkWordVectors(word_true, word_cmp):
    for i, tok in enumerate(word_cmp):
        if tok != word_true.iloc[i]:
            print(tok, word_true.iloc[i])
            return 0
            
    return 1
        

In [29]:
assert checkWordVectors(df['tidy_word'], swt['token'])

### Remove 'I' and 'B' tags

We want to get a preliminary look at the performance and not (at least not yet) worry about which token is used as the starting token.

In [96]:
df['Tag2'] = df['Tag'].apply(lambda tok: tok.replace('I-', ''))
df['Tag2'] = df['Tag2'].apply(lambda tok: tok.replace('B-', ''))

### Convert Spacy tags

Here we try and translate the Spacy tags to the same representation as our training data.

PERSON:      People, including fictional.<br/>
NORP:        Nationalities or religious or political groups.<br/>
FAC:         Buildings, airports, highways, bridges, etc.<br/>
ORG:         Companies, agencies, institutions, etc.<br/>
GPE:         Countries, cities, states.<br/>
LOC:         Non-GPE locations, mountain ranges, bodies of water.<br/>
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)<br/>
EVENT:       Named hurricanes, battles, wars, sports events, etc.<br/>
WORK_OF_ART: Titles of books, songs, etc.<br/>
LAW:         Named documents made into laws.<br/>
LANGUAGE:    Any named language.<br/>
DATE:        Absolute or relative dates or periods.<br/>
TIME:        Times smaller than a day.<br/>
PERCENT:     Percentage, including ”%“.<br/>
MONEY:       Monetary values, including unit.<br/>
QUANTITY:    Measurements, as of weight or distance.<br/>
ORDINAL:     “first”, “second”, etc.<br/>
CARDINAL:    Numerals that do not fall under another type.

In [86]:
tags = df['Tag'].value_counts()
base_tags = [t.replace('I-', '') for t in tags.index if not t.startswith('B')]
base_tags

['O', 'per', 'org', 'geo', 'tim', 'art', 'eve', 'gpe', 'nat']

In [97]:
def normalizeTags(swt, spacy2ref):
    swt['tag2'] = swt['tag'].apply(lambda tok: tok.replace('I-', ''))
    swt['tag2'] = swt['tag2'].apply(lambda tok: tok.replace('B-', ''))
    for k in spacy2ref:
        swt['tag2'] = swt['tag2'].apply(lambda tok: tok.replace(k, spacy2ref[k]))
        
    return swt
    


In [99]:
spacy2ref = {'GPE': 'gpe', 'ORG': 'org', 'PERSON': 'per', 'DATE': 'tim',
             'TIME': 'tim', 'EVENT': 'eve', 'LOC': 'geo', 'ORDINAL': 'O',
             'CARDINAL': 'O', 'MONEY': 'O', 'PERCENT': 'O', 'GEO': 'geo', 
             'QUANTITY': 'O', 'FAC': 'geo', 'LAW': 'O', 'PRODUCT': 'O', 
             'WORK_OF_ART': 'art', 'LANGUAGE': 'O', 'NORP': 'gpe'}
swt = normalizeTags(swt, spacy2ref)

In [100]:
# Find out how many observations we have for each Spacy tag
swt['tag2'].value_counts()

O      410297
gpe     32429
tim     21015
org     17595
per     14008
geo      3199
eve      1209
art       248
Name: tag2, dtype: int64

### Check accuracy

In [101]:
def findMatches(df, swt):
    """
    Create binary vector of size swt.shape[0] with a one
    for every element where the tags match and a 0 otherwise.
    
    Also return a count of the number of misses and number of matches.
    """
    match_idx = np.zeros(swt.shape[0])
    match = 0
    miss = 0
    for i, tok in enumerate(swt):
        if tok != df.iloc[i]:
            miss += 1
        else:
            match += 1
            match_idx[i] = 1
    
    print(f'correct: {100 * match/swt.shape[0]:.1f}, incorrect: {100 * miss/swt.shape[0]:.1f}')
    return (match, miss, match_idx)

In [102]:
match, miss, match_idx = findMatches(df['Tag2'], swt['tag2'])

correct: 88.5, incorrect: 11.5


Here we see about 89% accuracy. However, this data is quite imbalanced, so we need to check and see if the success is just limited to prediction of the dominant class ('O').

In [103]:
# Split data into observations where the tags
# match and observations that do not match
def missMatchSplit(swt, match_idx):
    cmp = swt.copy()
    cmp['dftag'] = df['Tag2']
    cmp_match = cmp[match_idx == 1]
    cmp_miss = cmp[match_idx == 0]
    
    return cmp_miss, cmp_match

def missMatchTable(cmp_match, cmp_miss):
    return pd.DataFrame({'miss': cmp_miss['dftag'].value_counts(), 'match': cmp_match['dftag'].value_counts(), 'pct correct': cmp_match['dftag'].value_counts() / (cmp_match['dftag'].value_counts() + cmp_miss['dftag'].value_counts())})

In [104]:
cmp_miss, cmp_match = missMatchSplit(swt[['token', 'tag2']], match_idx)
missMatchTable(cmp_match, cmp_miss)

Unnamed: 0,miss,match,pct correct
O,18221,398529.0,0.956278
art,342,24.0,0.065574
eve,156,192.0,0.551724
geo,21473,1995.0,0.085009
gpe,684,7803.0,0.919406
nat,142,,
org,7605,11144.0,0.594378
per,7031,11081.0,0.611804
tim,2089,11489.0,0.846148


The above table shows that we have very poor performance with the `geo` tags. This could be due to differing definitions between the Spacy model and the training data. `gpe` seems to be the most likely category that could also be coded as `geo`.

In [107]:
def missTags(cmp_miss, tag_true):
    tag_counts = cmp_miss[cmp_miss['dftag'] == tag_true]['tag2'].value_counts()
    samples = cmp_miss[cmp_miss['dftag'] == tag_true][['token', 'tag2']]
    return tag_counts, samples

As we suspect, most of the "incorrect" 'geo' tags are marked as 'gpe'.

In [109]:
tag_counts, mistakes = missTags(cmp_miss, 'geo')
tag_counts

gpe    17631
O       1422
org     1397
per      884
eve       69
tim       47
art       23
Name: tag2, dtype: int64

The table below shows that the vast majority of the tokens "mislabeled" as 'gpe', appear quite reasonably labeled as 'gpe'. It is possible that one of the models is using context better than the other.

The top 20 by count account for more than 55% of the "mislabeled" 'geo' tags. 

In [117]:
mistakes.value_counts().iloc[0:20].sum()

8773

In [118]:
mistakes.value_counts().iloc[0:20]

token        tag2
US           gpe     1627
Iraq         gpe      884
United       gpe      656
States       gpe      655
Iran         gpe      552
Afghanistan  gpe      506
Israel       gpe      469
China        gpe      439
Baghdad      gpe      398
Pakistan     gpe      377
Russia       gpe      305
Gaza         gpe      254
Korea        gpe      250
Washington   gpe      238
India        gpe      234
South        gpe      198
North        gpe      195
New          gpe      188
UN           org      176
Venezuela    gpe      172
dtype: int64

In [119]:
from sklearn.metrics import classification_report

In [120]:
print(classification_report(df['Tag2'].iloc[:500000], swt['tag2']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O       0.97      0.96      0.96    416750
         art       0.10      0.07      0.08       366
         eve       0.16      0.55      0.25       348
         geo       0.62      0.09      0.15     23468
         gpe       0.24      0.92      0.38      8487
         nat       0.00      0.00      0.00       142
         org       0.63      0.59      0.61     18749
         per       0.79      0.61      0.69     18112
         tim       0.55      0.85      0.66     13578

    accuracy                           0.88    500000
   macro avg       0.45      0.51      0.42    500000
weighted avg       0.91      0.88      0.88    500000



  _warn_prf(average, modifier, msg_start, len(result))


### Change tag to tag translation

Let's try defining the Spacy assigned `gpe` tags as `geo` tags and see how the model performs.

In [121]:
spacy2ref = {'GPE': 'geo', 'ORG': 'org', 'PERSON': 'per', 'DATE': 'tim',
             'TIME': 'tim', 'EVENT': 'eve', 'LOC': 'geo', 'ORDINAL': 'O',
             'CARDINAL': 'O', 'MONEY': 'O', 'PERCENT': 'O', 'GEO': 'geo', 
             'QUANTITY': 'O', 'FAC': 'geo', 'LAW': 'O', 'PRODUCT': 'O', 
             'WORK_OF_ART': 'art', 'LANGUAGE': 'O', 'NORP': 'gpe'}
swt = normalizeTags(swt, spacy2ref)

In [122]:
print(classification_report(df['Tag2'].iloc[:500000], swt['tag2']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O       0.97      0.96      0.96    416750
         art       0.10      0.07      0.08       366
         eve       0.16      0.55      0.25       348
         geo       0.76      0.81      0.79     23468
         gpe       0.68      0.84      0.75      8487
         nat       0.00      0.00      0.00       142
         org       0.63      0.59      0.61     18749
         per       0.79      0.61      0.69     18112
         tim       0.55      0.85      0.66     13578

    accuracy                           0.92    500000
   macro avg       0.52      0.59      0.53    500000
weighted avg       0.92      0.92      0.92    500000



  _warn_prf(average, modifier, msg_start, len(result))


Switching the Spacy `gpe` tags to `geo` has greatly increased the f1-score for both `geo` and `gpe`:

Before `gpe` --> `geo`:

```
     precision    recall  f1-score
geo       0.62      0.09      0.15
gpe       0.24      0.92      0.38
```

after


```
 geo       0.76      0.81      0.79
 gpe       0.68      0.84      0.75
 ```