In [1351]:
import pandas as pd
import numpy as np
import spacy

In [1352]:
data_path = '../data/'
df = pd.read_csv(data_path + 'ner_dataset.zip', encoding="latin1")
df.shape

(1048575, 4)

In [1353]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


## Preprocess text

In [1354]:
# Keep all numbers, all letters and hyphens
df['tidy_word'] = df['Word'].apply(lambda x: re.sub("[^a-zA-Z0-9\-]", "", x))

In [1355]:
# Remove observations that are now empty
df = df[df['tidy_word'] != '']

# Hyphenated words are kept as a single token
# in this dataset. Remove observations that
# are a between-word dash. This causes problems
# with the algorithm
df = df[df['tidy_word'] != '-']

In [1356]:
# Later steps require a consecutive index
df = df.reset_index()

### Create Sentence Form of Text

We create a new Series where each row is a list with the contents of a sentence. We do this so that it can be more easily sent to Spacy NER.

In [1357]:
def toSentenceForm(df, var):
    """
    From a Series:
    
        Sentence  #      var
        -----------  -------
        Sentence: 1     Mary
                NaN      had
                NaN        a
                NaN   little
                NaN     lamb
                NaN        .
        Sentence: 2       He
                NaN followed
                NaN      her
                
    to a list of lists of strings by sentence:
    
        [['Mary', 'had', 'a', 'little', 'lamb' '.'],
         ['He', 'followed', 'her', ...]]
                                
    """
    sent_out = []

    # Find the index of each sentence start
    idx = df['Sentence #'].isna()
    sent_breaks = df[idx == False].index
    
    
    for i in range(len(sent_breaks)-1):
        
        # Create a list containing all strings
        # from the current sentence
        out = []
        for j in range(sent_breaks[i], sent_breaks[i+1]):
            out.append(df[var].iloc[j])
        
        # Append ith sentence list to the
        # list of all sentences
        sent_out.append(out)

    return sent_out

In [1358]:
sentences = toSentenceForm(df, 'tidy_word')

In [1359]:
tags = toSentenceForm(df, 'Tag')

In [1360]:
se = pd.DataFrame({'sentence_list': sentences, 'tags': tags})

In [1361]:
se.head()

Unnamed: 0,sentence_list,tags
0,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,"[Families, of, soldiers, killed, in, the, conf...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[They, marched, from, the, Houses, of, Parliam...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo]"
3,"[Police, put, the, number, of, marchers, at, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[The, protest, comes, on, the, eve, of, the, a...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."


In [1362]:
se.to_csv('../data/ner_sentences.csv')

### Perform NER

Spacy will automatically perform NER when we create a Doc object from a sentence.

Iterate over all words and tags in the Doc object and create lists of tokens and tags

In [1363]:
nlp = spacy.load("en_core_web_sm")

In [1389]:
words = []
spacy_tags = []
doc = nlp(fs)
ctr = 0

# number of iterations between progress report
report_freq = 2500

for row, sentence in enumerate(se['sentence_list']):
    doc = nlp(' '.join(sentence))

    # Print progress report
    if row == int(row / report_freq) * report_freq:
        print(f'{row:10.9g}:\t{se.shape[0]} ({100*row/se.shape[0]:.2f}%)')
    
    # Iterate over tokens in doc
    # and create lists of tokens and tags
    for i, t in enumerate(doc):
        words.append(t)
        if t.ent_iob_ == 'O':
            spacy_tags.append('O')
        else:
            spacy_tags.append(t.ent_iob_+'-'+t.ent_type_)


         0:	47839 (0.00%)
      2500:	47839 (5.23%)
      5000:	47839 (10.45%)
      7500:	47839 (15.68%)
     10000:	47839 (20.90%)
     12500:	47839 (26.13%)
     15000:	47839 (31.36%)
     17500:	47839 (36.58%)
     20000:	47839 (41.81%)
     22500:	47839 (47.03%)
     25000:	47839 (52.26%)
     27500:	47839 (57.48%)
     30000:	47839 (62.71%)
     32500:	47839 (67.94%)
     35000:	47839 (73.16%)
     37500:	47839 (78.39%)
     40000:	47839 (83.61%)
     42500:	47839 (88.84%)
     45000:	47839 (94.07%)
     47500:	47839 (99.29%)


In [1390]:
df_spacy = pd.DataFrame({'Words': words, 'Tags': spacy_tags})

In [1391]:
df_spacy.shape

(981669, 2)

In [1392]:
df_spacy.to_csv('../data/ner_spacy_raw.csv')

In [1393]:
df_spacy.head(7)

Unnamed: 0,Words,Tags
0,Thousands,B-CARDINAL
1,of,O
2,demonstrators,O
3,have,O
4,marched,O
5,through,O
6,London,B-GPE


In [1394]:
def containsHyphen(s):
    return len(s) > 1 and (s.find('-') > -1)

def findHyphenatedWord(s, s_list, idx):
    return s + s_list[idx+1] + s_list[idx+2]

def findAdditionalHyphens(s, s_list, idx, inc):
        finished = 1
        idx += inc
        if idx < len(s_list) and s_list[idx] == '-':
            s = findHyphenatedWord(s, s_list, idx-1)
            finished = 0
            return (idx, s, finished)
        else:
            return (idx, s, finished)

def findHyphenatedWords(s_list, idx):
    s = s_list[idx]
    s = findHyphenatedWord(s, s_list, idx)
    
    finished = 0  # have we found all connected hyphens?
    increment = 3 # the first additional hyphen is further than
                  # subsequent ones, because the original 'idx'
                  # is pointing at a word, but the after calling
                  # findAdditionalHyphens() 'idx' will point at a
                  # hyphen after the word
                  
    
    while finished == 0:
        idx, s, finished = findAdditionalHyphens(s, s_list, idx, increment)
        increment = 2
                

    return (idx, s)

In [1395]:
a = [ 'apple', 'apple-jacks', 'chocolate', 'coco-puffs-again', 
     'old-mcdonald-had-a-farm']
b = [ 'apple', 'apple', '-', 'jacks', 'chocolate', 'coco', 
     '-', 'puffs', '-', 'again', 'old', '-', 'mcdonald', '-', 'had', 
     '-', 'a', '-', 'farm']

def test_findHyphenatedWords(a, b):
    c = []
    i = 0
    for ai in (a):
        if containsHyphen(ai):
            i, ss = findHyphenatedWords(b, i)
            c.append(ss)
        else:
            c.append(b[i])
            i += 1

    c_cmp = ['apple',
             'apple-jacks',
             'chocolate',
             'coco-puffs-again',
             'old-mcdonald-had-a-farm']

    return c == c_cmp

In [1396]:
assert test_findHyphenatedWords(a, b)

### Align tokens and tags

Spacy breaks up some words differently than our dataset. Below, we create a list of the tokens from our original dataset and their Spacy tags.

In [1401]:
c = []
s_tags = []
i = 0
j = 0
supress_til = 1780
for word in df['tidy_word']:
    if containsHyphen(word):
        # grabbing a bunch of words in case we have to search
        # and grab more tokens from hyphenated words
        sp_word = [w.text for w in df_spacy['Words'].iloc[j:j+9]]
        if sp_word[0] == word:
            sp_tag_tmp = df_spacy['Tags'].iloc[j]
            s_tags.append(sp_tag_tmp)
            c.append(word)
            
            j += 1
        else:
            k = 0
            k, ss = findHyphenatedWords(sp_word, k)
            
            sp_tag_tmp = df_spacy['Tags'].iloc[j]
            s_tags.append(sp_tag_tmp)
            
            j += k
            
            c.append(ss)
    else:
        next_sp_word_ = df_spacy["Words"].iloc[j]
        if len(word) > 1 and word.endswith('.'):
            next_sp_word = df_spacy["Words"].iloc[j+1]
            if next_sp_word.text == '.':
                j += 1
                
        sp_tag_tmp = df_spacy['Tags'].iloc[j]
        s_tags.append(sp_tag_tmp)
        c.append(next_sp_word_.text)
        j += 1
        
    i += 1
    if len(c) == 500000:
        break
        
    if c[-1] != word:
        # try and take care of times where Spacy splits 'wont', 'wed'
        tmpa = df_spacy["Words"].iloc[j]
        tmp = c[-1] + tmpa.text
        if tmp == word:
            c[-1] = tmp
            
        j += 1
            
        print('done')
        print(f'i: {i} j: {j} c: {c[-1:]} word: "{word}", sp_word[0]: "{list(df_spacy["Words"].iloc[j-2:j])}"')
#         break
    last_word = word

done
i: 168074 j: 171909 c: ['wont'] word: "wont", sp_word[0]: "[wo, nt]"
done
i: 189964 j: 194298 c: ['wed'] word: "wed", sp_word[0]: "[we, d]"


In [1402]:
all_spacy_tags = np.unique(s_tags)
print(all_spacy_tags)

['B-CARDINAL' 'B-DATE' 'B-EVENT' 'B-FAC' 'B-GPE' 'B-LANGUAGE' 'B-LAW'
 'B-LOC' 'B-MONEY' 'B-NORP' 'B-ORDINAL' 'B-ORG' 'B-PERCENT' 'B-PERSON'
 'B-PRODUCT' 'B-QUANTITY' 'B-TIME' 'B-WORK_OF_ART' 'I-CARDINAL' 'I-DATE'
 'I-EVENT' 'I-FAC' 'I-GPE' 'I-LAW' 'I-LOC' 'I-MONEY' 'I-NORP' 'I-ORG'
 'I-PERCENT' 'I-PERSON' 'I-PRODUCT' 'I-QUANTITY' 'I-TIME' 'I-WORK_OF_ART'
 'O']


In [1403]:
spacy_word_tag = pd.DataFrame({'token': c, 'tag': s_tags})
spacy_word_tag.head()

Unnamed: 0,token,tag
0,Thousands,B-CARDINAL
1,of,O
2,demonstrators,O
3,have,O
4,marched,O


In [1404]:
spacy_word_tag.to_csv('../data/ner_spacy_aligned.csv')