In [2]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

In [3]:
with codecs.open('ner.xml',"r","utf-8") as data:
    soup = bs(data, "html5lib")

In [10]:
docs=[]
for elem in soup.find_all("document"):
    texts = []
    for c in elem.find("textwithnamedentities").children:
        if type(c) == Tag:
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word
            for w in c.text.split(" "):
                if len(w) > 0:
                    texts.append((w, label))
    docs.append(texts)

In [12]:
import nltk
data = []
for i, doc in enumerate(docs):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])

In [31]:
data

[[('The', 'DT', 'I'),
  ('U.S.', 'NNP', 'I'),
  ('Patent', 'NNP', 'I'),
  ('Office', 'NNP', 'I'),
  ('allows', 'VBZ', 'I'),
  ('genes', 'NNS', 'I'),
  ('to', 'TO', 'I'),
  ('be', 'VB', 'I'),
  ('patented', 'VBN', 'I'),
  ('as', 'RB', 'I'),
  ('soon', 'RB', 'I'),
  ('as', 'IN', 'I'),
  ('someone', 'NN', 'I'),
  ('isolates', 'VBZ', 'I'),
  ('the', 'DT', 'I'),
  ('DNA', 'NN', 'I'),
  ('by', 'IN', 'I'),
  ('removing', 'VBG', 'I'),
  ('it', 'PRP', 'I'),
  ('from', 'IN', 'I'),
  ('the', 'DT', 'I'),
  ('cell', 'NN', 'I'),
  (',', ',', 'I'),
  ('says', 'VBZ', 'I'),
  ('ACLU', 'NNP', 'N'),
  ('attorney', 'NN', 'I'),
  ('Sandra', 'NNP', 'N'),
  ('Park', 'NNP', 'N'),
  ('.', '.', 'I')],
 [('``', '``', 'I'),
  ('Supporters', 'NNS', 'I'),
  ('of', 'IN', 'I'),
  ('these', 'DT', 'I'),
  ('programs', 'NNS', 'I'),
  ('make', 'VBP', 'I'),
  ('completely', 'RB', 'I'),
  ('unfounded', 'JJ', 'I'),
  ('generalizations', 'NNS', 'I'),
  ('about', 'IN', 'I'),
  ('boys', 'NNS', 'I'),
  ('and', 'CC', 'I'),
  ('g

In [14]:
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [29]:
from sklearn.model_selection import train_test_split

# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

*******>  [('The', 'DT', 'I'), ('U.S.', 'NNP', 'I'), ('Patent', 'NNP', 'I'), ('Office', 'NNP', 'I'), ('allows', 'VBZ', 'I'), ('genes', 'NNS', 'I'), ('to', 'TO', 'I'), ('be', 'VB', 'I'), ('patented', 'VBN', 'I'), ('as', 'RB', 'I'), ('soon', 'RB', 'I'), ('as', 'IN', 'I'), ('someone', 'NN', 'I'), ('isolates', 'VBZ', 'I'), ('the', 'DT', 'I'), ('DNA', 'NN', 'I'), ('by', 'IN', 'I'), ('removing', 'VBG', 'I'), ('it', 'PRP', 'I'), ('from', 'IN', 'I'), ('the', 'DT', 'I'), ('cell', 'NN', 'I'), (',', ',', 'I'), ('says', 'VBZ', 'I'), ('ACLU', 'NNP', 'N'), ('attorney', 'NN', 'I'), ('Sandra', 'NNP', 'N'), ('Park', 'NNP', 'N'), ('.', '.', 'I')]
*******>  [('``', '``', 'I'), ('Supporters', 'NNS', 'I'), ('of', 'IN', 'I'), ('these', 'DT', 'I'), ('programs', 'NNS', 'I'), ('make', 'VBP', 'I'), ('completely', 'RB', 'I'), ('unfounded', 'JJ', 'I'), ('generalizations', 'NNS', 'I'), ('about', 'IN', 'I'), ('boys', 'NNS', 'I'), ('and', 'CC', 'I'), ('girls', 'NNS', 'I'), (',', ',', 'I'), ('but', 'CC', 'I'), ('offe

*******>  [('Officials', 'NNS', 'I'), ('lifted', 'VBD', 'I'), ('evacuation', 'NN', 'I'), ('orders', 'NNS', 'I'), ('for', 'IN', 'I'), ('the', 'DT', 'I'), ('residents', 'NNS', 'I'), ('of', 'IN', 'I'), ('nearly', 'RB', 'I'), ('500', 'CD', 'I'), ('homes', 'NNS', 'I'), ('late', 'JJ', 'I'), ('Monday', 'NNP', 'I'), (',', ',', 'I'), ('said', 'VBD', 'I'), ('Daniel', 'NNP', 'N'), ('Berlant', 'NNP', 'N'), (',', ',', 'I'), ('a', 'DT', 'I'), ('spokesman', 'NN', 'I'), ('for', 'IN', 'I'), ('the', 'DT', 'I'), ('California', 'NNP', 'N'), ('Department', 'NNP', 'N'), ('of', 'IN', 'I'), ('Forestry', 'NNP', 'I'), ('and', 'CC', 'I'), ('Fire', 'NNP', 'I'), ('Protection', 'NNP', 'I'), ('.', '.', 'I')]
*******>  [('Industry', 'NN', 'I'), ('consultant', 'NN', 'I'), ('Daniel', 'NNP', 'N'), ('Yergin', 'NNP', 'N'), (',', ',', 'I'), ('chairman', 'NN', 'I'), ('of', 'IN', 'I'), ('IHS', 'NNP', 'N'), ('Cambridge', 'NNP', 'N'), ('Energy', 'NNP', 'N'), ('Research', 'NNP', 'N'), ('Associates', 'NNPS', 'N'), (',', ',', 'I'

In [32]:
X_train

[[['bias',
   'word.lower=the',
   'word[-3:]=The',
   'word[-2:]=he',
   'word.isupper=False',
   'word.istitle=True',
   'word.isdigit=False',
   'postag=DT',
   'BOS',
   '+1:word.lower=champions',
   '+1:word.istitle=True',
   '+1:word.isupper=False',
   '+1:word.isdigit=False',
   '+1:postag=NNPS'],
  ['bias',
   'word.lower=champions',
   'word[-3:]=ons',
   'word[-2:]=ns',
   'word.isupper=False',
   'word.istitle=True',
   'word.isdigit=False',
   'postag=NNPS',
   '-1:word.lower=the',
   '-1:word.istitle=True',
   '-1:word.isupper=False',
   '-1:word.isdigit=False',
   '-1:postag=DT',
   '+1:word.lower=,',
   '+1:word.istitle=False',
   '+1:word.isupper=False',
   '+1:word.isdigit=False',
   '+1:postag=,'],
  ['bias',
   'word.lower=,',
   'word[-3:]=,',
   'word[-2:]=,',
   'word.isupper=False',
   'word.istitle=False',
   'word.isdigit=False',
   'postag=,',
   '-1:word.lower=champions',
   '-1:word.istitle=True',
   '-1:word.isupper=False',
   '-1:word.isdigit=False',
   '-

In [21]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 14878
Seconds required: 0.088

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 6158.828924
Feature norm: 1.000000
Error norm: 6500.722347
Active features: 14046
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.011

***** Iteration #2 *****
Loss: 5062.161381
Feature norm: 0.821307
Error norm: 5754.661244
Active features: 14344
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #3 *****
Loss: 4260.033143
Feature norm: 0.542838
Error norm: 9243.558374
Active features: 14141
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #67 *****
Loss: 401.701140
Feature norm: 54.113225
Error norm: 43.489601
Active features: 3789
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #68 *****
Loss: 401.041816
Feature norm: 54.246927
Error norm: 39.129340
Active features: 3768
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #69 *****
Loss: 400.392226
Feature norm: 54.388781
Error norm: 30.074622
Active features: 3740
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #70 *****
Loss: 399.556657
Feature norm: 54.478710
Error norm: 28.950553
Active features: 3696
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #71 *****
Loss: 399.076891
Feature norm: 54.690302
Error norm: 65.209224
Active features: 3639
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #112 *****
Loss: 389.578256
Feature norm: 55.778274
Error norm: 28.793569
Active features: 3118
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #113 *****
Loss: 389.484251
Feature norm: 55.784087
Error norm: 24.733978
Active features: 3120
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #114 *****
Loss: 389.422002
Feature norm: 55.786992
Error norm: 25.368512
Active features: 3121
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #115 *****
Loss: 389.363020
Feature norm: 55.793493
Error norm: 29.513850
Active features: 3121
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #116 *****
Loss: 389.288370
Feature norm: 55.800884
Error norm: 22.018893
Active features: 3117
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #185 *****
Loss: 386.276084
Feature norm: 56.935901
Error norm: 6.121189
Active features: 2890
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #186 *****
Loss: 386.239762
Feature norm: 56.933755
Error norm: 13.518739
Active features: 2887
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #187 *****
Loss: 386.202554
Feature norm: 56.930456
Error norm: 5.007929
Active features: 2883
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.006

***** Iteration #188 *****
Loss: 386.176139
Feature norm: 56.923264
Error norm: 6.091513
Active features: 2864
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.005

***** Iteration #189 *****
Loss: 386.143368
Feature norm: 56.915697
Error norm: 2.397846
Active features: 2852
Line search trials: 1
Line search step: 1.000000
Seconds required for th

In [25]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# y_pred
# Let's take a look at a random sample in the testing set
i = 1
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))

finally (I)
getting (I)
the (I)
opportunity (I)
to (I)
play (I)
in (I)
a (I)
regular (I)
competition (I)
against (I)
top-tier (I)
nations (I)
would (I)
be (I)
`` (I)
tough (I)
, (I)
'' (I)
but (I)
equally (I)
as (I)
rewarding (I)
, (I)
according (I)
to (I)
former (I)
argentina (N)
captain (I)
agustin (N)
pichot (N)
. (I)


In [26]:
X[3]

[['bias',
  'word.lower=kickoff',
  'word[-3:]=off',
  'word[-2:]=ff',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'postag=NNP',
  'BOS',
  '+1:word.lower=is',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:word.isdigit=False',
  '+1:postag=VBZ'],
 ['bias',
  'word.lower=is',
  'word[-3:]=is',
  'word[-2:]=is',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=VBZ',
  '-1:word.lower=kickoff',
  '-1:word.istitle=True',
  '-1:word.isupper=False',
  '-1:word.isdigit=False',
  '-1:postag=NNP',
  '+1:word.lower=set',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:word.isdigit=False',
  '+1:postag=VBN'],
 ['bias',
  'word.lower=set',
  'word[-3:]=set',
  'word[-2:]=et',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=VBN',
  '-1:word.lower=is',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '-1:word.isdigit=False',
  '-1:postag=VBZ',
  '+1:word.lower=for',
  '+1:w

In [27]:
data[3]

[('Kickoff', 'NNP', 'I'),
 ('is', 'VBZ', 'I'),
 ('set', 'VBN', 'I'),
 ('for', 'IN', 'I'),
 ('7:30', 'CD', 'I'),
 ('p.m.', 'NN', 'I'),
 ('``', '``', 'I'),
 ('It', 'PRP', 'I'),
 ("'s", 'VBZ', 'I'),
 ('time', 'NN', 'I'),
 ('to', 'TO', 'I'),
 ('face', 'VB', 'I'),
 ('someone', 'NN', 'I'),
 ('else', 'RB', 'I'),
 (',', ',', 'I'),
 ("''", "''", 'I'),
 ('AHS', 'NNP', 'N'),
 ('coach', 'VBP', 'I'),
 ('Shane', 'NNP', 'N'),
 ('Davis', 'NNP', 'N'),
 ('said', 'VBD', 'I'),
 ('.', '.', 'I')]