In [11]:
import nltk
# Fetch the 'averaged_perceptron_tagger' resource into the local nltk_data
# directory. Presumably this is the model behind the nltk.pos_tag call used
# later in the notebook -- TODO confirm for the installed NLTK version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /headless/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read the data file and parse it with BeautifulSoup.
# NOTE(review): all tag names below are matched in lowercase, presumably
# because the "html5lib" parser normalises them -- confirm against reuters.xml.
with codecs.open("reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

# docs: one list per <document>, each holding (token, label) pairs where the
# label is "N" (part of a named entity) or "I" (irrelevant word).
docs = []
for elem in soup.find_all("document"):
    container = elem.find("textwithnamedentities")
    if container is None:
        # No annotated text in this document: there is nothing to label, and
        # dereferencing .children on None would abort the whole parse.
        continue

    texts = []
    # Loop through each child of the element under "textwithnamedentities"
    for c in container.children:
        # Only element nodes are labelled; bare strings between them are ignored
        if not isinstance(c, Tag):
            continue
        label = "N" if c.name == "namedentityintext" else "I"
        # Split on single spaces and drop the empty strings produced by
        # leading, trailing or repeated spaces
        texts.extend((w, label) for w in c.text.split(" ") if w)
    docs.append(texts)

In [9]:
# Display the parsed corpus: one list of (token, label) pairs per document
docs


[[('Paxar', 'N'),
  ('Corp', 'N'),
  ('said', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('acquired', 'I'),
  ('Thermo-Print', 'N'),
  ('GmbH', 'N'),
  ('of', 'I'),
  ('Lohn', 'N'),
  (',', 'I'),
  ('West', 'N'),
  ('Germany', 'N'),
  (',', 'I'),
  ('a', 'I'),
  ('distributor', 'I'),
  ('of', 'I'),
  ('Paxar', 'N'),
  ('products,', 'I'),
  ('for', 'I'),
  ('undisclosed', 'I'),
  ('terms.', 'I')],
 [('Key', 'N'),
  ('Tronic', 'N'),
  ('corp', 'N'),
  ('said', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('received', 'I'),
  ('contracts', 'I'),
  ('to', 'I'),
  ('provide', 'I'),
  ('seven', 'I'),
  ('original', 'I'),
  ('equipment', 'I'),
  ('manufacturers', 'I'),
  ('with', 'I'),
  ('which', 'I'),
  ('it', 'I'),
  ('has', 'I'),
  ('not', 'I'),
  ('done', 'I'),
  ('business', 'I'),
  ('recently', 'I'),
  ('with', 'I'),
  ('over', 'I'),
  ('300,000', 'I'),
  ('computer', 'I'),
  ('keyboards', 'I'),
  ('for', 'I'),
  ('delivery', 'I'),
  ('within', 'I'),
  ('the', 'I'),
  ('next', 'I'),
  ('12', 'I'

In [12]:
import nltk
# data: for every document, a list of (word, POS tag, label) triples
data = []
for doc in docs:
    # Obtain the list of tokens in the document (labels are not needed for tagging)
    tokens = [token for token, _ in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label. This relies on pos_tag yielding
    # one (token, tag) pair per input token, in the same order, so that zip
    # lines each tag up with the label of the original token.
    data.append([(token, pos, label) for (token, label), (_, pos) in zip(doc, tagged)])

In [13]:
# Inspect the first document as (word, POS tag, label) triples
data[0]

[('Paxar', 'NNP', 'N'),
 ('Corp', 'NNP', 'N'),
 ('said', 'VBD', 'I'),
 ('it', 'PRP', 'I'),
 ('has', 'VBZ', 'I'),
 ('acquired', 'VBN', 'I'),
 ('Thermo-Print', 'NNP', 'N'),
 ('GmbH', 'NNP', 'N'),
 ('of', 'IN', 'I'),
 ('Lohn', 'NNP', 'N'),
 (',', ',', 'I'),
 ('West', 'NNP', 'N'),
 ('Germany', 'NNP', 'N'),
 (',', ',', 'I'),
 ('a', 'DT', 'I'),
 ('distributor', 'NN', 'I'),
 ('of', 'IN', 'I'),
 ('Paxar', 'NNP', 'N'),
 ('products,', 'NN', 'I'),
 ('for', 'IN', 'I'),
 ('undisclosed', 'JJ', 'I'),
 ('terms.', 'NN', 'I')]

In [14]:
def word2features(doc, i):
    """Build the list of string features describing token ``i`` of ``doc``.

    Args:
        doc: Sequence of entries whose first item is the word and whose
            second item is its POS tag (any further items are ignored).
        i: Index of the token to describe.

    Returns:
        A list of feature strings: the token's own features, then either the
        previous token's features or the marker ``'BOS'``, then either the
        next token's features or the marker ``'EOS'``.
    """

    def neighbour_features(offset, entry):
        # Features of an adjacent token, prefixed with its relative position
        other_word, other_tag = entry[0], entry[1]
        return [
            offset + ':word.lower=' + other_word.lower(),
            offset + ':word.istitle=%s' % other_word.istitle(),
            offset + ':word.isupper=%s' % other_word.isupper(),
            offset + ':word.isdigit=%s' % other_word.isdigit(),
            offset + ':postag=' + other_tag,
        ]

    token, tag = doc[i][0], doc[i][1]

    # Features every token gets, regardless of position
    feats = ['bias']
    feats.append('word.lower=' + token.lower())
    feats.append('word[-3:]=' + token[-3:])
    feats.append('word[-2:]=' + token[-2:])
    feats.append('word.isupper=%s' % token.isupper())
    feats.append('word.istitle=%s' % token.istitle())
    feats.append('word.isdigit=%s' % token.isdigit())
    feats.append('postag=' + tag)

    # Left context, or the beginning-of-document marker
    if i <= 0:
        feats.append('BOS')
    else:
        feats += neighbour_features('-1', doc[i - 1])

    # Right context, or the end-of-document marker
    if i >= len(doc) - 1:
        feats.append('EOS')
    else:
        feats += neighbour_features('+1', doc[i + 1])

    return feats

In [16]:
from sklearn.model_selection import train_test_split

# Feature extraction for a whole document
def extract_features(doc):
    """Return one feature list per token of ``doc``, in token order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

# A function for generating the list of labels for each document
def get_labels(doc):
    """Return the label (third item) of every (token, postag, label) entry."""
    tags = []
    for _token, _postag, tag in doc:
        tags.append(tag)
    return tags

# One feature sequence and one label sequence per document, kept in the same
# order so X[k] and y[k] describe the same document
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for evaluation. The fixed random_state makes
# the split -- and therefore the trained model and the reported metrics --
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [17]:
import pycrfsuite
# verbose=True makes the trainer print its progress log to the cell output
trainer = pycrfsuite.Trainer(verbose=True)

# Hyper-parameters of the model
crf_params = {
    'c1': 0.1,                             # coefficient for L1 penalty
    'c2': 0.01,                            # coefficient for L2 penalty
    'max_iterations': 200,                 # maximum number of iterations
    # whether to include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}

# Hand every (features, labels) training sequence to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params(crf_params)

# The file name passed to train() is where the fitted model is saved
# once training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13497
Seconds required: 0.031

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5515.608589
Feature norm: 1.000000
Error norm: 6054.401733
Active features: 13091
Line search trials: 1
Line search step: 0.000042
Seconds required for this iteration: 0.014

***** Iteration #2 *****
Loss: 4479.180437
Feature norm: 0.850287
Error norm: 5454.093131
Active features: 13163
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #3 *****
Loss: 3962.836796
Feature norm: 0.821476
Error norm: 12368.018879
Active features: 8549
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #49 *****
Loss: 206.704079
Feature norm: 42.017462
Error norm: 8.584639
Active features: 1970
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #50 *****
Loss: 206.456331
Feature norm: 42.045811
Error norm: 6.619936
Active features: 1955
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.018

***** Iteration #51 *****
Loss: 206.086099
Feature norm: 42.066694
Error norm: 5.816704
Active features: 1943
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #52 *****
Loss: 206.064214
Feature norm: 42.152432
Error norm: 20.505288
Active features: 1903
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.018

***** Iteration #53 *****
Loss: 205.521164
Feature norm: 42.069556
Error norm: 4.637841
Active features: 1915
Line search trials: 1
Line search step: 1.000000
Seconds required for this it

***** Iteration #95 *****
Loss: 201.194498
Feature norm: 43.403589
Error norm: 6.342275
Active features: 1691
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #96 *****
Loss: 201.149213
Feature norm: 43.429111
Error norm: 9.891290
Active features: 1689
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #97 *****
Loss: 201.095350
Feature norm: 43.442906
Error norm: 5.978759
Active features: 1693
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #98 *****
Loss: 201.069694
Feature norm: 43.455542
Error norm: 12.128904
Active features: 1691
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #99 *****
Loss: 201.003294
Feature norm: 43.462485
Error norm: 8.264985
Active features: 1692
Line search trials: 1
Line search step: 1.000000
Seconds required for this it

***** Iteration #148 *****
Loss: 199.534769
Feature norm: 43.381617
Error norm: 5.277373
Active features: 1635
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #149 *****
Loss: 199.524498
Feature norm: 43.372569
Error norm: 5.022343
Active features: 1631
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.009

***** Iteration #150 *****
Loss: 199.513547
Feature norm: 43.375141
Error norm: 5.562369
Active features: 1631
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #151 *****
Loss: 199.500582
Feature norm: 43.373512
Error norm: 3.053317
Active features: 1631
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.019

***** Iteration #152 *****
Loss: 199.492841
Feature norm: 43.376492
Error norm: 5.014246
Active features: 1630
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #196 *****
Loss: 199.087330
Feature norm: 43.309882
Error norm: 5.476982
Active features: 1607
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #197 *****
Loss: 199.075746
Feature norm: 43.306204
Error norm: 4.150150
Active features: 1605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #198 *****
Loss: 199.069727
Feature norm: 43.305572
Error norm: 5.426536
Active features: 1605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #199 *****
Loss: 199.058980
Feature norm: 43.301518
Error norm: 3.903175
Active features: 1603
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.008

***** Iteration #200 *****
Loss: 199.053438
Feature norm: 43.301454
Error norm: 4.996352
Active features: 1603
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [18]:
# Load the model written by the training cell and tag every test sequence
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one sample in the testing set (fixed index, not random)
sample_idx = 12

# The token is recovered from the 'word.lower=<token>' feature, which sits at
# position 1 of every feature list, so the printed words are lowercased.
# maxsplit=1 keeps tokens that themselves contain '=' intact.
sample_words = [feats[1].split("=", 1)[1] for feats in X_test[sample_idx]]

# Dedicated loop names so the notebook-level label list `y` is not overwritten
for word, predicted_tag in zip(sample_words, y_pred[sample_idx]):
    print("%s (%s)" % (word, predicted_tag))

shr (I)
1.22 (I)
dlrs (I)
vs (I)
1.28 (I)
dlrs (I)
net (I)
226.4 (I)
mln (I)
vs (I)
233.9 (I)
mln (I)
assets (I)
80.45 (I)
billion (I)
vs (I)
70.23 (I)
billion (I)
loans (I)
35.16 (I)
billion (I)
vs (I)
35.99 (I)
billion (I)
deposits (I)
45.22 (I)
billion (I)
vs (I)
39.68 (I)
billion (I)
return (I)
on (I)
assets (I)
1.14 (I)
pct (I)
vs (I)
1.35 (I)
pct (I)
return (I)
on (I)
common (I)
equity (I)
18.20 (I)
pct (I)
vs (I)
22.08 (I)
pct (I)
note: (I)
1987 (I)
qtr (I)
net (I)
was (I)
reduced (I)
by (I)
20 (I)
mln (I)
dlrs (I)
because (I)
1.3 (I)
billion (I)
dlrs (I)
of (I)
loans (I)
to (I)
brazil (N)
were (I)
placed (I)
on (I)
non-accrual. (I)
loan (I)
loss (I)
provision (I)
35 (I)
mln (I)
dlrs (I)
vs (I)
70 (I)
mln (I)
year (I)
earlier. (I)


In [19]:
import numpy as np
from sklearn.metrics import classification_report

# Map each tag to the integer class index used in the report
labels = {"N": 1, "I": 0}


def _flatten_to_indices(sequences):
    """Concatenate the tag sequences into one 1-dimensional array of indices."""
    flat = []
    for sequence in sequences:
        flat.extend(labels[tag] for tag in sequence)
    return np.array(flat)


predictions = _flatten_to_indices(y_pred)
truths = _flatten_to_indices(y_test)

# Print out the classification report; target_names is ordered by class
# index, so "I" (0) comes before "N" (1)
report = classification_report(truths, predictions, target_names=["I", "N"])
print(report)

              precision    recall  f1-score   support

           I       0.98      0.97      0.98      2832
           N       0.81      0.88      0.85       396

    accuracy                           0.96      3228
   macro avg       0.90      0.93      0.91      3228
weighted avg       0.96      0.96      0.96      3228

