In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
# Use matplotlib's 'ggplot' style sheet for the figures in this notebook.
plt.style.use('ggplot')

In [2]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [3]:
%%time
# Materialize the CoNLL-2002 Spanish files 'esp.train' and 'esp.testb' as lists
# of sentences; each sentence is a list of (token, POS tag, IOB label) triples.
train_sents, test_sents = (
    list(nltk.corpus.conll2002.iob_sents(fileid))
    for fileid in ('esp.train', 'esp.testb')
)

CPU times: user 1.03 s, sys: 38.9 ms, total: 1.07 s
Wall time: 1.07 s


In [4]:
def sent2labels(sent):
    """Return the POS tag (second field) of every item in ``sent``.

    ``sent`` is an iterable of ``(token, postag, label)`` triples. Despite the
    function's name, the third field is ignored: the POS tags are what this
    notebook feeds to the CRF as the target sequence.
    """
    tags = []
    for _token, pos, _label in sent:
        tags.append(pos)
    return tags

def sent2tokens(sent):
    """Return the surface token (first field) of every item in ``sent``.

    ``sent`` is an iterable of ``(token, postag, label)`` triples; the POS tag
    and the label are discarded.
    """
    words = []
    for word, _pos, _label in sent:
        words.append(word)
    return words

In [5]:
# Sanity check: print the POS tag and then the token of the first word of the
# first training sentence. (A commented-out sent2features preview used to sit
# here; no sent2features is defined in the visible part of this notebook.)
print(sent2labels(train_sents[0])[0])
print(sent2tokens(train_sents[0])[0])

NP
Melbourne


In [6]:
# Display the first training sentence: a list of (token, POS tag, IOB label) triples.
train_sents[0]

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [7]:
%%time
# Inputs are the raw token sequences; targets are the matching POS-tag sequences.
# NOTE(review): each item handed to the CRF is a plain token string, not a
# feature dict. python-crfsuite presumably iterates a non-dict item as a list
# of attribute names, which for a str would mean one attribute per character --
# confirm, and consider building per-token feature dicts instead.
X_train = list(map(sent2tokens, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2tokens, test_sents))
y_test = list(map(sent2labels, test_sents))

CPU times: user 36.5 ms, sys: 0 ns, total: 36.5 ms
Wall time: 36.1 ms


In [8]:
%%time
# Configure a CRF trained with the L-BFGS algorithm.
# Per the sklearn-crfsuite docs, c1 and c2 are the L1 and L2 regularization
# coefficients, max_iterations caps the number of optimizer iterations, and
# all_possible_transitions=True also generates transition features for label
# pairs that never occur in the training data.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
# Fit on the token sequences with POS tags as targets
# (about 2.5 minutes of wall time in the recorded run).
crf.fit(X_train, y_train)

CPU times: user 2min 36s, sys: 44.6 ms, total: 2min 36s
Wall time: 2min 36s




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [9]:
# Pre-tokenized demo sentence. The word 'bajo' occurs four times, presumably to
# probe how the tagger handles an ambiguous word in different contexts.
sentence = ['el', 'hombre', 'bajo','canta', 'bajo', 'el', 'Puente', 'bajo', 'tocando','el','bajo','en','la','escalera', 
            'baja']

          
def pos_tag(sentence, model=None):
    """Tag a tokenized sentence and pair each token with its predicted tag.

    Args:
        sentence: iterable of token strings. The tokens are passed to the
            model as-is, i.e. in the same raw-token form this notebook uses
            for training.
        model: object exposing ``predict(list_of_sequences)``. Defaults to the
            notebook-level ``crf`` so existing ``pos_tag(sentence)`` calls
            behave as before.

    Returns:
        A list of ``(token, tag)`` pairs, one per token; ``[]`` for an empty
        sentence.
    """
    tokens = list(sentence)
    if not tokens:
        # Nothing to tag; skip the model call entirely.
        return []
    tagger = crf if model is None else model
    # predict() works on a batch of sequences, so wrap the sentence in a list
    # and unwrap the single result.
    tags = tagger.predict([tokens])[0]
    return list(zip(tokens, tags))
 
# Print the (token, predicted tag) pairs for the demo sentence.
print(pos_tag(sentence))

[('el', 'PP'), ('hombre', 'VAI'), ('bajo', 'VMP'), ('canta', 'SP'), ('bajo', 'NC'), ('el', 'DA'), ('Puente', 'NC'), ('bajo', 'VMI'), ('tocando', 'VMG'), ('el', 'DA'), ('bajo', 'NC'), ('en', 'SP'), ('la', 'DA'), ('escalera', 'NC'), ('baja', 'AQ')]


In [10]:
# Label set exposed by the fitted CRF (crf.classes_); the metric cells below
# pass it as the `labels` argument.
labels = list(crf.classes_)

In [11]:
# Tag the test sentences, then compute a weighted-average F1 restricted to
# `labels`. The value is shown as the cell output.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

  average, "true nor predicted", 'F-score is', len(true_sum)


0.8222041210771038

In [12]:
# Order the report rows by everything after the first character of the tag,
# then by the first character.
# NOTE(review): this key looks inherited from grouping B-/I- entity labels; the
# labels here are POS tags, so plain sorted(labels) may read better -- confirm.
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           I      0.750     0.130     0.222        23
           Y      0.000     0.000     0.000         3
           Z      0.998     0.991     0.994      1080
          P0      0.856     0.953     0.902       532
          DA      0.926     0.978     0.952      5034
         VAI      0.923     0.963     0.943       299
         VAM      0.000     0.000     0.000         0
         VAN      0.842     0.842     0.842        19
         VAP      0.000     0.000     0.000         1
         VAS      0.826     0.731     0.776        26
          CC      0.950     0.928     0.939      1198
          NC      0.755     0.851     0.800     12347
          DD      0.879     0.920     0.899       364
          PD      0.750     0.469     0.577        32
          RG      0.731     0.517     0.606      1037
          DI      0.765     0.802     0.783      1090
          PI      0.784     0.357     0.491       112
         VMG      0.554    