In [1]:
# Optional one-time setup: uncomment to install the third-party packages imported below.
# %pip install sklearn_crfsuite eli5

In [2]:
import eli5
import nltk
import scipy.stats
import sklearn
import sklearn_crfsuite

from itertools import chain
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

This means that if you install LightGBM from PyPI with the ``pip install lightgbm`` command, you no longer need to install the gcc compiler.
Instead, you need to install the OpenMP library, which is required to run LightGBM on systems that use the Apple Clang compiler.
You can install the OpenMP library with the following command: ``brew install libomp``.


### Загрузим данные:

In [3]:
# The CoNLL-2002 corpus is not shipped with NLTK itself. On a fresh environment
# the corpus reader below fails with LookupError, so fetch the corpus once if it
# is not installed yet (no network access is attempted when it already is).
try:
    nltk.data.find('corpora/conll2002')
except LookupError:
    nltk.download('conll2002')

# Materialise the Spanish train / test splits as plain lists of sentences.
# Each sentence is a list of (token, POS tag, IOB label) triples, as the
# displayed first training sentence shows.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))
train_sents[0]

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

### Добавим фичи для каждого слова, чтобы обучить CRF (смотри лекцию:)):

In [4]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of token tuples whose first two items are the word
    and its POS tag. The result always contains ``bias``, ``postag`` and
    ``word_len``. For every neighbour that exists it records whether that
    neighbour's word is title-cased; a missing left / right neighbour is
    flagged with ``BOS`` / ``EOS`` instead.
    """
    current = sent[i]
    word, postag = current[0], current[1]

    # Features of the current token.
    features = {
        'bias': 1.0,
        'postag': postag,
        'word_len': len(word),
        ### YOUR CODE HERE
    }

    # Left context: the first token of the sentence has none.
    if i <= 0:
        features['BOS'] = True
    else:
        # prev_postag is not used yet; it is kept available for exercise features.
        prev_word, prev_postag = sent[i - 1][0], sent[i - 1][1]
        features['-1:word.istitle()'] = prev_word.istitle()
        ### YOUR CODE HERE

    # Right context: the last token of the sentence has none.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        # next_postag is not used yet; it is kept available for exercise features.
        next_word, next_postag = sent[i + 1][0], sent[i + 1][1]
        features['+1:word.istitle()'] = next_word.istitle()
        ### YOUR CODE HERE

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third item) of every (token, postag, label) triple in ``sent``."""
    labels_out = []
    for _token, _postag, label in sent:
        labels_out.append(label)
    return labels_out

def sent2tokens(sent):
    """Return the token (first item) of every (token, postag, label) triple in ``sent``."""
    tokens_out = []
    for token, _postag, _label in sent:
        tokens_out.append(token)
    return tokens_out

# Turn each corpus split into per-sentence feature sequences (X_*) and the
# parallel per-sentence label sequences (y_*).
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

### Посмотрим на пример фичей для одного слова:

In [5]:
# Feature dict of the token at index 1 in the first training sentence.
X_train[0][1]

{'bias': 1.0,
 'postag': 'Fpa',
 'word_len': 1,
 '-1:word.istitle()': True,
 '+1:word.istitle()': True}

### Обучим CRF:

In [6]:
%%time
### YOUR CODE HERE (Probably you will change some hyperparameters)
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0,
    c2=0,
    max_iterations=50,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 8.08 s, sys: 78.8 ms, total: 8.16 s
Wall time: 8.45 s


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0, c2=0, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=50,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

### Посмотрим на веса признаков:

In [7]:
# Render the learned transition weights and the per-label feature weights of the fitted CRF.
eli5.show_weights(crf, top=30)

From \ To,O,B-LOC,I-LOC,B-MISC,I-MISC,B-ORG,I-ORG,B-PER,I-PER
O,2.557,2.266,-2.455,2.364,-3.135,3.636,-3.695,2.773,-3.245
B-LOC,0.993,-0.463,3.373,-0.35,-0.639,-0.577,-0.964,-0.542,-0.787
I-LOC,0.266,-0.292,2.79,-0.19,-0.345,-0.321,-0.508,-0.332,-0.425
B-MISC,-0.907,-0.344,-0.322,-0.235,3.666,-0.336,-0.678,-0.386,-0.549
I-MISC,-0.299,-0.491,-0.399,-0.286,4.418,-0.504,-0.862,-0.504,-0.73
B-ORG,0.596,-0.592,-0.577,-0.46,-0.834,-0.754,4.104,-0.614,-0.998
I-ORG,0.196,-0.803,-0.584,-0.493,-0.862,-0.794,4.472,-0.774,-1.061
B-PER,0.312,-0.638,-0.524,-0.414,-0.782,-0.656,-1.155,-0.784,4.45
I-PER,0.549,-0.458,-0.47,-0.336,-0.653,-0.577,-0.987,-0.573,2.525

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8
+4.236,bias,,,,,,,
+4.191,postag:Fc,,,,,,,
+3.170,EOS,,,,,,,
+2.435,postag:Fp,,,,,,,
+1.909,postag:CC,,,,,,,
+1.339,postag:DA,,,,,,,
+1.236,postag:Fpa,,,,,,,
+0.961,postag:Fg,,,,,,,
+0.694,postag:DI,,,,,,,
+0.667,postag:RG,,,,,,,

Weight?,Feature
+4.236,bias
+4.191,postag:Fc
+3.170,EOS
+2.435,postag:Fp
+1.909,postag:CC
+1.339,postag:DA
+1.236,postag:Fpa
+0.961,postag:Fg
+0.694,postag:DI
+0.667,postag:RG

Weight?,Feature
1.518,BOS
1.364,postag:NP
0.411,postag:VMI
0.334,postag:VMM
0.301,postag:NC
0.213,postag:VMN
0.156,+1:word.istitle()
0.109,word_len
0.071,postag:AQ
0.001,postag:PD

Weight?,Feature
1.956,-1:word.istitle()
0.998,postag:SP
0.553,postag:NC
0.406,+1:word.istitle()
0.31,bias
0.244,postag:AQ
0.087,postag:DA
0.022,postag:VMM
0.021,postag:VMS
0.003,postag:VAS

Weight?,Feature
+0.558,postag:NC
+0.492,+1:word.istitle()
+0.429,postag:Fe
+0.308,postag:Z
+0.117,postag:NP
+0.105,postag:AO
+0.101,postag:VMN
+0.084,postag:DN
+0.074,word_len
+0.028,postag:VMM

Weight?,Feature
+1.354,-1:word.istitle()
+1.044,postag:SP
+0.387,postag:Fe
+0.316,postag:Z
+0.165,postag:DA
+0.051,postag:AQ
+0.033,word_len
+0.026,postag:AO
+0.017,postag:VMM
+0.017,postag:VMN

Weight?,Feature
2.056,postag:NP
0.938,postag:NC
0.433,postag:AQ
0.331,BOS
0.22,+1:word.istitle()
0.215,postag:VMM
0.141,postag:VMI
0.029,postag:Y
0.024,postag:PN
0.013,postag:I

Weight?,Feature
1.974,-1:word.istitle()
1.672,postag:SP
0.377,postag:AQ
0.273,postag:NC
0.2,postag:DA
0.076,word_len
0.025,postag:Fe
0.024,postag:VMN
0.014,postag:VMM
0.006,postag:DN

Weight?,Feature
1.592,+1:word.istitle()
1.476,postag:VMI
1.171,BOS
0.928,postag:NP
0.819,postag:AQ
0.137,postag:VMN
0.128,postag:I
0.068,word_len
0.058,postag:VMS
0.044,postag:NC

Weight?,Feature
3.615,-1:word.istitle()
1.343,postag:AQ
0.987,postag:NC
0.08,+1:word.istitle()
0.036,postag:VMS
0.024,postag:I
0.022,postag:VMM
0.007,postag:VMP
0.004,postag:VMG
0.001,postag:Fz


### Посчитаем предсказание на тесте:

In [8]:
# Copy the label set learned by the CRF and drop the 'O' tag, so that the
# metrics computed with `labels=labels` cover the remaining tags only.
labels = list(crf.classes_)
labels.remove('O')  # raises ValueError if the model never saw an 'O' label
labels

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [9]:
# Predict a label sequence for every test sentence, then report the weighted
# F1 score restricted to the tags in `labels`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

0.25880458888777086

### А теперь отдельно для каждого тэга:

In [10]:
# Order the tags by entity type first and by their B/I prefix second, so the
# B-X and I-X rows of one entity type are adjacent in the report.
sorted_labels = list(labels)
sorted_labels.sort(key=lambda tag: (tag[1:], tag[0]))

print(
    metrics.flat_classification_report(
        y_test,
        y_pred,
        labels=sorted_labels,
        digits=3,
    )
)

              precision    recall  f1-score   support

       B-LOC      0.377     0.024     0.045      1084
       I-LOC      0.125     0.006     0.012       325
      B-MISC      0.000     0.000     0.000       339
      I-MISC      0.055     0.016     0.025       557
       B-ORG      0.438     0.129     0.199      1400
       I-ORG      0.447     0.381     0.412      1104
       B-PER      0.556     0.531     0.543       735
       I-PER      0.589     0.681     0.632       634

   micro avg      0.479     0.236     0.316      6178
   macro avg      0.323     0.221     0.233      6178
weighted avg      0.383     0.236     0.259      6178



### Посмотрим на наиболее и наименее вероятные переходы модели: 

In [11]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; the weight is printed with six decimal places.
    """
    line_format = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        label_from, label_to = transition
        print(line_format % (label_from, label_to, weight))

# Build the weight-ranked view of the learned transitions once and reuse it
# for both ends of the ranking.
transition_counts = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_counts.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_counts.most_common()[-20:])

Top likely transitions:
I-ORG  -> I-ORG   4.471536
B-PER  -> I-PER   4.449507
I-MISC -> I-MISC  4.418327
B-ORG  -> I-ORG   4.104059
B-MISC -> I-MISC  3.665665
O      -> B-ORG   3.636294
B-LOC  -> I-LOC   3.373225
I-LOC  -> I-LOC   2.789625
O      -> B-PER   2.772881
O      -> O       2.557448
I-PER  -> I-PER   2.524761
O      -> B-MISC  2.363785
O      -> B-LOC   2.266030
B-LOC  -> O       0.992717
B-ORG  -> O       0.596311
I-PER  -> O       0.549242
B-PER  -> O       0.311946
I-LOC  -> O       0.266255
I-ORG  -> O       0.195644
I-LOC  -> B-MISC  -0.189555

Top unlikely transitions:
B-ORG  -> B-ORG   -0.753908
I-ORG  -> B-PER   -0.774496
B-PER  -> I-MISC  -0.781734
B-PER  -> B-PER   -0.784387
B-LOC  -> I-PER   -0.786924
I-ORG  -> B-ORG   -0.794411
I-ORG  -> B-LOC   -0.802640
B-ORG  -> I-MISC  -0.833858
I-MISC -> I-ORG   -0.861944
I-ORG  -> I-MISC  -0.862247
B-MISC -> O       -0.906669
B-LOC  -> I-ORG   -0.963950
I-PER  -> I-ORG   -0.987033
B-ORG  -> I-PER   -0.997563
I-ORG  -> I-PER 