In [1]:
PATH = "d:\\git-nlp\\ner-uk\\"

In [2]:
# Read tokens and positions of tokens from a file

def read_tokens(filename):
    tokens = []
    pos = 0
    with open(filename, "r", encoding='utf-8') as f:
        text = f.read().split("\n")
        for line in text:
            if len(line) == 0:
                pos += 1
            else:
                tokens.append(("<S>", pos, pos))
                for token in line.split(" "):
                    tokens.append((token, pos, pos + len(token)))
                    pos += len(token) + 1
                tokens.append(("</S>", pos, pos))
    return tokens


In [3]:
# Read annotations and positions of annotations from a file

def read_annotations(filename):
    anno = []
    with open(filename, "r", encoding='utf-8') as f:
        for line in f.readlines():
            annotations = line.split()
            #print(annotations)
            anno.append((annotations[1], int(annotations[2]), int(annotations[3])))
    return anno


In [4]:
# Using positions of tokens and annotations, extract a list of token labels
def cyr_to_lat(label:str):
    res = label.replace('ОРГ', 'ORG').replace('ЛОК', 'LOC').replace('ПЕРС', 'PERS').replace('РІЗН', 'MISC')
    return res

def extract_labels(anno, tokens):
    labels = []
    ann_id = 0
    for token in tokens:
        if ann_id < len(anno):
            label, beg, end = anno[ann_id]
            label = cyr_to_lat(label)
            if token[0] in ["<S>", "</S>"]:
                labels.append("--")
            elif token[1] < beg:
                labels.append("--")
            else:
                if token[1] == beg:
                    labels.append("B-" + label)
                else:
                    labels.append("I-" + label)
                if token[2] == end:
                    ann_id += 1
        else:
            labels.append("--")    
    return labels


In [5]:
tokens = read_tokens(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.txt")
anno = read_annotations(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.ann")
labels = extract_labels(anno, tokens)

In [6]:
for i, j in zip(tokens, labels):
    print(i[0], j)

<S> --
Історія --
змін --
. --
</S> --
<S> --
Спільними --
зусиллями --
влада --
та --
громадськість --
врятували --
й --
повертають --
до --
життя --
Київський B-ORG
велотрек I-ORG
</S> --
<S> --
Київський B-ORG
велотрек I-ORG
« I-ORG
Авангард I-ORG
» I-ORG
по --
вул B-LOC
. I-LOC
Богдана I-LOC
Хмельницького I-LOC
, I-LOC
58-А I-LOC
, --
що --
збудований --
у --
1913 --
році --
за --
ініціативи --
та --
кошти --
киян --
, --
відновлюється --
так --
само --
— --
силами --
громади --
і --
без --
фінансування --
з --
бюджету --
. --
</S> --
<S> --
А --
за --
відчутної --
підтримки --
влади --
реконструкція --
набирає --
обертів --
. --
</S> --
<S> --
« --
Ще --
недавно --
велотрек --
існував --
тільки --
у --
мріях --
ентузіастів --
велоруху --
, --
а --
вже --
зараз --
він --
стрімко --
набирає --
реалістичних --
контурів --
, --
— --
радіє --
голова --
Шевченківської B-ORG
райдержадміністрації I-ORG
Олег B-PERS
Гаряга I-PERS
. --
— --
Ми --
сподіваємося --
, --
що --
вже --
за --
півто

In [7]:
# Extract list of files for training and testing

dev_test = {"dev": [], "test": []}
category = ""
with open(PATH + "doc/dev-test-split.txt", "r") as f:
    for line in f.readlines():
        line = line.strip()
        if line in ["DEV", "TEST"]:
            category = line.lower()
        elif len(line) == 0:
            continue
        else:
            dev_test[category].append(line)


In [8]:
print(len(dev_test["dev"]), len(dev_test["test"]))

156 73


In [9]:
# Get train and test data and labels

train_tokens, test_tokens, train_labels, test_labels = [], [], [], []

for filename in dev_test["dev"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        train_tokens += [token[0] for token in tokens]
        train_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass

for filename in dev_test["test"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        test_tokens += [token[0] for token in tokens]
        test_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer


## https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [24]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from bpemb import BPEmb

In [89]:
bpemb_uk = BPEmb(lang="uk", dim=100)

def calc_emb(text):
    res = np.zeros(bpemb_uk.vectors.shape[1], dtype=np.float32)
    # tokens = word_tokenize(text)
    # for t in tokens:
    embs = bpemb_uk.embed(text)
    for e in embs:
        res += e
    n = len(embs)
    if n:
        res /= n
    return res/2

def word2features(tokens, labels, i):
    word = tokens[i]
    # print(word)
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:-3]': word[:-3],
        'word[:-2]': word[:-2],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    #emb = calc_emb(word)
    #emb_features = {f'e{k}':v for k, v in enumerate(emb)}
    #features.update(emb_features)
    if i > 0 and tokens[i-1]!='<S>':
        word1 = tokens[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:label': labels[i-1],
            '-1:qstart': word1=='«',
        })
    else:
        features['BOS'] = True
        features['-1:label']='--'

    if i < len(tokens)-1 and tokens[i+1]!='</S>':
        word1 = tokens[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:qend': word1=='»',
        })
    else:
        features['EOS'] = True

    return features


def tokens2features(tokens, labels):
    return [word2features(tokens, labels, i) for i in range(len(tokens))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [90]:
# emb = calc_emb('слово')
# print(emb)
# emb_features = {f'e{k}':v for k, v in enumerate(emb)}

tokens2features(train_tokens, train_labels)[:3]

[{'bias': 1.0,
  'word.lower()': '<s>',
  'word[-3:]': '<S>',
  'word[-2:]': 'S>',
  'word[:-3]': '',
  'word[:-2]': '<',
  'word.isupper()': True,
  'word.istitle()': True,
  'word.isdigit()': False,
  'BOS': True,
  '-1:label': '--',
  '+1:word.lower()': 'на',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:qend': False},
 {'bias': 1.0,
  'word.lower()': 'на',
  'word[-3:]': 'На',
  'word[-2:]': 'На',
  'word[:-3]': '',
  'word[:-2]': '',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'BOS': True,
  '-1:label': '--',
  '+1:word.lower()': 'довірливих',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:qend': False},
 {'bias': 1.0,
  'word.lower()': 'довірливих',
  'word[-3:]': 'вих',
  'word[-2:]': 'их',
  'word[:-3]': 'довірли',
  'word[:-2]': 'довірлив',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  '-1:word.lower()': 'на',
  '-1:word.istitle()': True,
  '-1:word.isupper()': Fals

In [91]:
X_train = [tokens2features(train_tokens, train_labels)]
y_train = [train_labels]

X_test = [tokens2features(test_tokens, test_labels)]
y_test = [test_labels]

In [92]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=1
)
crf.fit(X_train, y_train)


loading training data to CRFsuite:   0%|                                                         | 0/1 [00:00<?, ?it/s]
loading training data to CRFsuite: 100%|█████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.20s/it]


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 170075
Seconds required: 0.638

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.34  loss=131442.13 active=169619 feature_norm=1.00
Iter 2   time=0.17  loss=69777.51 active=167405 feature_norm=2.94
Iter 3   time=0.17  loss=62839.88 active=164458 feature_norm=2.74
Iter 4   time=0.50  loss=45256.84 active=156919 feature_norm=2.26
Iter 5   time=0.17  loss=39771.95 active=147869 feature_norm=2.62
Iter 6   time=0.17  loss=32431.76 active=85497 feature_norm=4.54
Iter 7   time=0.17  loss=27277.89 active=83040 feature_norm=4.95
Iter 8   time=0.18  loss=23400.72 active=84121 feature_norm=5.74
Iter 9   time=0.17  loss=20347.11 active=75742 feature_norm=6.42
Iter 10

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=1)

In [93]:
labels = list(crf.classes_)
labels.remove('--')
labels

['B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-PERS', 'I-PERS']

In [94]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.8738412312645141

In [95]:
# group B and I results
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.831     0.751     0.789       413
       I-LOC      0.967     0.768     0.856      1071
      B-MISC      0.646     0.580     0.611       176
      I-MISC      0.970     0.939     0.954       375
       B-ORG      0.615     0.482     0.541       228
       I-ORG      0.991     0.715     0.831      1954
      B-PERS      0.843     0.784     0.812      1155
      I-PERS      0.979     0.985     0.982      2795

   micro avg      0.935     0.826     0.878      8167
   macro avg      0.855     0.750     0.797      8167
weighted avg      0.936     0.826     0.874      8167

