* Open data
* Build baseline

In [1]:
PATH = "../ner-uk/"

# Read tokens and positions of tokens from a file

def read_tokens(filename):
    tokens = []
    pos = 0
    with open(filename, "r") as f:
        text = f.read().split("\n")
        for line in text:
            if len(line) == 0:
                pos += 1
            else:
                tokens.append(("<S>", pos, pos))
                for token in line.split(" "):
                    tokens.append((token, pos, pos + len(token)))
                    pos += len(token) + 1
                tokens.append(("</S>", pos, pos))
    return tokens

# Read annotations and positions of annotations from a file

def read_annotations(filename):
    anno = []
    with open(filename, "r") as f:
        for line in f.readlines():
            annotations = line.split()
            anno.append((annotations[1], int(annotations[2]), int(annotations[3])))
    return anno

# Using positions of tokens and annotations, extract a list of token labels

def extract_labels(anno, tokens):
    labels = []
    ann_id = 0
    for token in tokens:
        if ann_id < len(anno):
            label, beg, end = anno[ann_id]
            if token[0] in ["<S>", "</S>"]:
                labels.append("--")
            elif token[1] < beg:
                labels.append("--")
            else:
                if token[1] == beg:
                    labels.append("B-" + label)
                else:
                    labels.append("I-" + label)
                if token[2] == end:
                    ann_id += 1
        else:
            labels.append("--")    
    return labels

# tokens = read_tokens(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.txt")
# anno = read_annotations(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.ann")
# labels = extract_labels(anno, tokens)

# for i, j in zip(tokens, labels):
#     print(i[0], j)

# Extract list of files for training and testing

dev_test = {"dev": [], "test": []}
category = ""
with open(PATH + "doc/dev-test-split.txt", "r") as f:
    for line in f.readlines():
        line = line.strip()
        if line in ["DEV", "TEST"]:
            category = line.lower()
        elif len(line) == 0:
            continue
        else:
            dev_test[category].append(line)

print(len(dev_test["dev"]), len(dev_test["test"]))

# Get train and test data and labels

train_tokens, test_tokens, train_labels, test_labels = [], [], [], []

for filename in dev_test["dev"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        train_tokens += [token[0] for token in tokens]
        train_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass

for filename in dev_test["test"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        test_tokens += [token[0] for token in tokens]
        test_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass

156 73


['<S>', 'На', 'довірливих', 'кіровоградців', 'полюють', 'шахраї', 'та', 'фірми-посередники', ',', 'які', 'за', '1000', 'грн', '.', 'готові', '«', 'виготовити', '»', 'біометричний', 'паспорт', ',', 'який', 'коштує', '518', 'грн', '.', '</S>', '<S>', 'Із', 'запровадженням', 'біометричних', 'паспортів', 'активізувалися', 'шахраї', 'та', 'фірми-посередники', ',', 'які', 'пропонують', '«', 'прискорити', '»', 'оформлення', 'біометричного', 'паспорта', 'або', 'просто', 'оформити', 'цей', 'документ', '–', 'повідомляють', 'Першій', 'електронній', 'в', 'прес-службі', 'УДМС', 'України', 'в', 'Кіровоградській', 'області', '.', '</S>', '<S>', 'Розцінки', 'на', 'послуги', 'таких', 'посередників', 'починаються', 'від', '1000', 'грн', '.', '</S>', '<S>', 'Закінчується', '«', 'біометрична', 'афера', '»', 'в', 'кращому', 'випадку', 'звичайним', 'оформленням', 'документа', 'у', 'міграційній', 'службі', 'у', 'встановлений', 'законодавством', 'строк', ',', 'у', 'гіршому', '–', 'втратою', 'коштів']
['--', '

In [5]:
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import classification_report
from pymorphy2 import MorphAnalyzer
import string

# Rules

def first_in_sentence(i, tokens):
    if tokens[i-1] == '<S>':
        return True
    else: return False

def is_title_case(i, tokens):
    return tokens[i].title() == tokens[i]

def is_upper_case(i, tokens):
    return tokens[i].upper() == tokens[i]

def is_punct(i, tokens):
    return tokens[i] in string.punctuation

def is_digit(i, tokens):
    digits = '0123456789,.'
    
    for c in tokens[i]:
        if c not in digits:
            return False
    return True

def lemma(i, tokens, morph):
    try:
        return str(morph.parse(tokens[i])[0].normal_form.lower())
    except IndexError:
        return '<NA>'
    
def pos(i, tokens, morph):
    try:
        return str(morph.parse(tokens[i])[0].tag.POS)
    except IndexError:
        return '<NA>'



def my_little_baseline(tokens):
    
    labels = []
    
    for i in range(0, len(tokens)):
        if is_title_case(i, tokens):
            labels.append('B-ПЕРС')
        else:
            labels.append('--')
    
    return labels

def feature_extractor(i, tokens):
    """:rtype {feature: value, ...}"""
    
    global morph
    
    features = {}
    
    features['first_in_sentence'] = first_in_sentence(i, tokens)
    features['is_title_case'] = is_title_case(i, tokens)
    features['is_upper_case'] = is_upper_case(i, tokens)  
    features['is_punct'] = is_punct(i, tokens)
    features['is_digit'] = is_digit(i, tokens)
    
    features['lemma-1'] = lemma(i-1, tokens, morph)
    features['lemma'] = lemma(i, tokens, morph)
    features['lemma+1'] = lemma(i+1, tokens, morph)
    
    features['pos-1'] = pos(i-1, tokens, morph)
    features['pos'] = pos(i, tokens, morph)
    features['pos+1'] = pos(i+1, tokens, morph)

    # lemma-1, lemma-2. lemma+1, lemma+2
    # pos-1, pos-2, pos+1, pos+2
    
    # print(features)
    
    return features


def vectorize(tokens, fit):
    
    feature_list = []
    global vectorizer
    
    for i in range(0, len(tokens)):
        feature_list.append(feature_extractor(i, tokens))
    
    if fit:
        vec = vectorizer.fit_transform(feature_list) #.toarray()
    else:
        vec = vectorizer.transform(feature_list) #.toarray()
    # print("\nTotal number of features: ", len(vec.get_feature_names()))
    
    return vec

morph = MorphAnalyzer(lang='uk')
vectorizer = DictVectorizer()

X_train = vectorize(train_tokens, fit=True)
X_test = vectorize(test_tokens, fit=False)
Y_train = train_labels
Y_test = test_labels

# pred_labels = my_little_baseline(test_tokens)
# print(classification_report(test_labels, pred_labels))

In [6]:
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn import svm

from sklearn.metrics import classification_report

def dtree_classify(X_train, X_test, Y_train, Y_test):
    """Run classification with Decision trees"""
    
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    
    print('DTree Classification:')
    print(classification_report(Y_test, Y_pred))
    
    return clf
    
def nbayes_classify(X_train, X_test, Y_train, Y_test):
    clf = GaussianNB()
    clf = clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    
    print('Gaussian Naive Bayes Classification:')
    print(classification_report(Y_test, Y_pred))

    return clf
    
def knn_classify(X_train, X_test, Y_train, Y_test):
    clf = KNeighborsClassifier(n_neighbors=3)
    clf = clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    
    print('kNN Classification:')
    print(classification_report(Y_test, Y_pred))

    return clf

def logreg_classify(X_train, X_test, Y_train, Y_test):
    clf = LogisticRegression(random_state=0, solver='lbfgs',
                             multi_class='multinomial')
    clf = clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    
    print('LogReg Classification:')
    print(classification_report(Y_test, Y_pred))

    return clf

def svm_classify(X_train, X_test, Y_train, Y_test):
    clf = svm.SVC(gamma='scale')
    clf = clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    
    print('SVM Classification:')
    print(classification_report(Y_test, Y_pred))

    return clf

def perceptron_classify(X_train, X_test, Y_train, Y_test):
    clf = Perceptron()
    clf = clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    
    print('Perceptron Classification:')
    print(classification_report(Y_test, Y_pred))

    return clf

dtree_classify(X_train, X_test, Y_train, Y_test)
# nbayes_classify(X_train, X_test, Y_train, Y_test)
# knn_classify(X_train, X_test, Y_train, Y_test)
logreg_classify(X_train, X_test, Y_train, Y_test)
# svm_classify(X_train, X_test, Y_train, Y_test)
perceptron_classify(X_train, X_test, Y_train, Y_test)

DTree Classification:
              precision    recall  f1-score   support

          --       0.93      0.97      0.95     69817
       B-ЛОК       0.51      0.47      0.49       414
       B-ОРГ       0.35      0.27      0.30       230
      B-ПЕРС       0.58      0.75      0.65      1190
      B-РІЗН       0.36      0.44      0.40       178
       I-ЛОК       0.13      0.03      0.05      1071
       I-ОРГ       0.55      0.05      0.09      1958
      I-ПЕРС       0.15      0.08      0.10      2808
      I-РІЗН       0.07      0.06      0.07       377

   micro avg       0.89      0.89      0.89     78043
   macro avg       0.40      0.35      0.34     78043
weighted avg       0.86      0.89      0.87     78043





LogReg Classification:
              precision    recall  f1-score   support

          --       0.92      1.00      0.96     69817
       B-ЛОК       0.59      0.62      0.61       414
       B-ОРГ       0.34      0.08      0.13       230
      B-ПЕРС       0.66      0.76      0.71      1190
      B-РІЗН       0.35      0.43      0.39       178
       I-ЛОК       0.72      0.02      0.04      1071
       I-ОРГ       0.76      0.04      0.07      1958
      I-ПЕРС       0.55      0.05      0.09      2808
      I-РІЗН       0.35      0.02      0.04       377

   micro avg       0.91      0.91      0.91     78043
   macro avg       0.58      0.34      0.34     78043
weighted avg       0.89      0.91      0.88     78043





Perceptron Classification:
              precision    recall  f1-score   support

          --       0.92      0.97      0.94     69817
       B-ЛОК       0.56      0.62      0.59       414
       B-ОРГ       0.38      0.29      0.33       230
      B-ПЕРС       0.77      0.49      0.60      1190
      B-РІЗН       0.52      0.08      0.14       178
       I-ЛОК       0.14      0.02      0.04      1071
       I-ОРГ       0.83      0.02      0.04      1958
      I-ПЕРС       0.13      0.06      0.08      2808
      I-РІЗН       0.05      0.15      0.08       377

   micro avg       0.88      0.88      0.88     78043
   macro avg       0.48      0.30      0.32     78043
weighted avg       0.86      0.88      0.86     78043



Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=None, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=None,
      validation_fraction=0.1, verbose=0, warm_start=False)

In [43]:
s1 = ['1000']
s2 = ['1.00']
s3 = ['1,000q']

is_digit(0, s3)

False