In [None]:
! pip install sklearn-crfsuite

In [None]:
# test data
# Read the tab-separated test split into `test_data`: a list of sentences,
# where each sentence is a list of (word, tag) tuples.
with open('hi-ud-test.conllu', encoding="utf-8") as in_file:
  data = in_file.readlines()

test_sent = []
test_data = []
for row in data[1:]:  # data[0] is skipped -- presumably a header row; confirm
  if row.rstrip('\n') == '\t\t':
    # A row made only of field separators closes the current sentence.
    test_data.append(test_sent)
    test_sent = []
    continue

  _, word, tag = row.strip().split('\t')
  if 'PUNCT' in row and word == ",":
    # A comma token on a PUNCT row is always stored with the PUNCT tag.
    tag = 'PUNCT'
  # Every token of this file belongs to the *test* sentence; appending
  # punctuation to `train_sent` here would drop it from the test set.
  test_sent.append((word, tag))

# Keep the last sentence when the file does not end with a separator row.
if test_sent:
  test_data.append(test_sent)

In [None]:
# train data
# Read the comma-separated training split into `train_data`: a list of
# sentences, where each sentence is a list of (word, tag) tuples.
with open('hi-ud-train.conllu', encoding="utf-8") as in_file:
  data = in_file.readlines()

train_sent = []
train_data = []
for row in data[1:]:  # data[0] is skipped -- presumably a header row; confirm
  if row.rstrip('\n') == ',,':
    # A row made only of field separators closes the current sentence.
    train_data.append(train_sent)
    train_sent = []
  elif 'COMMA' in row:
    # Rows containing COMMA are stored as a literal comma tagged PUNCT
    # (presumably a placeholder, since ',' is this file's delimiter -- confirm).
    train_sent.append((',', 'PUNCT'))
  else:
    _, word, tag = row.strip().split(',')
    train_sent.append((word, tag))

# Keep the last sentence when the file does not end with a separator row.
if train_sent:
  train_data.append(train_sent)

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of items whose first element is the word string
            (the tag, if present, is ignored here).
        i: Index of the token to describe.

    Returns:
        A dict with a constant bias, the lower-cased word, its 2/3-character
        prefixes and suffixes and its case/digit flags, plus the lower-cased
        form and case flags of the previous and next words. ``'BOS'`` is set
        instead of the previous-word features when there is no previous word,
        and ``'EOS'`` instead of the next-word features when there is no next
        word.
    """
    token = sent[i][0]

    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word[:2]'] = token[:2]
    feats['word[:3]'] = token[:3]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context: either the previous word or a beginning-of-sentence flag.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()

    # Right context: either the next word or an end-of-sentence flag.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()

    return feats


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    feature_seq = []
    for position in range(len(sent)):
        feature_seq.append(word2features(sent, position))
    return feature_seq

def sent2labels(sent):
    """Return the tags of ``sent``, a sequence of (token, tag) pairs, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Convert every sentence into a sequence of feature dicts (X) and the matching
# sequence of tags (y), for the training and the test split alike.
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))

X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Configure the CRF tagger: L-BFGS training capped at 100 iterations, with
# c1 / c2 as the L1 / L2 regularisation coefficients. all_possible_transitions
# also generates transition features for label pairs never seen in training.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', max_iterations=100,
    c1=0.1, c2=0.1,
    all_possible_transitions=True,
)
# Fit on the training sequences; as the cell's last expression the fitted
# estimator is also displayed.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [None]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned ``from -> to weight`` line per transition.

    Args:
        trans_features: Iterable of ``((label_from, label_to), weight)`` items.
    """
    for transition, weight in trans_features:
        label_from, label_to = transition
        row = "%-6s -> %-7s %0.6f" % (label_from, label_to, weight)
        print(row)

# Rank the learned label-to-label transitions of the fitted model `crf` by
# weight, highest first. The ranking is computed once and sliced twice.
ranked_transitions = Counter(crf.transition_features_).most_common()

# "Most common" here means the transitions with the largest weights.
print("Top 10 most common transitions:")
print_transitions(ranked_transitions[:10])

# ...and "least common" the ones with the smallest (most negative) weights.
print("\nTop 10 least common transitions:")
print_transitions(ranked_transitions[-10:])

Top 10 most common transitions:
VERB   -> AUX     4.033018
PROPN  -> PROPN   2.708065
AUX    -> AUX     2.551731
PROPN  -> ADP     2.398286
ADJ    -> NOUN    2.250517
AUX    -> SCONJ   1.934415
NUM    -> NOUN    1.864068
PROPN  -> PUNCT   1.859276
PRON   -> ADP     1.857447
NOUN   -> ADP     1.846757

Top 10 least common transitions:
DET    -> CCONJ   -1.126597
PROPN  -> AUX     -1.135531
ADP    -> CCONJ   -1.178577
NUM    -> PRON    -1.201640
ADV    -> AUX     -1.202103
PRON   -> PUNCT   -1.287442
CCONJ  -> AUX     -1.587035
DET    -> ADP     -1.862737
ADJ    -> PRON    -2.011258
ADJ    -> ADP     -2.033969


In [None]:
# Tag both splits with the fitted model; the train score is printed next to
# the test score so the gap between them is visible.
y_pred_train = crf.predict(X_train)
y_pred_test = crf.predict(X_test)

# Labels known to the model, reused by the per-label report further down.
labels = [*crf.classes_]

# Token-level accuracy over the flattened sequences.
print(f'Accuracy on Test Data : {metrics.flat_accuracy_score(y_test, y_pred_test)}')
print(f'Accuracy on Train Data: {metrics.flat_accuracy_score(y_train, y_pred_train)}')


Accuracy on Test Data : 0.8519083969465648
Accuracy on Train Data: 0.9967096604369571


In [None]:
# Report rows in alphabetical label order. A plain sort is enough: the previous
# key (name, name[0]) produced the same order and failed on an empty label.
sorted_labels = sorted(labels)

# Per-label precision / recall / F1 for each split. Labels with no gold
# occurrences in a split are reported with support 0.
print(f'Test data:\n {metrics.flat_classification_report(y_test, y_pred_test, labels=sorted_labels, digits=3)}')
print(f'Train data:\n {metrics.flat_classification_report(y_train, y_pred_train, labels=sorted_labels, digits=3)}')

Test data:
               precision    recall  f1-score   support

         ADJ      0.697     0.734     0.715        94
         ADP      0.955     0.970     0.962       303
         ADV      0.667     0.476     0.556        21
         AUX      0.956     0.949     0.953       138
       CCONJ      1.000     1.000     1.000        25
         DET      0.842     0.889     0.865        36
        NOUN      0.772     0.898     0.830       324
         NUM      0.957     0.880     0.917        25
        PART      1.000     0.939     0.969        33
        PRON      0.915     0.831     0.871        65
       PROPN      0.690     0.479     0.566       144
       PUNCT      0.000     0.000     0.000         0
       SCONJ      0.750     1.000     0.857         3
        VERB      0.904     0.859     0.881        99
           X      0.000     0.000     0.000         0

   micro avg      0.852     0.852     0.852      1310
   macro avg      0.740     0.727     0.729      1310
weighted avg  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
