In [16]:
import glob
from itertools import chain
import sklearn_crfsuite
from sklearn_crfsuite import scorers
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import os
from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import metrics

In [5]:
# deleted the first line from both train and test corpus
f=open('hi-ud-train.conllu')
f_test=open('hi-ud-test.conllu')
train=f.read()
test=f_test.read()

In [7]:
c=[]
store = []
for line in train.split('\n'):
    words=line.split(',') 

    if words[0]=="":
        c.append(store)
        store=[]

    elif words[1]=="":
        store.append((',',words[2]))
    else:
        store.append((words[1],words[2]))
    

In [8]:
c_test=[]
store = []
for line in test.split('\n'):
    words=line.split('\t') 
    
    if words[0]=="":
        c_test.append(store)
        store=[]

    elif words[1]=="":
        store.append((',',words[2]))
    else:
        store.append((words[1],words[2]))


In [9]:
train_sent = c
test_sent = c_test

In [11]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
## These are the features related to current word i
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:                                   ##These are the features related to the last word
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:                      ##These are the features related to the next word
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        ## +1 features are for next word
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]  # makes features for sent

def sent2label(sent):
    return [postag for token, postag in sent] ## labels of words in sent


In [14]:
%%time
X_train=[sent2features(s) for s in train_sent]
Y_train=[sent2label(s) for s in train_sent]
X_test=[sent2features(s) for s in test_sent]
Y_test=[sent2label(s) for s in test_sent]

CPU times: user 31.7 ms, sys: 8.04 ms, total: 39.8 ms
Wall time: 40.3 ms


In [17]:
%%time
## c1 and c2 are the L1 and L2 regularization coefficients
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c2=0.1,
    c1=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, Y_train)

CPU times: user 732 ms, sys: 7.49 ms, total: 740 ms
Wall time: 739 ms


CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [20]:
tags=list(crf.classes_)

In [21]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(Y_test, y_pred,average='weighted', labels=tags)
print(metrics.flat_classification_report(
    Y_test, y_pred, labels=sorted(tags), digits=3))


              precision    recall  f1-score   support

         ADJ      1.000     1.000     1.000        94
         ADP      1.000     1.000     1.000       309
         ADV      1.000     1.000     1.000        21
         AUX      1.000     1.000     1.000       139
       CCONJ      1.000     1.000     1.000        25
       COMMA      0.000     0.000     0.000         0
         DET      1.000     1.000     1.000        36
        NOUN      1.000     1.000     1.000       329
         NUM      1.000     1.000     1.000        25
        PART      1.000     1.000     1.000        33
        PRON      1.000     1.000     1.000        65
       PROPN      1.000     1.000     1.000       145
       PUNCT      1.000     1.000     1.000       135
       SCONJ      1.000     1.000     1.000         3
        VERB      1.000     1.000     1.000        99
           X      0.000     0.000     0.000         0

   micro avg      1.000     1.000     1.000      1458
   macro avg      0.875   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [25]:
from collections import Counter

def print_transitions(features):
    for (frm, to), w in features:
        print("%-6s -> %-7s %0.5f" % (frm, to, w))

print("Top 10 likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop 10 unlikely transitions:")
length=len(Counter(crf.transition_features_).most_common())
print_transitions(Counter(crf.transition_features_).most_common()[length-1:length-11:-1])

Top 10 likely transitions:
VERB   -> AUX     0.99159
AUX    -> PUNCT   0.75910
NOUN   -> ADP     0.69290
AUX    -> AUX     0.67332
PROPN  -> ADP     0.64213
PROPN  -> PROPN   0.58835
ADJ    -> NOUN    0.48945
DET    -> NOUN    0.46631
AUX    -> SCONJ   0.38395
NUM    -> NOUN    0.36026

Top 10 unlikely transitions:
ADP    -> AUX     -0.22032
NOUN   -> ADJ     -0.21160
AUX    -> NOUN    -0.19853
ADP    -> CCONJ   -0.19264
PROPN  -> PRON    -0.17453
PROPN  -> ADJ     -0.16758
ADJ    -> ADP     -0.15779
ADJ    -> PRON    -0.14183
VERB   -> NOUN    -0.14139
ADP    -> COMMA   -0.10075


In [26]:
%%time

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=tags)

# searching the params_space
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, Y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.4min finished


CPU times: user 1min 26s, sys: 268 ms, total: 1min 26s
Wall time: 1min 27s


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None,...
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa519e650b8>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa519dd6e48>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_sco

In [28]:
## Best parameters found
print('best parameters:', rs.best_params_)
print('best Cross Validation score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))


best parameters: {'c1': 0.6101995873514603, 'c2': 0.06964630842266561}
best Cross Validation score: 0.9995914957414596
model size: 0.01M
