In [1]:
import sklearn
import csv
import sys
import pandas as pd
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats
from sklearn.metrics import make_scorer

In [2]:
### THE CODE FOR THIS ALGORITHM WAS TAKEN FROM HERE: https://github.com/cltl/ba-text-mining/blob/master/lab_sessions/lab4/Lab4a.4-NERC-CRF-Dutch.ipynb
### AS WELL AS THE SKLEARN DOCUMENTATION, AND SUBSEQUENTLY ADJUSTED FOR OUR SPECIFIC DATA!

# Copyright: Vrije Universiteit Amsterdam, Faculty of Humanities, CLTL

In [3]:
### NOTE: YOU NEED TO HAVE SKLEARN-VERSION < 0.24 IN ORDER TO RUN PART OF THE CODE
### LATER VERSIONS WILL NOT WORK, SEE: https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/60

In [4]:
def extract_sents_from_tsv(inputfile):
    sents = []
    current_sent = []

    with open(inputfile, "r") as infile:
        reader = csv.reader(infile)
        next(reader)
        for line in infile:
            # using tsv files here as the csv files get split incorrectly 
            row = line.strip("\n").split('\t')
            if row[4] == ".":
                current_sent.append(tuple(row))
                sents.append(current_sent)
                current_sent = []
            else:
                current_sent.append(tuple(row))
    return sents

def token2features(sentence, i):
    story = sentence[i][1]
    sent_index = sentence[i][2]
    token_index = sentence[i][3]
    prev_prev_token= sentence[i][4]
    prev_token = sentence[i][5]
    token = sentence[i][6]
    next_token = sentence[i][7]
    next_next_token = sentence[i][8]
    pos = sentence[i][9]
    chunk = sentence[i][10]
    lemma= sentence[i][11]
    matchesNeg = sentence[i][12]
    hasPrefix = sentence[i][13]
    hasSuffix = sentence[i][14]
    hasPrefixAntonym = sentence[i][15]
    hasSuffixAntonym = sentence[i][16]
    matchesMulticue = sentence[i][17]

    features = {
        'bias': 1.0,
        'story':story,
        'sent_index':sent_index,
        'token_index':token_index,
        'token-2':prev_prev_token,
        'token-1':prev_token,
        'token': token,
        'token+1':next_token,
        'token+2': next_next_token,
        'pos': pos,
        'chunk':chunk,
        'lemma':lemma,
        'matchesNeg':matchesNeg,
        'hasPrefix':hasPrefix,
        'hasSuffix':hasSuffix,
        'hasPrefixAntonym':hasPrefixAntonym,
        'hasSuffixAntonym':hasSuffixAntonym,
        'matchesMulticue':matchesMulticue
    }
        
    return features

def sent2features(sent):
    return [token2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    # gold labels at index 18
    return [word[18] for word in sent]

def sent2tokens(sent):
    # tokens at index 6
    return [word[6] for word in sent]
    

In [5]:
# feature files obtained by running pipeline main_B.py and converting df to a tsv
train_sents = extract_sents_from_tsv("training_features_B.tsv")
test_sents = extract_sents_from_tsv("dev_features_B.tsv")

In [6]:
X_train = [sent2features(s) for s in train_sents]
Y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
Y_test = [sent2labels(s) for s in test_sents]

In [7]:
### Taken from the sklearn documentation: https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#features

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
params_space = {'c1': scipy.stats.expon(scale=0.5), 'c2': scipy.stats.expon(scale=0.05)}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space, cv=3, verbose=1, n_jobs=-1, n_iter=50, scoring=f1_scorer)
rs.fit(X_train, Y_train)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.3min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C3C2AE0B50>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002C3B1285460>},
                   scoring=make_scorer(flat_f1_score, average=weighted),
                   verbose=1)

In [8]:
print("Best parameters for CRF:", rs.best_params_)
crf = rs.best_estimator_
labels = list(crf.classes_)
Y_pred = crf.predict(X_test)

Best parameters for CRF: {'c1': 0.10816419273720317, 'c2': 0.0052509679623781455}


In [9]:
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

In [10]:
print(metrics.flat_classification_report(Y_test, Y_pred, labels=sorted_labels, digits=4))
# report = pd.DataFrame(classification_report(y_true=Y_test, y_pred=Y_pred, output_dict=True)).transpose()

              precision    recall  f1-score   support

           O     0.9982    0.9990    0.9986     13375
       B-NEG     0.9217    0.8693    0.8947       176
       I-NEG     1.0000    0.6667    0.8000         3

    accuracy                         0.9973     13554
   macro avg     0.9733    0.8450    0.8978     13554
weighted avg     0.9972    0.9973    0.9972     13554





In [11]:
# creating dataframe to check which labels don't overlap

trial_df = pd.DataFrame()
pred_labels = []
for small_list in Y_pred:
    for label in small_list:
        pred_labels.append(label)
        
trial_df['predicted_label'] = pred_labels

gold_labels = []
for small_list in Y_test:
    for label in small_list:
        gold_labels.append(label)
        
trial_df['gold_label'] = gold_labels

test_tokens = []
for small_list in test_sents:
    for token_tuple in small_list:
        test_tokens.append(token_tuple[6])
trial_df['token'] = test_tokens

In [12]:
trial_df[(trial_df['gold_label'] == "B-NEG") & (trial_df['predicted_label'] != "B-NEG")]

Unnamed: 0,predicted_label,gold_label,token
800,O,B-NEG,unbrushed
805,O,B-NEG,unshaven
946,O,B-NEG,unbrushed
948,O,B-NEG,unkempt
3056,O,B-NEG,unburned
3297,O,B-NEG,undoubtedly
3625,O,B-NEG,uncommonly
4586,O,B-NEG,unnatural
4949,O,B-NEG,irreproachable
5035,O,B-NEG,insensibly


In [13]:
trial_df[(trial_df['gold_label'] == "I-NEG") & (trial_df['predicted_label'] != "I-NEG")]

Unnamed: 0,predicted_label,gold_label,token
13008,O,I-NEG,more
