### NER with Conditional Random Fields

Tutorial from: https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system

In [2]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [7]:
from itertools import chain

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [51]:
def word2features(sent, i):
    word = sent[i][1]
    postag = sent[i][3]

    # Create a dictionary with the features of the word
    # This can be seen as the embedding of the word with all its particular dimensions
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }


    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [tag for sentence, word, tag, pos in sent]

def sent2tokens(sent):
    return [word for sentence, word, tag, pos in sent]

In [52]:
def add_pos_tags(df: pd.DataFrame) -> pd.DataFrame:
    df['pos'] = df['word'].apply(lambda x: nltk.pos_tag([x])[0][1])
    return df

### Data

In [53]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/trivia10k13_pos.csv', encoding = "ISO-8859-1")
df = df.fillna(method='ffill')

# Rename columns to sentence, word, tag, pos
df.columns = ['sentence', 'word', 'tag', 'pos']

df.head()

Unnamed: 0,sentence,word,tag,pos
0,Sentence: 0,steve,B-Actor,NN
1,Sentence: 0,mcqueen,I-Actor,NN
2,Sentence: 0,provided,O,VBN
3,Sentence: 0,a,O,DT
4,Sentence: 0,thrilling,B-Plot,VBG


In [54]:
# Test sentence2features
sent2features(df.iloc[0:10].values)

[{'bias': 1.0,
  'word.lower()': 'steve',
  'word[-3:]': 'eve',
  'word[-2:]': 've',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'NN',
  'postag[:2]': 'NN',
  'BOS': True,
  '+1:word.lower()': 'sentence: 0',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:postag': 'mcqueen',
  '+1:postag[:2]': 'mc'},
 {'bias': 1.0,
  'word.lower()': 'mcqueen',
  'word[-3:]': 'een',
  'word[-2:]': 'en',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'NN',
  'postag[:2]': 'NN',
  '-1:word.lower()': 'sentence: 0',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:postag': 'steve',
  '-1:postag[:2]': 'st',
  '+1:word.lower()': 'sentence: 0',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:postag': 'provided',
  '+1:postag[:2]': 'pr'},
 {'bias': 1.0,
  'word.lower()': 'provided',
  'word[-3:]': 'ded',
  'word[-2:]': 'ed',
  'word.isupper()': False,
  'word.istitle()

In [56]:
# Group by sentence, export values and apply sent2features
X = [sent2features(s) for s in df.groupby('sentence').apply(lambda x: x.values).values]
Y = [sent2labels(s) for s in df.groupby('sentence').apply(lambda x: x.values).values]

In [57]:
split_proportion = 0.8
split_index = int(len(X) * split_proportion)

X_train = X[:split_index]
X_test = X[split_index:]

Y_train = Y[:split_index]
Y_test = Y[split_index:]


In [64]:
sklearn_crfsuite.CRF._get_param_names()

['algorithm',
 'all_possible_states',
 'all_possible_transitions',
 'averaging',
 'c',
 'c1',
 'c2',
 'calibration_candidates',
 'calibration_eta',
 'calibration_max_trials',
 'calibration_rate',
 'calibration_samples',
 'delta',
 'epsilon',
 'error_sensitive',
 'gamma',
 'keep_tempfiles',
 'linesearch',
 'max_iterations',
 'max_linesearch',
 'min_freq',
 'model_filename',
 'num_memories',
 'pa_type',
 'period',
 'trainer_cls',
 'variance',
 'verbose']

In [65]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass

CPU times: user 32.3 s, sys: 0 ns, total: 32.3 s
Wall time: 32.3 s


### Evaluation and production ready pipeline

In [66]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-Actor',
 'I-Actor',
 'B-Plot',
 'I-Plot',
 'B-Opinion',
 'I-Opinion',
 'B-Award',
 'I-Award',
 'B-Year',
 'B-Director',
 'B-Genre',
 'I-Genre',
 'B-Origin',
 'I-Origin',
 'I-Director',
 'B-Soundtrack',
 'I-Soundtrack',
 'B-Character_Name',
 'I-Character_Name',
 'B-Quote',
 'I-Quote',
 'B-Relationship',
 'I-Relationship',
 'I-Year']

In [67]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(Y_test, y_pred,
                      average='weighted', 
                      labels=labels)

0.8289777943559292

### Conclusion

Basically the score with the CRF overpass all the existing algorithms for classification from sklearn.

In [74]:
# Test for single sentence where don't know the labels or the pos tags
sample_sentence = 'Movies with Bruce Willis'
sample_sentence = nltk.pos_tag(nltk.word_tokenize(sample_sentence))
sample_sentence = list(map(lambda x: ('', x[0], '', x[1]), sample_sentence))
sample_sentence

sample_sentence_features = sent2features(sample_sentence)
sample_sentence_features

[{'bias': 1.0,
  'word.lower()': 'movies',
  'word[-3:]': 'ies',
  'word[-2:]': 'es',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'NNS',
  'postag[:2]': 'NN',
  'BOS': True,
  '+1:word.lower()': '',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'with',
  '+1:postag[:2]': 'wi'},
 {'bias': 1.0,
  'word.lower()': 'with',
  'word[-3:]': 'ith',
  'word[-2:]': 'th',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'IN',
  'postag[:2]': 'IN',
  '-1:word.lower()': '',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '-1:postag': 'Movies',
  '-1:postag[:2]': 'Mo',
  '+1:word.lower()': '',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'Bruce',
  '+1:postag[:2]': 'Br'},
 {'bias': 1.0,
  'word.lower()': 'bruce',
  'word[-3:]': 'uce',
  'word[-2:]': 'ce',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'p

In [75]:
predicted_labels = crf.predict_single(sample_sentence_features)

In [76]:
predicted_labels

['O', 'O', 'B-Actor', 'I-Actor']