In [1]:
# Named Entity Recognition with Conditional Random Fields in Python

In [2]:
import pandas as pd
import numpy as np

# Load the token-level annotation table (one row per token).
# NOTE(review): the file is named .tsv but is read with read_csv's default
# separator; the columns shown by data.tail() look correctly parsed, so the file
# is presumably comma-delimited — confirm.
data=pd.read_csv("crftag100.tsv",encoding="latin1")


In [3]:
# Forward-fill missing cells: each empty cell takes the last non-missing value above it
# in the same column (presumably the sentence id is only written on the first token of
# each sentence — confirm against the raw file).
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is deprecated
# since pandas 2.1 and produces the same result.
data = data.ffill()

In [4]:
# Show the last 10 rows to sanity-check the parsed columns after the forward fill.
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
9346,Sentence: 100,or,CC,o
9347,Sentence: 100,anything,NN,o
9348,Sentence: 100,.,.,o
9349,Sentence: 100,Use,NNP,o
9350,Sentence: 100,code,NN,o
9351,Sentence: 100,reviews,NNS,o
9352,Sentence: 100,for,IN,o
9353,Sentence: 100,that,DT,o
9354,Sentence: 100,purpose,NN,o
9355,Sentence: 100,.,.,o


In [5]:
# Distinct values of the "Word" column. Built from a set, so the order of the list
# is arbitrary and may differ between runs.
words = list(set(data["Word"].values))

In [6]:
# Vocabulary size; the trailing expression displays it as the cell output.
n_words = len(words); n_words

2165

In [7]:
class SentenceGetter(object):
    """Group a token-level table into sentences of ``(word, POS, tag)`` triples.

    ``data`` must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"`` and
    ``"Tag"``, with one row per token.

    Attributes:
        n_sent: number of the sentence the next ``get_next`` call will look up
            (starts at 1).
        data: the table passed to the constructor.
        empty: always ``False``; not read anywhere in this class.
        grouped: pandas Series indexed by the ``"Sentence #"`` value; each entry
            is the list of ``(word, POS, tag)`` triples of that sentence.
        sentences: the entries of ``grouped`` as a plain list, in groupby order
            (keys are sorted as strings, so "Sentence: 10" precedes
            "Sentence: 2").
    """

    def __init__(self, data):
        """Build the per-sentence grouping from the token table ``data``."""
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, POS, tag) triple per token row of the group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        # Select the three columns the aggregation reads, so the result does not
        # depend on whether pandas passes the grouping column to apply()
        # (including it is deprecated in recent pandas releases).
        token_groups = self.data.groupby("Sentence #")[["Word", "POS", "Tag"]]
        self.grouped = token_groups.apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` when there is no such sentence.

        Sentences are looked up by the key ``"Sentence: <n_sent>"``; the counter
        only advances when the lookup succeeds.

        Returns:
            The list of ``(word, POS, tag)`` triples, or ``None`` if no sentence
            is stored under the current key.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error is a
            # real problem and must not be hidden.
            return None
        self.n_sent += 1
        return s

In [8]:
# Group the token rows into per-sentence lists of (word, POS, tag) triples.
getter = SentenceGetter(data)


In [9]:
# On a fresh getter this returns the sentence stored under "Sentence: 1".
# get_next advances an internal counter, so re-running this cell returns the
# following sentence instead.
sent = getter.get_next()

In [10]:
# Print the raw list of (word, POS, tag) triples to inspect the data format.
print(sent)

[('I', 'PRP', 'o'), ('want', 'VBP', 'o'), ('to', 'TO', 'o'), ('be', 'VB', 'o'), ('able', 'JJ', 'o'), ('to', 'TO', 'o'), ('display', 'VB', 'o'), ('a', 'DT', 'o'), ('normal', 'JJ', 'o'), ('YouTube', 'NNP', 'o'), ('video', 'NN', 'o'), ('with', 'IN', 'o'), ('overlaid', 'JJ', 'o'), ('annotations', 'NNS', 'o'), (',', ',', 'o'), ('consisting', 'VBG', 'o'), ('of', 'IN', 'o'), ('coloured', 'JJ', 'o'), ('rectangles', 'NNS', 'o'), ('for', 'IN', 'o'), ('each', 'DT', 'o'), ('frame', 'NN', 'o'), ('.', '.', 'o'), ('The', 'DT', 'o'), ('only', 'JJ', 'o'), ('requirement', 'NN', 'o'), ('is', 'VBZ', 'o'), ('that', 'IN', 'o'), ('this', 'DT', 'o'), ('should', 'MD', 'o'), ('be', 'VB', 'o'), ('done', 'VBN', 'o'), ('programmatically', 'RB', 'o'), ('.', '.', 'o'), ('YouTube', 'NNP', 'o'), ('has', 'VBZ', 'o'), ('annotations', 'NNS', 'o'), ('now', 'RB', 'o'), (',', ',', 'o'), ('but', 'CC', 'o'), ('require', 'VBP', 'o'), ('you', 'PRP', 'o'), ('to', 'TO', 'o'), ('use', 'VB', 'o'), ('their', 'PRP$', 'o'), ('front', 

In [11]:
# All sentences, each a list of (word, POS, tag) triples.
sentences = getter.sentences

In [12]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of token records whose first two items are the word and
            its POS tag (any further items are ignored here).
        i: index of the token to describe.

    Returns:
        dict with the token's own features (lower-cased form, suffixes, casing
        and digit flags, POS tag and its 2-character prefix), the same kind of
        features for the previous ('-1:') and next ('+1:') token when they
        exist, and 'BOS' / 'EOS' markers when they do not.
    """
    def _neighbour_features(offset, prefix):
        # Features of the token `offset` positions away, keyed with `prefix`.
        near_word = sent[i + offset][0]
        near_tag = sent[i + offset][1]
        return {
            prefix + ':word.lower()': near_word.lower(),
            prefix + ':word.istitle()': near_word.istitle(),
            prefix + ':word.isupper()': near_word.isupper(),
            prefix + ':postag': near_tag,
            prefix + ':postag[:2]': near_tag[:2],
        }

    token = sent[i][0]
    tag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(_neighbour_features(-1, '-1'))

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        features.update(_neighbour_features(1, '+1'))

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label (third item) of every ``(token, postag, label)`` triple."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token (first item) of every ``(token, postag, label)`` triple."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens

In [13]:
# Feature sequences: one list of per-token feature dicts for each sentence.
X = list(map(sent2features, sentences))


In [14]:
# Label sequences, aligned token-for-token with X.
y = list(map(sent2labels, sentences))

In [15]:
from sklearn_crfsuite import CRF

# CRF model trained with L-BFGS; the estimator is only configured here, not fitted.
crf = CRF(algorithm='lbfgs',
          c1=0.1,  # L1 regularisation coefficient
          c2=0.1,  # L2 regularisation coefficient
          max_iterations=100,  # upper bound on optimiser iterations
          all_possible_transitions=False)  # no transition features for label pairs unseen in training

In [16]:
# cross_val_predict lives in sklearn.model_selection since scikit-learn 0.18; the
# sklearn.cross_validation module was removed in 0.20, so importing from it fails
# on any current installation.
try:
    from sklearn.model_selection import cross_val_predict
except ImportError:  # very old scikit-learn that only ships the legacy module
    from sklearn.cross_validation import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report



In [17]:
# Out-of-fold predictions from 5-fold cross-validation: every sentence is labelled
# by a model that was fitted without it. cross_val_predict fits clones of the
# estimator, so `crf` itself is still unfitted after this call.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [18]:
# Per-label precision / recall / F1 computed over the flattened token sequences.
# NOTE(review): the warning fragment in this cell's output looks like sklearn's
# UndefinedMetricWarning (labels that were never predicted get precision 0.0) — confirm.
report = flat_classification_report(y_pred=pred, y_true=y)


  'precision', 'predicted', average, warn_for)


In [19]:
# `report` is a preformatted multi-line string, so print it instead of echoing its repr.
print(report)

                          precision    recall  f1-score   support

  Hardware-Architectures       0.00      0.00      0.00         1
       Non-OOP-Functions       0.92      0.57      0.71        21
      OOP-Public methods       0.00      0.00      0.00         1
        OS&SystemKernels       1.00      0.15      0.27        13
         Object-oriented       1.00      0.08      0.15        12
           Others-Events       0.00      0.00      0.00         3
              Procedural       1.00      0.73      0.85        15
Software design patterns       0.00      0.00      0.00         3
    SoftwareApplications       1.00      0.17      0.29         6
       SoftwareOperation       0.89      0.40      0.55        43
            SoftwareRole       1.00      0.06      0.12        16
           SoftwareTools       0.00      0.00      0.00         3
       StandardProtocols       1.00      0.85      0.92        13
         Web development       1.00      0.57      0.73        21
         

In [20]:
# Fit the final model on the complete data set (no held-out split).
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)