In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle

In [2]:
loc1='features1-train.txt'
loc2='features1-test.txt'

In [3]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    tag1=sent[i][2]
    tag2=sent[i][4]
    tag3 = sent[i][5]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        'wordinitialcap': word[0].isupper(),
        'wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
        'wordallcap': len([x for x in word if x.isupper()])==len(word),
        'distfromsentbegin': i
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        tag11=sent[i-1][2]
        tag22=sent[i-1][4]
        tag33 = sent[i-1][5]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:tag1': tag1,
            '-1:tag1[:2]': tag1[:2],
            '-1:tag2': tag2,
            '-1:tag2[:2]': tag2[:2],
            '-1:tag3': tag3,
            '-1:tag3[:2]': tag3[:2],
            '-1:wordlength': len(word),
            '-1:wordinitialcap': word[0].isupper(),
            '-1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '-1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        tag11=sent[i+1][2]
        tag22=sent[i+1][4]
        tag33 = sent[i+1][5]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:tag1': tag1,
            '+1:tag1[:2]': tag1[:2],
            '+1:tag2': tag2,
            '+1:tag2[:2]': tag2[:2],
            '+1:tag3': tag3,
            '+1:tag3[:2]': tag3[:2],
            '+1:wordlength': len(word),
            '+1:wordinitialcap': word[0].isupper(),
            '+1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '+1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, tag1, label, tag2, tag3 in sent]

def sent2tokens(sent):
    return [token for token, postag, tag1, label, tag2, tag3, tag4, tag5 in sent]

def convertCONLLFormJustExtractionSemEval1(loc):
    dT=open(loc, encoding='utf-8').read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)]
    sTs = []
    for s in sT1s:
        ts= [(x.split("\t")[0],x.split("\t")[1], x.split("\t")[2], x.split("\t")[3], x.split("\t")[4],x.split("\t")[5], x.split("\t")[6], x.split("\t")[7]) for x in s]
        ts1= [(tss[0],tss[1],tss[2],tss[3], tss[6], tss[7]) for tss in ts]
        sTs.append(ts1)
    return sTs

In [4]:
train_sents = convertCONLLFormJustExtractionSemEval1(loc1)
test_sents = convertCONLLFormJustExtractionSemEval1(loc2)

In [5]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [6]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 4.75 s


CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [7]:
y_pred = crf.predict(X_test)

In [8]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-KP', 'I-KP']

In [9]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.5758797831170186

In [10]:
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print((metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3)))

              precision    recall  f1-score   support

        B-KP      0.529     0.546     0.537       390
        I-KP      0.673     0.541     0.600       630

   micro avg      0.609     0.543     0.574      1020
   macro avg      0.601     0.544     0.569      1020
weighted avg      0.618     0.543     0.576      1020



In [11]:
pickle.dump(crf,open("linear-chain-crf-enhanced.model-new.pickle","wb"), protocol = 0, fix_imports = True)

In [19]:
vectors=get_features('word')

In [32]:
vectors=np.expand_dims(vectors, axis=1)

In [33]:
vectors.shape

(300, 1)

In [12]:
from gensim.models import KeyedVectors
import numpy as np
# Load vectors directly from the file
model1 = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)



In [34]:
def get_features(word):
    word=word.lower()
    try:
        vectors=model1[word]
    except:
        vectors=0
    #vectors=np.array(vectors)
    #vectors=vectors[0]
    vectors=np.expand_dims(vectors, axis=1)
    return vectors

In [35]:
def word2features(sent, i):
    word = sent[i][0]
    wordembdding=get_features(word)
    #wordembdding=np.array(wordembdding)
    #wordembdding= 
    #wordembdding=wordembdding[0]
    postag = sent[i][1]
    tag1=sent[i][2]
    tag2=sent[i][4]
    tag3 = sent[i][5]
    

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'wordembdding': wordembdding,
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        'wordinitialcap': word[0].isupper(),
        'wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
        'wordallcap': len([x for x in word if x.isupper()])==len(word),
        'distfromsentbegin': i
    }
    if i > 0:
        word1 = sent[i-1][0]
        wordembdding1= get_features(word1)
        #wordembdding1=np.array(wordembdding1)
        #wordembdding1=f2(wordembdding1)
        postag1 = sent[i-1][1]
        tag11=sent[i-1][2]
        tag22=sent[i-1][4]
        tag33 = sent[i-1][5]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:wordembdding': wordembdding1,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:tag1': tag1,
            '-1:tag1[:2]': tag1[:2],
            '-1:tag2': tag2,
            '-1:tag2[:2]': tag2[:2],
            '-1:tag3': tag3,
            '-1:tag3[:2]': tag3[:2],
            '-1:wordlength': len(word),
            '-1:wordinitialcap': word[0].isupper(),
            '-1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '-1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        wordembdding1= get_features(word1)
        #wordembdding1= get_features(word1)
        #wordembdding1=np.array(wordembdding1)
        #wordembdding1=f2(wordembdding)
        postag1 = sent[i+1][1]
        tag11=sent[i+1][2]
        tag22=sent[i+1][4]
        tag33 = sent[i+1][5]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:wordembdding': wordembdding1,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:tag1': tag1,
            '+1:tag1[:2]': tag1[:2],
            '+1:tag2': tag2,
            '+1:tag2[:2]': tag2[:2],
            '+1:tag3': tag3,
            '+1:tag3[:2]': tag3[:2],
            '+1:wordlength': len(word),
            '+1:wordinitialcap': word[0].isupper(),
            '+1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '+1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, tag1, label, tag2, tag3 in sent]

def sent2tokens(sent):
    return [token for token, postag, tag1, label, tag2, tag3, tag4, tag5 in sent]

def convertCONLLFormJustExtractionSemEval1(loc):
    dT=open(loc, encoding='utf-8').read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)]
    sTs = []
    for s in sT1s:
        ts= [(x.split("\t")[0],x.split("\t")[1], x.split("\t")[2], x.split("\t")[3], x.split("\t")[4],x.split("\t")[5], x.split("\t")[6], x.split("\t")[7]) for x in s]
        ts1= [(tss[0],tss[1],tss[2],tss[3], tss[6], tss[7]) for tss in ts]
        sTs.append(ts1)
    return sTs

In [None]:
def word2features(sent, i):
    word = sent[i][0]
    wordembdding=get_features(word)   ## word embedding vector 
    postag = sent[i][1]
    tag1=sent[i][2]
    tag2=sent[i][4]
    tag3 = sent[i][5]


    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        'wordinitialcap': word[0].isupper(),
        'wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
        'wordallcap': len([x for x in word if x.isupper()])==len(word),
        'distfromsentbegin': i
    }

    # here you add 300 features (one for each vector component)
    for iv,value in enumerate(wordembdding):
        features['v{}'.format(iv)]=value

In [36]:
train_sents = convertCONLLFormJustExtractionSemEval1(loc1)
test_sents = convertCONLLFormJustExtractionSemEval1(loc2)

In [37]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

  if __name__ == '__main__':


In [39]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

TypeError: only size-1 arrays can be converted to Python scalars

In [26]:
a=[1]

In [3]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    tag1=sent[i][2]
    tag2=sent[i][4]
    tag3 = sent[i][5]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag1': tag1,
        'tag1[:2]': tag1[:2],
        'tag2': tag2,
        'tag2[:2]': tag2[:2],
        'tag3': tag3,
        'tag3[:2]': tag3[:2],
        'wordlength': len(word),
        'wordinitialcap': word[0].isupper(),
        'wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
        'wordallcap': len([x for x in word if x.isupper()])==len(word),
        'distfromsentbegin': i
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        tag11=sent[i-1][2]
        tag22=sent[i-1][4]
        tag33 = sent[i-1][5]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:tag1': tag1,
            '-1:tag1[:2]': tag1[:2],
            '-1:tag2': tag2,
            '-1:tag2[:2]': tag2[:2],
            '-1:tag3': tag3,
            '-1:tag3[:2]': tag3[:2],
            '-1:wordlength': len(word),
            '-1:wordinitialcap': word[0].isupper(),
            '-1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '-1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        tag11=sent[i+1][2]
        tag22=sent[i+1][4]
        tag33 = sent[i+1][5]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:tag1': tag1,
            '+1:tag1[:2]': tag1[:2],
            '+1:tag2': tag2,
            '+1:tag2[:2]': tag2[:2],
            '+1:tag3': tag3,
            '+1:tag3[:2]': tag3[:2],
            '+1:wordlength': len(word),
            '+1:wordinitialcap': word[0].isupper(),
            '+1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '+1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['EOS'] = True
    
    if i > 2:
        word1 = sent[i-2][0]
        postag1 = sent[i-2][1]
        tag11=sent[i-2][2]
        tag22=sent[i-2][4]
        tag33 = sent[i-1][5]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:tag1': tag1,
            '-1:tag1[:2]': tag1[:2],
            '-1:tag2': tag2,
            '-1:tag2[:2]': tag2[:2],
            '-1:tag3': tag3,
            '-1:tag3[:2]': tag3[:2],
            '-1:wordlength': len(word),
            '-1:wordinitialcap': word[0].isupper(),
            '-1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '-1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-2:
        word1 = sent[i+2][0]
        postag1 = sent[i+2][1]
        tag11=sent[i+2][2]
        tag22=sent[i+2][4]
        tag33 = sent[i+2][5]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:tag1': tag1,
            '+1:tag1[:2]': tag1[:2],
            '+1:tag2': tag2,
            '+1:tag2[:2]': tag2[:2],
            '+1:tag3': tag3,
            '+1:tag3[:2]': tag3[:2],
            '+1:wordlength': len(word),
            '+1:wordinitialcap': word[0].isupper(),
            '+1:wordmixedcap': len([x for x in word[1:] if x.isupper()])>0,
            '+1:wordallcap': len([x for x in word if x.isupper()])==len(word),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, tag1, label, tag2, tag3 in sent]

def sent2tokens(sent):
    return [token for token, postag, tag1, label, tag2, tag3, tag4, tag5 in sent]

def convertCONLLFormJustExtractionSemEval1(loc):
    dT=open(loc, encoding='utf-8').read().split("\n")[:-2]
    sI = [-1] + [i for i, x in enumerate(dT) if not x.strip()] + [len(dT)]
    sT1s = [dT[sI[i]+1:sI[i+1]] for i in range(len(sI)-1)]
    sTs = []
    for s in sT1s:
        ts= [(x.split("\t")[0],x.split("\t")[1], x.split("\t")[2], x.split("\t")[3], x.split("\t")[4],x.split("\t")[5], x.split("\t")[6], x.split("\t")[7]) for x in s]
        ts1= [(tss[0],tss[1],tss[2],tss[3], tss[6], tss[7]) for tss in ts]
        sTs.append(ts1)
    return sTs

In [4]:
train_sents = convertCONLLFormJustExtractionSemEval1(loc1)
test_sents = convertCONLLFormJustExtractionSemEval1(loc2)

X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [5]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 5.48 s


CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [6]:
y_pred = crf.predict(X_test)

In [7]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-KP', 'I-KP']

In [8]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.5645220992658448

In [9]:
sorted_labels = sorted(labels,key=lambda name: (name[1:], name[0]))
print((metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3)))

              precision    recall  f1-score   support

        B-KP      0.521     0.531     0.526       390
        I-KP      0.663     0.529     0.588       630

   micro avg      0.601     0.529     0.563      1020
   macro avg      0.592     0.530     0.557      1020
weighted avg      0.609     0.529     0.565      1020

