In [None]:
from itertools import chain
import pandas as pd
import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import nltk
nltk.download('averaged_perceptron_tagger')

## Read and preprocess the data


In [None]:
# Read the data
def read_data(filename):
    """Load a token/tag file from ``ner/wnut16`` and attach POS tags.

    Args:
        filename: Name of the split file inside ``ner/wnut16``
            (e.g. ``'train'``).

    Returns:
        A DataFrame with one row per token and the columns ``term``,
        ``entitytags`` and ``pos``; ``pos`` holds the tag string that
        ``nltk.pos_tag`` assigned to the token.
    """
    rows = []
    with open(f'ner/wnut16/{filename}', encoding="utf8") as f:
        # Iterate the handle directly; readlines() would load the whole file.
        for line in f:
            if len(line) < 2:
                continue
            fields = line.rstrip('\n').split()
            if not fields:
                # Whitespace-only line: there is no token to keep.
                continue
            rows.append(fields)
    data = pd.DataFrame(rows, columns=['term', 'entitytags'])
    # Tag all tokens in one call and keep only the tag of each (token, tag)
    # pair. Assigning the finished list replaces the former per-row chained
    # assignment (data["pos"][i] = ...), which triggers SettingWithCopyWarning
    # and does not write through when pandas copy-on-write is active.
    data["pos"] = [tag for _, tag in nltk.pos_tag(data["term"].tolist())]
    return data


In [None]:
# Load the three wnut16 splits in the order train, test, dev.
train, test, dev = (read_data(split) for split in ('train', 'test', 'dev'))
train

In [None]:
# process to get the train, test, dev dataset for crf

def process_data(data):
    """Group token rows into sentences, closing a sentence at each '.' token.

    Args:
        data: DataFrame whose rows are ``(term, entitytags, pos)``.

    Returns:
        A list of sentences, each a list of ``(term, entitytags, pos)``
        tuples. The '.' token is kept as the last element of its sentence.
        Tokens that follow the final '.' are returned as one last sentence
        instead of being dropped.
    """
    dataset = []
    sent = []
    # itertuples yields plain tuples and avoids building a Series per row.
    for term, entitytags, pos in data.itertuples(index=False, name=None):
        sent.append((term, entitytags, pos))
        if term == '.':
            dataset.append(sent)
            sent = []
    if sent:
        # Trailing tokens without a closing '.' still form a sentence.
        dataset.append(sent)
    return dataset


In [None]:
# Turn each token table into a list of sentences (train, test, dev order).
train_sents, test_sents, dev_sents = map(process_data, (train, test, dev))

## The following functions build the features for the CRF model.

In [None]:
def _token_features(token, pos, name):
    """Return the feature dict of one token, with keys labelled by `name`."""
    return {
        f"{name}.isupper()": token.isupper(),
        f"{name}.lower()": token.lower(),
        f"{name}.postags": pos,
        f"{name}.istitle()": token.istitle(),
        f"{name}.isdigit()": token.isdigit(),
        f"{name}[-3:]": token[-3:],
        f"{name}[:2]": token[:2],
        f"len_{name}": len(token),
    }


def word2features(sent, i):
    """Build the CRF feature dict for the token at position `i` of `sent`.

    Args:
        sent: Sentence as a list of ``(term, entitytag, pos)`` tuples.
        i: Index of the token to describe.

    Returns:
        A dict with the features of the token itself (``word...`` keys), of
        the previous token (``word1...`` keys, or ``BOS`` when `i` is the
        first position) and of the next token (``word2...`` keys, or ``EOS``
        when `i` is the last position).
    """
    features = _token_features(sent[i][0], sent[i][2], "word")

    if i > 0:
        features.update(_token_features(sent[i - 1][0], sent[i - 1][2], "word1"))
    else:
        features["BOS"] = True

    if i < len(sent) - 1:
        # The POS tag must come from the NEXT token (i + 1); it used to be
        # read from sent[i - 1], i.e. the previous token.
        features.update(_token_features(sent[i + 1][0], sent[i + 1][2], "word2"))
    else:
        features["EOS"] = True

    return features


def sent2features(sent):
    """Return the list of feature dicts for `sent`, one per token position."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts


def sent2labels(sent):
    """Return the entity tag (second field) of every token triple in `sent`."""
    tags = []
    for _term, entity_tag, _pos in sent:
        tags.append(entity_tag)
    return tags


def sent2tokens(sent):
    """Return the surface form (first field) of every token triple in `sent`."""
    terms = []
    for term, _entity_tag, _pos in sent:
        terms.append(term)
    return terms


In [None]:
# Feature dicts (X) and tag sequences (y) for each split, one entry per sentence.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

X_dev = list(map(sent2features, dev_sents))
y_dev = list(map(sent2labels, dev_sents))


In the following block of code, we wrap the call to `fit` in try/except because, depending on the version of the library, it can raise an `AttributeError`.

In [None]:
# CRF trained with L-BFGS; c1 and c2 are the regularisation coefficients
# passed to sklearn-crfsuite.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
try:
    crf.fit(X_train, y_train)
except AttributeError as err:
    # Depending on the installed library versions fit() can raise an
    # AttributeError. The notebook deliberately carries on, but the error is
    # reported rather than swallowed so a genuine failure stays visible.
    print(f"crf.fit raised AttributeError (ignored): {err}")

This block of code will help visualize the learned features for the CRF model.

In [None]:
# Every class the fitted model knows, minus the 'O' tag.
labels = [*crf.classes_]
labels.remove('O')
labels


In [None]:
# Predict the test sentences, then show the weighted F1 over `labels`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average='weighted',
)


In [None]:
# Token strings of every test sentence.
words = list(map(sent2tokens, test_sents))


In [None]:
# Gold tag sequences of every test sentence.
# NOTE: this rebinds `labels`, which previously held the CRF class names.
labels = list(map(sent2labels, test_sents))


In [None]:
# One "token gold predicted" line per token, with an empty line after each
# sentence; the result is written to the file 'crf_pred'.
predictions = []
for sent_words, sent_gold, sent_pred in zip(words, labels, y_pred):
    predictions.extend(
        ' '.join(triple) for triple in zip(sent_words, sent_gold, sent_pred)
    )
    predictions.append('')
with open('crf_pred', 'w', encoding="utf8") as f:
    f.write('\n'.join(predictions))
          

In [None]:
import os
import subprocess

eval_script = '../released/src/conlleval.pl'
predf = 'crf_pred'
scoref = 'crf_score'
# Feed the prediction file to the perl evaluation script on stdin and capture
# its report in `scoref`. No shell is involved, and check=True raises
# CalledProcessError when the script exits with a non-zero status instead of
# silently leaving an unusable score file behind.
with open(predf, 'rb') as pred_in, open(scoref, 'wb') as score_out:
    subprocess.run(['perl', eval_script], stdin=pred_in, stdout=score_out, check=True)


In [None]:
# Read the evaluation report; the context manager closes the file handle,
# which the previous bare open() call left open.
with open(scoref, 'r', encoding='utf8') as score_file:
    eval_lines = [l.rstrip() for l in score_file]

for line in eval_lines:
    print(line)

## Let's check what the classifier learned:


In [None]:
from collections import Counter

def print_transitions(trans_features):
    """Print one line per ((label_from, label_to), weight) entry."""
    for transition, weight in trans_features:
        source, target = transition
        print(f"{source!s:<6} -> {target!s:<7} {weight:.6f}")

# Rank the transition weights once (highest first) and show both ends.
transition_ranking = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(transition_ranking[:20])

print("\nTop unlikely transitions:")
print_transitions(transition_ranking[-20:])

## Check the state features:


In [None]:
def print_state_features(state_features):
    """Print one line per ((attr, label), weight) entry: weight, label, attr."""
    for key, weight in state_features:
        attr, label = key
        print(f"{weight:.6f} {label!s:<8} {attr!s}")

# Rank the state-feature weights once (highest first) and show both ends.
state_ranking = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(state_ranking[:30])

print("\nTop negative:")
print_state_features(state_ranking[-30:])

In [None]:
# The cells below repeat the same pipeline for the GMB data

In [None]:
# Read the data
import time
# Start of the wall-clock timer that is read after the CRF has been fitted.
start = time.time()
def read_data(filename):
    """Load a token/tag file from ``ner/GMB`` and attach POS tags.

    Args:
        filename: Name of the split file inside ``ner/GMB``
            (e.g. ``'train'``).

    Returns:
        A DataFrame with one row per token and the columns ``term``,
        ``entitytags`` and ``pos``; ``pos`` holds the tag string that
        ``nltk.pos_tag`` assigned to the token.
    """
    rows = []
    with open(f'ner/GMB/{filename}', encoding="utf8") as f:
        # Iterate the handle directly; readlines() would load the whole file.
        for line in f:
            if len(line) < 2:
                continue
            fields = line.rstrip('\n').split()
            if not fields:
                # Whitespace-only line: there is no token to keep.
                continue
            rows.append(fields)
    data = pd.DataFrame(rows, columns=['term', 'entitytags'])
    # Tag all tokens in one call and keep only the tag of each (token, tag)
    # pair. Assigning the finished list replaces the former per-row chained
    # assignment (data["pos"][i] = ...), which triggers SettingWithCopyWarning
    # and does not write through when pandas copy-on-write is active.
    data["pos"] = [tag for _, tag in nltk.pos_tag(data["term"].tolist())]
    return data


In [None]:
# Load the three GMB splits in the order train, test, dev.
train, test, dev = [read_data(split_name) for split_name in ('train', 'test', 'dev')]

In [None]:
# process to get the train, test, dev dataset for crf
def process_data(data):
    """Group token rows into sentences, closing a sentence at each '.' token.

    Args:
        data: DataFrame whose rows are ``(term, entitytags, pos)``.

    Returns:
        A list of sentences, each a list of ``(term, entitytags, pos)``
        tuples. The '.' token is kept as the last element of its sentence.
        Tokens that follow the final '.' are returned as one last sentence
        instead of being dropped.
    """
    dataset = []
    sent = []
    # itertuples yields plain tuples and avoids building a Series per row.
    for term, entitytags, pos in data.itertuples(index=False, name=None):
        sent.append((term, entitytags, pos))
        if term == '.':
            dataset.append(sent)
            sent = []
    if sent:
        # Trailing tokens without a closing '.' still form a sentence.
        dataset.append(sent)
    return dataset


In [None]:
# Turn each token table into a list of sentences (train, test, dev order).
train_sents, test_sents, dev_sents = [
    process_data(frame) for frame in (train, test, dev)
]

In [None]:
def _token_features(token, pos, name):
    """Return the feature dict of one token, with keys labelled by `name`."""
    return {
        f"{name}.isupper()": token.isupper(),
        f"{name}.lower()": token.lower(),
        f"{name}.postags": pos,
        f"{name}.istitle()": token.istitle(),
        f"{name}.isdigit()": token.isdigit(),
        f"{name}[-3:]": token[-3:],
        f"{name}[:2]": token[:2],
        f"len_{name}": len(token),
    }


def word2features(sent, i):
    """Build the CRF feature dict for the token at position `i` of `sent`.

    Args:
        sent: Sentence as a list of ``(term, entitytag, pos)`` tuples.
        i: Index of the token to describe.

    Returns:
        A dict with the features of the token itself (``word...`` keys), of
        the previous token (``word1...`` keys, or ``BOS`` when `i` is the
        first position) and of the next token (``word2...`` keys, or ``EOS``
        when `i` is the last position).
    """
    features = _token_features(sent[i][0], sent[i][2], "word")

    if i > 0:
        features.update(_token_features(sent[i - 1][0], sent[i - 1][2], "word1"))
    else:
        features["BOS"] = True

    if i < len(sent) - 1:
        # The POS tag must come from the NEXT token (i + 1); it used to be
        # read from sent[i - 1], i.e. the previous token.
        features.update(_token_features(sent[i + 1][0], sent[i + 1][2], "word2"))
    else:
        features["EOS"] = True

    return features


def sent2features(sent):
    """Return the list of feature dicts for `sent`, one per token position."""
    positions = range(len(sent))
    return list(map(lambda position: word2features(sent, position), positions))


def sent2labels(sent):
    """Return the entity tag (second field) of every token triple in `sent`."""
    collected = []
    for _term, entity_tag, _pos in sent:
        collected.append(entity_tag)
    return collected


def sent2tokens(sent):
    """Return the surface form (first field) of every token triple in `sent`."""
    collected = []
    for term, _entity_tag, _pos in sent:
        collected.append(term)
    return collected


In [None]:
# Feature dicts (X) and tag sequences (y) for each split, one entry per sentence.
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

X_dev = list(map(sent2features, dev_sents))
y_dev = list(map(sent2labels, dev_sents))


In [None]:

# CRF trained with L-BFGS; c1 and c2 are the regularisation coefficients
# passed to sklearn-crfsuite.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
try:
    crf.fit(X_train, y_train)
except AttributeError as err:
    # Depending on the installed library versions fit() can raise an
    # AttributeError. The notebook deliberately carries on, but the error is
    # reported rather than swallowed so a genuine failure stays visible.
    print(f"crf.fit raised AttributeError (ignored): {err}")

# Whole minutes elapsed since `start` was recorded.
end = time.time() - start
print(end // 60)

In [None]:
# Every class the fitted model knows, minus the 'O' tag.
labels = [*crf.classes_]
labels.remove('O')
labels


In [None]:
# Predict the test sentences, then show the weighted F1 over `labels`.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(
    y_test,
    y_pred,
    labels=labels,
    average='weighted',
)


In [None]:
# Token strings and gold tag sequences of every test sentence.
# NOTE: this rebinds `labels`, which previously held the CRF class names.
words = list(map(sent2tokens, test_sents))
labels = list(map(sent2labels, test_sents))