https://github.com/shubham-IISc/Named-Entity-Recognition-using-CRF

In [1]:
import nltk
import numpy as np
from sklearn.metrics import make_scorer
from nltk.corpus import wordnet as wn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
import scipy

# Shared English Snowball stemmer; word_to_features uses it for the 'word_stem' feature.
sno = nltk.stem.SnowballStemmer('english')
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation/convergence warnings from sklearn -- consider narrowing.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the raw corpus: every element of `data` is one line of the file,
# with its trailing newline kept.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
ner_path = "/Users/sdeshpande/Desktop/text_analysis_scripts/ner.txt"
with open(ner_path, 'r') as corpus_file:
    data = list(corpus_file)

In [3]:
# Group the raw lines into sentences.
# `docs` is a list of sentences; each sentence is a list of (token, label)
# tuples. A blank (or whitespace-only) line closes the current sentence.
docs = []
doc = []
for sent in data:
    if not sent.strip():
        # Separator line. Testing the stripped text also covers "\r\n" and
        # lines holding only spaces, which would otherwise fail to unpack below.
        # Repeated separators must not produce empty sentences.
        if doc:
            docs.append(doc)
            doc = []
    else:
        # Each content line must hold exactly two whitespace-separated fields.
        word1, word2 = sent.split()
        # Only the last (up to) two characters of the tag field are kept as the label.
        doc.append((word1, word2[-2:]))

# Flush the final sentence when the file does not end with a blank line;
# without this the last sentence would be silently dropped.
if doc:
    docs.append(doc)
    doc = []


In [4]:
# Sanity check: show the first parsed sentence as (token, label) pairs.
print(docs[0])

[('All', 'O'), ('live', 'O'), ('births', 'O'), ('>', 'O'), ('or', 'O'), ('=', 'O'), ('23', 'O'), ('weeks', 'O'), ('at', 'O'), ('the', 'O'), ('University', 'O'), ('of', 'O'), ('Vermont', 'O'), ('in', 'O'), ('1995', 'O'), ('(', 'O'), ('n', 'O'), ('=', 'O'), ('2395', 'O'), (')', 'O'), ('were', 'O'), ('retrospectively', 'O'), ('analyzed', 'O'), ('for', 'O'), ('delivery', 'O'), ('route', 'O'), (',', 'O'), ('indication', 'O'), ('for', 'O'), ('cesarean', 'O'), (',', 'O'), ('gestational', 'O'), ('age', 'O'), (',', 'O'), ('parity', 'O'), (',', 'O'), ('and', 'O'), ('practice', 'O'), ('group', 'O'), ('(', 'O'), ('to', 'O'), ('reflect', 'O'), ('risk', 'O'), ('status', 'O'), (')', 'O')]


In [5]:
# Attach a POS tag to every token.
# `data` is rebound here: it becomes a list of sentences, each a list of
# (token, pos_tag, label) triples built from `docs`.
data = []
for doc in docs:
    tokens = [token for token, _ in doc]
    tagged = nltk.pos_tag(tokens)
    # pos_tag returns one (token, tag) pair per input token, so zipping with
    # the original sentence lines the tag up with its gold label.
    data.append([
        (token, tag, label)
        for (token, label), (_, tag) in zip(doc, tagged)
    ])

print(data[0])

[('All', 'DT', 'O'), ('live', 'JJ', 'O'), ('births', 'NNS', 'O'), ('>', 'VBP', 'O'), ('or', 'CC', 'O'), ('=', 'VBP', 'O'), ('23', 'CD', 'O'), ('weeks', 'NNS', 'O'), ('at', 'IN', 'O'), ('the', 'DT', 'O'), ('University', 'NNP', 'O'), ('of', 'IN', 'O'), ('Vermont', 'NNP', 'O'), ('in', 'IN', 'O'), ('1995', 'CD', 'O'), ('(', '(', 'O'), ('n', 'IN', 'O'), ('=', 'NNP', 'O'), ('2395', 'CD', 'O'), (')', ')', 'O'), ('were', 'VBD', 'O'), ('retrospectively', 'RB', 'O'), ('analyzed', 'VBN', 'O'), ('for', 'IN', 'O'), ('delivery', 'NN', 'O'), ('route', 'NN', 'O'), (',', ',', 'O'), ('indication', 'NN', 'O'), ('for', 'IN', 'O'), ('cesarean', 'NN', 'O'), (',', ',', 'O'), ('gestational', 'JJ', 'O'), ('age', 'NN', 'O'), (',', ',', 'O'), ('parity', 'NN', 'O'), (',', ',', 'O'), ('and', 'CC', 'O'), ('practice', 'NN', 'O'), ('group', 'NN', 'O'), ('(', '(', 'O'), ('to', 'TO', 'O'), ('reflect', 'VB', 'O'), ('risk', 'NN', 'O'), ('status', 'NN', 'O'), (')', ')', 'O')]


In [6]:
# Features derived from WordNet

def no_of_contexts(word):
    """Return how many WordNet synsets are found for ``word``."""
    return len(wn.synsets(word))

def contain_digit(word):
    """Return True if at least one character of ``word`` is a digit, else False."""
    return any(ch.isdigit() for ch in word)

In [7]:
#print the report

def showreport(y_test,y_pred):
    """Print a per-label classification report for sequence predictions.

    Both arguments are iterables of label sequences (one sequence per
    sentence). The sequences are flattened and every label is mapped to an
    integer id before being handed to ``classification_report``.

    Args:
        y_test: gold label sequences; every label must be "O", "D" or "T".
        y_pred: predicted label sequences, same label set.

    Raises:
        KeyError: if a label other than "O", "D" or "T" is encountered.
    """
    label_dict = {"O": 0, "D": 1,"T":2}

    # Flatten the predicted sequences into one list of label ids.
    model_output = [label_dict[entity] for row in y_pred for entity in row]
    # Flatten the gold sequences the same way.
    true_output = [label_dict[entity] for row in y_test for entity in row]

    # Pass `labels` explicitly: without it the report infers the classes from
    # the data, and the three target_names no longer match when a class is
    # absent from both lists (e.g. a small split containing no "T").
    print(classification_report(true_output, model_output,
                                labels=list(label_dict.values()),
                                target_names=list(label_dict)))

In [8]:
def word_to_features(sent, i):
    """Build the feature dictionary for the token at index ``i`` of ``sent``.

    ``sent[k][0]`` is read as the token text and ``sent[k][1]`` as its POS
    tag. The result combines features of the token itself with a smaller set
    of features from its left ('-1:' keys) and right ('+1:' keys) neighbours.
    When there is no left neighbour 'BOS' is set; when there is no right
    neighbour 'EOS' is set.

    Args:
        sent: indexable sequence of per-token records (token, POS tag, ...).
        i: position of the token to describe.

    Returns:
        dict mapping feature name to value.
    """
    token = sent[i][0]
    pos = sent[i][1]

    # Features of the current token.
    features = {
        'bias': 1.0,
        'suffix_3': token[-3:],
        'prefix_3': token[:3],
        'wordlen': len(token),
        'word.isupper': token.isupper(),
        # Despite the key name, this is "contains any digit" (see contain_digit).
        'word.isdigit': contain_digit(token),
        'postag': pos,
        'no_of_contexts': no_of_contexts(token),
        'word_stem': sno.stem(token.lower()),
    }

    # Left neighbour, or beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        left_token = sent[i - 1][0]
        left_pos = sent[i - 1][1]
        features['-1:wordlen'] = len(left_token)
        features['-1:word.isupper'] = left_token.isupper()
        features['-1:word.isdigit'] = contain_digit(left_token)
        features['-1:postag'] = left_pos
        features['-1:no_of_contexts'] = no_of_contexts(left_token)

    # Right neighbour, or end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        right_token = sent[i + 1][0]
        right_pos = sent[i + 1][1]
        features['+1:word.isupper'] = right_token.isupper()
        features['+1:wordlen'] = len(right_token)
        features['+1:word.isdigit'] = contain_digit(right_token)
        features['+1:postag'] = right_pos
        features['+1:no_of_contexts'] = no_of_contexts(right_token)

    return features

In [9]:
# Convert every sentence into model inputs:
#   X[s][t] is the feature dictionary of token t in sentence s,
#   Y[s][t] is the gold label of that same token.
X = [
    [word_to_features(sentence, idx) for idx in range(len(sentence))]
    for sentence in data
]
Y = [
    [label for (_, _, label) in sentence]
    for sentence in data
]

In [10]:
# Split the sentences roughly 70 : 10 : 20 into train : dev : test.
# Step 1 holds out 30% of the sentences; step 2 splits that hold-out so
# that 67% of it becomes the test set and 33% the dev set.
split_seed = 4

X_train, X_testanddev, y_train, y_testanddev = train_test_split(
    X, Y, test_size=0.3, random_state=split_seed)

X_test, X_dev, y_test, y_dev = train_test_split(
    X_testanddev, y_testanddev, test_size=0.33, random_state=split_seed)

In [None]:
# Hyper-parameter tuning (adapted from the sklearn-crfsuite examples).
# c1 and c2 are drawn from exponential distributions; each of the 20 sampled
# candidates is scored with 10-fold cross-validation using macro-averaged F1
# over the three labels.
# NOTE(review): no random_state is passed to the search, so the sampled
# candidates (and therefore the selected c1/c2) can differ between runs.
labels = ["D", "T", "O"]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=1000,
    all_possible_transitions=True,
    verbose=False,
)

params_space = dict(
    c1=scipy.stats.expon(scale=0.5),
    c2=scipy.stats.expon(scale=0.05),
)

f1_scorer = make_scorer(metrics.flat_f1_score, labels=labels, average='macro')

rs = RandomizedSearchCV(
    estimator=crf,
    param_distributions=params_space,
    scoring=f1_scorer,
    n_iter=20,
    cv=10,
    n_jobs=1,
    verbose=1,
)
rs.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Fit the final model with fixed hyper-parameters c1=0.055 and c2=0.066,
# then evaluate it on the held-out test sentences.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.055,
    c2=0.066,
    max_iterations=1000,
    all_possible_transitions=True,
    verbose=False,
)
crf.fit(X_train, y_train)
labels = ["O", "D", "T"]

# Predict label sequences for the test sentences.
y_pred = crf.predict(X_test)

# Macro-averaged F1 over the three labels.
macro_f1 = metrics.flat_f1_score(y_test, y_pred, average='macro', labels=labels)
print("F1 score for D, T and O label(average) is %lf "% (macro_f1))

# Per-label precision / recall / F1 table.
showreport(y_test, y_pred)