<a href="https://colab.research.google.com/github/shreyab375/Natural-Language-Processing/blob/main/Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
!pip install pycrf
!pip install sklearn-crfsuite
import nltk
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from nltk.tokenize import word_tokenize
model = spacy.load("en_core_web_sm")
import pandas as pd




In [27]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

##Data Preprocessing

The dataset provided is in the form of one word per line.

Suppose there are x words in a sentence, then there will be x continuous lines with one word in each line.
Further, the two sentences are separated by empty lines. The labels for the data follow the same format.

In [15]:
 #Reading the train and test sentences and labels
with open('train_sent', 'r') as train_sent_file:
  train_sent = train_sent_file.readlines()

with open('train_label', 'r') as train_labels_file:
  train_labels = train_labels_file.readlines()

with open('test_sent', 'r') as test_sent_file:
  test_sent = test_sent_file.readlines()

with open('test_label', 'r') as test_labels_file:
  test_labels = test_labels_file.readlines()

In [17]:
train_sent[:5]

['All\n', 'live\n', 'births\n', '>\n', 'or\n']

In [18]:
# Sanity check to see that the number of tokens and no. of corresponding labels match.
print("Count of tokens in training set\n","No. of words: ",len(train_words),"\nNo. of labels: ",len(train_labels_by_word))
print("\n\nCount of tokens in test set\n","No. of words: ",len(test_words),"\nNo. of labels: ",len(test_labels_by_word))

Count of tokens in training set
 No. of words:  48501 
No. of labels:  48501


Count of tokens in test set
 No. of words:  19674 
No. of labels:  19674


In [19]:
def convert_to_sentences(dataset):
    sent_list = []
    sent = ""
    for entity in dataset:
        if entity != '\n':
            sent = sent + entity[:-1] + " "       # Adding word/label to current sentence / sequence of labels
        else:
            sent_list.append(sent[:-1])           # Getting rid of the space added after the last entity.
            sent = ""
    return sent_list

In [20]:
# Converting tokens to sentences and individual labels to sequences of corresponding labels.
train_sentences = convert_to_sentences(train_sent)
train_labels = convert_to_sentences(train_labels)
test_sentences = convert_to_sentences(test_sent)
test_labels = convert_to_sentences(test_labels)

In [21]:
train_sentences[:5]

['All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )',
 'The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )',
 'Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )',
 "The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )",
 "Arrest of dilation was the most common indication in both `` corrected '' subgroups ( 23.4 and 24.6 % , respectively )"]

In [22]:
print("Number of sentences in the train dataset: {}".format(len(train_sentences)))
print("Number of sentences in the test dataset: {}".format(len(test_sentences)))

Number of sentences in the train dataset: 2599
Number of sentences in the test dataset: 1056


In [23]:
print("Number of lines of labels in the train dataset: {}".format(len(train_labels)))
print("Number of lines of labels in the test dataset: {}".format(len(test_labels)))

Number of lines of labels in the train dataset: 2599
Number of lines of labels in the test dataset: 1056


In [29]:
#Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency
noun_propn = []         # Initiating list for nouns and proper nouns
pos_tag = []            # initiating list for corresponding PoS tags.
for sent in train_sent:
    for token in model(sent):
        if token.pos_ in ['NOUN', 'PROPN']:
           noun_propn.append(token.text)
           pos_tag.append(token.pos_)
print("No. of tokens in combined dataset with PoS tag of 'NOUN' or 'PROPN': {}".format(len(noun_propn)))

No. of tokens in combined dataset with PoS tag of 'NOUN' or 'PROPN': 17110


In [32]:
noun_pos = pd.DataFrame({"Noun": noun_propn, "POS":pos_tag})

In [35]:
noun_pos["Noun"].value_counts().head(25)

patients        354
treatment       195
cancer          135
therapy         120
disease         106
cell             99
lung             87
chemotherapy     65
group            63
effects          61
patient          59
gene             55
TO_SEE           55
results          54
surgery          51
survival         50
risk             49
women            48
activity         47
children         47
=                47
analysis         47
cases            46
rate             46
primary          46
Name: Noun, dtype: int64

In [91]:
def POS_tagger(sentences, position):
    sentence = " ".join(sentences)  # Convert the list of words into a string
    words = sentence.split()  # Split the string into individual words
    word = words[position]  # Get the word at the specified position
    posit = 0
    for token in model(sentence):
        pos = token.pos_
        if (token.text == word) and (posit == position):
            break
        posit += 1
    return pos

def getFeaturesForOneWord(sent_list, position):
    sentence = " ".join(sent_list)  # Convert the list of words into a string
    word = sent_list[position]  # Get the word at the specified position
    features = [
        'word.lower=' + word.lower(),  # serves as word id
        'word.postag=' + POS_tagger(sent_list, position),  # PoS tag of current word
        'word[-3:]=' + word[-3:],  # last three characters
        'word[-2:]=' + word[-2:],  # last two characters
        'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
        'word.isdigit=%s' % word.isdigit(),  # is the word a number
        'words.startsWithCapital=%s' % word[0].isupper()  # is the word starting with a capital letter
    ]

    if position > 0:
        prev_word = sent_list[position - 1]
        features.extend([
            'prev_word.lower=' + prev_word.lower(),  # previous word
            'prev_word.postag=' + POS_tagger(sent_list, position - 1),  # PoS tag of previous word
            'prev_word.isupper=%s' % prev_word.isupper(),  # is the previous word in all uppercase
            'prev_word.isdigit=%s' % prev_word.isdigit(),  # is the previous word a number
            'prev_words.startsWithCapital=%s' % prev_word[0].isupper()  # is the previous word starting with a capital letter
        ])
    else:
        features.append('BEG')  # feature to track begin of sentence

    if position == len(sent_list) - 1:
        features.append('END')  # feature to track end of sentence

    return features

In [92]:
X = getFeaturesForOneWord(train_sentences[5], 1)
X

['word.lower=e',
 'word.postag=PROPN',
 'word[-3:]=e',
 'word[-2:]=e',
 'word.isupper=False',
 'word.isdigit=False',
 'words.startsWithCapital=False',
 'prev_word.lower=c',
 'prev_word.postag=PROPN',
 'prev_word.isupper=True',
 'prev_word.isdigit=False',
 'prev_words.startsWithCapital=True']

In [93]:
#  get features for a sentence.
def getFeaturesForOneSentence(sentence):
  sentence_list = sentence.split()
  return [getFeaturesForOneWord(sentence_list, position) for position in range(len(sentence_list))]

In [94]:
 #Checking feature extraction
example_sentence = train_sentences[5]
print(example_sentence)

features = getFeaturesForOneSentence(example_sentence)
features[0]

Cesarean rates at tertiary care hospitals should be compared with rates at community hospitals only after correcting for dissimilar patient groups or gestational age


['word.lower=cesarean',
 'word.postag=ADJ',
 'word[-3:]=ean',
 'word[-2:]=an',
 'word.isupper=False',
 'word.isdigit=False',
 'words.startsWithCapital=True',
 'BEG']

In [95]:
features[4]

['word.lower=care',
 'word.postag=NOUN',
 'word[-3:]=are',
 'word[-2:]=re',
 'word.isupper=False',
 'word.isdigit=False',
 'words.startsWithCapital=False',
 'prev_word.lower=tertiary',
 'prev_word.postag=ADJ',
 'prev_word.isupper=False',
 'prev_word.isdigit=False',
 'prev_words.startsWithCapital=False']

In [96]:
# Write a code to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
  return labels.split()

In [97]:
example_labels = getLabelsInListForOneSentence(train_labels[5])
print(example_labels)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [98]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]

In [99]:
Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]

In [103]:
# Building the CRF model. Using max_iterations as 200.
crf = sklearn_crfsuite.CRF(max_iterations=300)

try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass

In [104]:
Y_pred = crf.predict(X_test)

In [105]:
metrics.flat_f1_score(Y_test, Y_pred, average='weighted')

0.9097748874746256