In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import random 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.metrics import classification_report
import os
from sklearn.model_selection import train_test_split

import tensorflow as tf
# Walk the Kaggle input directory tree and print the full path of every file in it.
INPUT_ROOT = '/kaggle/input'
for dirname, _, filenames in os.walk(INPUT_ROOT):
    paths_in_dir = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in paths_in_dir:
        print(full_path)
        
        

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a name="Introduction"></a>
# Introduction

NER is a subtask of information extraction that locates and classifies named entities in a text. The named entities could be organizations, persons, locations, times, etc. 

**For example:**

    COVID-19  has  ensured  low  key  August  15      celebrations  in  India
    O         O    O        O    O    B-time  I-time  O             O   B-geo

The sentence above is labeled as follows:

- August 15 - time indicator
- India - geographic indicator

Everything else that is labeled with an `O` is not considered to be a named entity. 

<a name="Exploring Data"></a>
# Exploring the Dataset

The dataset provided on Kaggle consists of four columns: the sentence number, the word, the part of speech of the word, and the tag associated with each word.
We will have to preprocess the dataset so that every sentence becomes one row, and then tokenise and encode the sentences.



In [1]:
# Read the annotated corpus CSV; the file is decoded as Latin-1.
NER_CSV_PATH = "/kaggle/input/entity-annotated-corpus/ner_dataset.csv"
data = pd.read_csv(NER_CSV_PATH, encoding="latin1")
data.head()

In [1]:
# Forward-fill missing "Sentence #" values so that every row carries the
# most recent sentence identifier seen above it.
filled_sentence_ids = data['Sentence #'].ffill()
data['Sentence #'] = filled_sentence_ids
data.head()

We have to create a list of sentences and list of NER Tags associated with each sentence

In [1]:
def agg_func(s):
    """Return the rows of `s` as a list of (word, POS, tag) tuples.

    `s` must expose "Word", "POS" and "Tag" columns; the three columns are
    read in row order and zipped together element by element.
    """
    words = s["Word"].values.tolist()
    pos_labels = s['POS'].values.tolist()
    ner_labels = s["Tag"].values.tolist()
    return list(zip(words, pos_labels, ner_labels))

In [1]:
# Group the token rows by sentence id and collapse each group into one list
# of (word, POS, tag) tuples; the unnamed result column (0) gets a readable name.
triples_per_sentence = data.groupby(['Sentence #']).apply(agg_func)
agg_data = (
    triples_per_sentence
    .reset_index()
    .rename(columns={0: 'Sentence_POS_Tag_Pair'})
)
agg_data.head()

In [1]:
def _join_field(position):
    """Build a function that space-joins element `position` of every tuple in a sentence."""
    return lambda sentence: " ".join([s[position] for s in sentence])


# Each tuple is (word, POS, tag): positions 0, 1 and 2 feed the three string columns.
for column_name, tuple_position in (('Sentence', 0), ('POS', 1), ('Tag', 2)):
    agg_data[column_name] = agg_data['Sentence_POS_Tag_Pair'].apply(_join_field(tuple_position))

In [1]:
# (rows, columns) of the per-sentence frame built by the group-by above.
agg_data.shape

In [1]:
# Preview the frame, now including the joined Sentence, POS and Tag string columns.
agg_data.head()

In [1]:
# Split the space-joined strings back into lists on whitespace:
# one list of tokens and one list of tags per sentence.
agg_data['tokenised_sentences'] = agg_data['Sentence'].map(str.split)
agg_data['tag_list'] = agg_data['Tag'].map(str.split)
agg_data.head()

In [1]:
# Record token and tag counts per sentence and flag (1/0) whether they agree.
agg_data['len_sentence'] = agg_data['tokenised_sentences'].map(len)
agg_data['len_tag'] = agg_data['tag_list'].map(len)
counts_agree = agg_data['len_sentence'] == agg_data['len_tag']
agg_data['is_equal'] = counts_agree.astype('int64')
agg_data['is_equal'].value_counts()

In [1]:
# Keep only the sentences whose token count matches their tag count.
agg_data = agg_data.loc[agg_data['is_equal'] != 0]

In [1]:
# Shape after dropping the rows whose token and tag counts differ.
agg_data.shape

In [1]:
# Pull the sentence strings and the per-sentence tag lists out as plain Python lists.
sentences_list = list(agg_data['Sentence'])
tags_list = list(agg_data['tag_list'])

sentence_count = len(sentences_list)
counts_match = sentence_count == len(tags_list)
print("Number of Sentences in the Data ", sentence_count)
print("Are number of Sentences and Tag list equal ", counts_match)

In [1]:
# Tag sequence of the first sentence.
tags_list[0]

<a name="Tokenising the Sentence"></a>

### Tokenising the sentences

- We have to create a vocabulary of words and assign each word a unique identifier. Keras provides a Tokenizer API that can be used for this purpose; it both tokenises and encodes the sentences.

- Also, since sentences have different lengths, we will have to pad every input sequence to a common length (at least the length of the longest sentence). Keras provides the `pad_sequences` function to achieve this.


#### Using Tokeniser To Encode the Sentences

In [1]:
# Case is preserved (lower=False) and no characters are filtered out (filters=''),
# so the sentences are passed to the tokeniser exactly as they were joined above.
KerasTokenizer = tf.keras.preprocessing.text.Tokenizer
tokeniser = KerasTokenizer(filters='', lower=False)
tokeniser.fit_on_texts(sentences_list)


In [1]:
# One extra slot is added because index 0 is reserved for padding.
vocab_size_with_padding = len(tokeniser.word_index) + 1
print("Vocab size of Tokeniser ", vocab_size_with_padding)

In [1]:
# Spot-check the index -> word lookup (326 is presumably just an arbitrary index).
tokeniser.index_word[326]

In [1]:
# Convert every sentence into its sequence of integer word ids.
encoded_sentence = tokeniser.texts_to_sequences(sentences_list)

# Sanity-check the first sentence: the encoding should have one id per token.
first_raw_sentence = sentences_list[0]
first_id_sequence = encoded_sentence[0]
token_counts_agree = len(first_raw_sentence.split()) == len(first_id_sequence)

print("First Original Sentence ", first_raw_sentence)
print("First Encoded Sentence ", first_id_sequence)
print("Is Length of Original Sentence Same as Encoded Sentence ", token_counts_agree)
print("Length of First Sentence ", len(first_id_sequence))


#### Creating Tag Mapping and Encoding the Tags

In [1]:
# Collect the distinct tags and give each one an integer id.
# The tags are sorted so the tag -> index mapping is identical on every run:
# the iteration order of a set of strings is not stable across interpreter
# sessions, which would otherwise make encoded labels (and the layout of the
# model's output layer) non-reproducible.
tags = sorted(set(data['Tag'].values))
print(tags)
num_tags = len(tags)
print("Number of Tags ", num_tags)

tags_map = {tag: i for i, tag in enumerate(tags)}
print("Tags Map ", tags_map)

As we can see in the tags map above, every entity has a prefix B- or I-. B- denotes the **beginning** and I- **inside** of an entity. The prefixes are used to detect multiword entities.

In [1]:
# Inverse lookup: integer tag id -> tag string.
reverse_tag_map = dict(zip(tags_map.values(), tags_map.keys()))

In [1]:
# Replace every tag string with its integer id, sentence by sentence.
encoded_tags = [
    [tags_map[label] for label in sentence_tags]
    for sentence_tags in tags_list
]

# Sanity-check the first sentence: one encoded id per original tag.
first_tag_strings = tags_list[0]
first_tag_ids = encoded_tags[0]
tag_counts_agree = len(first_tag_strings) == len(first_tag_ids)

print("First Sentence ", sentences_list[0])
print('First Sentence Original Tags ', first_tag_strings)
print("First Sentence Encoded Tags ", first_tag_ids)
print("Is length of Original Tags and Encoded Tags same ", tag_counts_agree)
print("Length of Tags for First Sentence ", len(first_tag_ids))

#### Understanding the Length of Sentences - Identifying appropriate SEQ LENGTH

In [1]:
# Longest sentence, measured in whitespace-separated tokens.
tokens_per_sentence = [len(sentence.split()) for sentence in sentences_list]
max_sentence_length = max(tokens_per_sentence)
print(max_sentence_length)

From the histogram of sentence lengths, we can see that 50 would be a good sequence length, and that 104 is the maximum sentence length. Instead of using 104 as the sequence length, let us choose the next power of two as the sequence length - in this case 128.

We now have to pad the encoded sentences and the encoded tags. For the encoded sentences we can use 0 as the padding token, and for the tags we can use the index of the 'O' tag.

In [1]:
max_len = 128
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Pad at the end ("post") up to max_len: word ids are padded with 0,
# tag ids are padded with the id of the 'O' tag.
padded_encoded_sentences = pad_sequences(
    encoded_sentence, maxlen=max_len, padding="post", value=0
)
padded_encoded_tags = pad_sequences(
    encoded_tags, maxlen=max_len, padding="post", value=tags_map['O']
)

# Report the resulting shapes and show the first sentence before/after padding.
for message, shown_value in (
    ("Shape of Encoded Sentence ", padded_encoded_sentences.shape),
    ("Shape of Encoded Labels ", padded_encoded_tags.shape),
):
    print(message, shown_value)

for message, shown_value in (
    ("First Encoded Sentence Without Padding ", encoded_sentence[0]),
    ("First Encoded Sentence with padding ", padded_encoded_sentences[0]),
    ("First Sentence Encoded Label without Padding ", encoded_tags[0]),
    ("First Sentence Encoded Label with Padding ", padded_encoded_tags[0]),
):
    print(message, shown_value)

In [1]:
# One-hot encode the whole padded tag matrix in a single call.
# to_categorical appends a class axis to an integer array of any rank, so the
# 2-D tag matrix becomes one 3-D array (sentence, position, class) instead of
# a Python list of per-sentence arrays that later has to be re-stacked with
# np.array. Indexing, len() and train_test_split behave the same on the array.
target = to_categorical(padded_encoded_tags, num_classes=num_tags)
print("Shape of Labels  after converting to Categorical for first sentence ",target[0].shape)

<a name="Splitting Data into Train, Test and Validation"></a>

### Splitting the data into Train,Test and Validation Sets

In [1]:
from sklearn.model_selection import train_test_split

# First hold out 30% of the data, then split that hold-out 80/20 into
# validation and test sets. Both splits use the same fixed random_state.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    padded_encoded_sentences, target, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.2, random_state=42
)

for message, shown_value in (
    ("Input Train Data Shape ", X_train.shape),
    ("Train Labels Length ", len(y_train)),
    ("Input Test Data Shape ", X_test.shape),
    ("Test Labels Length ", len(y_test)),
):
    print(message, shown_value)

for message, shown_value in (
    ("Input Validation Data Shape ", X_val.shape),
    ("Validation Labels Length ", len(y_val)),
):
    print(message, shown_value)

In [1]:
# Show only the shapes of the first training example and its label
# (the full arrays are too long to print usefully).
first_train_input, first_train_label = X_train[0], y_train[0]
print("Shape of First Sentence -Train", first_train_input.shape)
print("Shape of First Sentence Label  -Train", first_train_label.shape)

<a name="Model Building Using LSTM"></a>

## Building the Model using LSTM

In [1]:
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D,Bidirectional

In [1]:
# Hyperparameters of the tagger.
embedding_dim=128
# Word ids run from 1 to len(word_index); the +1 accounts for id 0 (padding),
# so vocab_size is already the number of rows the embedding table needs.
vocab_size=len(tokeniser.word_index)+1
lstm_units=128
max_len=128

# Architecture: word ids -> embeddings -> LSTM over the sequence ->
# per-position softmax over the tag set.
input_word = Input(shape = (max_len,))
# input_dim is vocab_size (not vocab_size+1, which would count padding twice).
# The sequence length comes from the Input layer, so the deprecated
# input_length argument is not passed.
embedded_words = Embedding(input_dim = vocab_size,output_dim = embedding_dim)(input_word)

# return_sequences=True keeps one output vector per position so that every
# token receives its own tag prediction. The layer width is taken from lstm_units.
sequence_features = LSTM(units=lstm_units,return_sequences=True)(embedded_words)
out = TimeDistributed(Dense(num_tags,activation = 'softmax'))(sequence_features)
model = Model(input_word,out)
model.summary()

In [1]:
# Labels are one-hot encoded, hence categorical cross-entropy as the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [1]:
# Labels are converted to arrays before being handed to Keras.
train_label_array = np.array(y_train)
val_label_array = np.array(y_val)
history = model.fit(
    X_train,
    train_label_array,
    validation_data=(X_val, val_label_array),
    batch_size=32,
    epochs=3,
)


<a name="Model Evaluation"></a>

## Evaluating the Model

To evaluate the model, we will have to remove the padded portion and then compute the accuracy. To do this, for every test sentence, let us create a dataframe with the tokens and their actual and predicted tags - and use it to calculate the metrics.

In [1]:
# Run the trained model on the held-out test inputs; evaluatePredictions later
# takes argmax over axis 2 of this output to obtain one tag id per position.
preds=model.predict(X_test) ## Predict using model on Test Data

In [1]:

def evaluatePredictions(test_data,preds,actual_preds,*,index_word=None,tag_lookup=None):
    """Build a token-level table of actual vs. predicted tags, ignoring padding.

    Parameters
    ----------
    test_data : 2-D array of integer word ids, one row per sentence; id 0 is
        treated as padding and excluded from the result.
    preds : 3-D array of per-position class scores; the predicted tag id is the
        argmax over axis 2.
    actual_preds : 3-D array (or list of 2-D arrays) of per-position label
        scores; the actual tag id is the argmax over axis 2.
    index_word : optional mapping word id -> token string. Defaults to the
        notebook-level ``tokeniser.index_word``.
    tag_lookup : optional mapping tag id -> tag string. Defaults to the
        notebook-level ``reverse_tag_map``.

    Returns
    -------
    (pred_data, accuracy)
        ``pred_data`` has one row per non-padding token with the columns
        ``test_tokens``, ``tokens``, ``actual_target_index``,
        ``pred_target_index``, ``actual_target_tag``, ``pred_target_tag`` and
        ``id`` (1-based sentence number); its index is the token's position
        inside its sentence. ``accuracy`` is the fraction of rows whose actual
        and predicted tags agree, or NaN when there are no non-padding tokens.

    Raises
    ------
    KeyError
        If a word id or tag id is missing from the corresponding mapping.
    """
    # Fall back to the notebook globals only when no explicit mapping is given.
    if index_word is None:
        index_word=tokeniser.index_word
    if tag_lookup is None:
        tag_lookup=reverse_tag_map

    test_data=np.asarray(test_data)
    print("Shape of Test Data Array",test_data.shape)
    y_actual=np.argmax(np.asarray(actual_preds),axis=2)
    y_pred=np.argmax(np.asarray(preds),axis=2)
    num_test_data=test_data.shape[0]
    print("Number of Test Data Points ",num_test_data)

    # Select the non-padding positions of all sentences at once instead of
    # building one DataFrame per sentence. np.nonzero walks the mask in
    # row-major order, so rows stay grouped by sentence and ordered by position.
    is_real_token=test_data!=0
    sentence_idx,position_idx=np.nonzero(is_real_token)
    token_ids=test_data[is_real_token]
    actual_ids=y_actual[is_real_token]
    pred_ids=y_pred[is_real_token]

    pred_data=pd.DataFrame(
        {
            'test_tokens':token_ids,
            'tokens':[index_word[token_id] for token_id in token_ids.tolist()],
            'actual_target_index':actual_ids,
            'pred_target_index':pred_ids,
            'actual_target_tag':[tag_lookup[tag_id] for tag_id in actual_ids.tolist()],
            'pred_target_tag':[tag_lookup[tag_id] for tag_id in pred_ids.tolist()],
            'id':sentence_idx+1,
        },
        index=position_idx,
    )

    # Guard against division by zero when every position is padding.
    total_tokens=len(pred_data)
    if total_tokens==0:
        return pred_data,float('nan')

    num_correct=int((pred_data['actual_target_tag']==pred_data['pred_target_tag']).sum())
    accuracy=num_correct/total_tokens

    return pred_data,accuracy
        

In [1]:
# Token-level comparison on the test set: pred_data holds one row per
# non-padding token, accuracy is the share of tokens tagged correctly.
pred_data,accuracy=evaluatePredictions(X_test,preds,y_test)

### Let us calculate the precision and recall of each Class - and also the F-Score

In [1]:
# Flatten the predicted and actual tag strings into plain lists for scikit-learn.
y_pred = list(pred_data['pred_target_tag'])
y_actual = list(pred_data['actual_target_tag'])

In [1]:
# Per-tag precision, recall and F-score over the non-padding test tokens.
tag_report = classification_report(y_actual, y_pred)
print(tag_report)

In [1]:
# Inspect every test token whose actual tag is "B-art" next to its predicted tag.
pred_data[pred_data['actual_target_tag']=="B-art"]

The model has failed to learn the tags associated with works of art. In a few cases where the entity contains a word like "France" or "Washington", it has classified the word as a place.

In [1]:
# Inspect every test token whose actual tag is "B-nat" next to its predicted tag.
# NOTE(review): the markdown that follows discusses B-GPE / I-GPE tags, but this
# filter selects "B-nat" - confirm which tag was meant to be shown here.
pred_data[pred_data['actual_target_tag']=="B-nat"]

Also, while the model has learnt to classify words as B-GPE, it has failed to understand I-GPE Words. For example in African American => it classifies African as B-GPE but classifies American as B-ORG instead of I-GPE

<a name="Conclusion and Future Steps"></a>

## How to Improve the Model

- Currently the model looks only at the previous words to predict the tag, and not at the words after it. This could also be one reason why I-GPE tags are not classified properly. To avoid this, we can use a Bidirectional LSTM.

- Using pre-trained word Embeddings can also help improve the model. 

- Using BERT like models for Fine-Tuning

- We can also make changes to the model hyperparameters like epochs, number of LSTM Units, Activation Function used etc


