In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
np.random.seed(0)
plt.style.use("ggplot")

import tensorflow as tf
print('Tensorflow version:', tf.__version__)
print('GPU detected:', tf.config.list_physical_devices('GPU'))

Tensorflow version: 2.9.1
GPU detected: []


In [2]:
#Loading data
data= pd.read_csv("https://raw.githubusercontent.com/yrnigam/Named-Entity-Recognition-NER-using-LSTMs/master/ner_dataset.csv",encoding="latin1")

In [3]:
#Looking for null values
print(data.isnull().sum())

Sentence #    1000616
Word                0
POS                 0
Tag                 0
dtype: int64


In [4]:
#Filling Null Values
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
print("Unique Words in corpus:",data['Word'].nunique())
print("Unique Tag in corpus:",data['Tag'].nunique())

Unique Words in corpus: 35178
Unique Tag in corpus: 17


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

class NERTextPrepper():
    def __init__(self, data, max_len=50):
        self.words = list(set(data['Word'].values))
        self.words.append("ENDPAD")
        self.num_words = len(self.words)
        
        self.tags = list(set(data['Tag'].values))
        self.num_tags = len(self.tags)
        
        self.word2idx =  {w : i+1 for i,w in enumerate(self.words)}
        self.tag2idx  =  {t : i for i,t in enumerate(self.tags)}
        
        self.n_sent = 1 #counter
        self.data = data
        agg_func = lambda s:[(w,t) for w,t in zip(s['Word'].tolist(),s['Tag'].tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
        self.max_len = 50
        
        self.X = [[self.word2idx[w[0]] for w in s]for s in self.sentences]
        self.X = pad_sequences(maxlen=self.max_len , sequences=self.X, padding='post', value=self.num_words-1)

        self.y = [[self.tag2idx[w[1]] for w in s]for s in self.sentences]
        self.y = pad_sequences(maxlen=max_len , sequences=self.y, padding='post', value=self.tag2idx["O"])
        self.y = [to_categorical(i, num_classes=self.num_tags) for i in self.y]

In [7]:
ner = NERTextPrepper(data)

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(ner.X, ner.y, test_size=0.1, random_state=1)

In [9]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

input_word = Input(shape = (ner.max_len,))
model = Embedding(input_dim=ner.num_words,output_dim=ner.max_len,input_length=ner.max_len)(input_word)
model = SpatialDropout1D(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(ner.num_tags,activation='softmax'))(model)
model = Model(input_word,out)
model.summary()

model.compile(optimizer="adam",loss='categorical_crossentropy',metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_accuracy',patience=1,verbose=0,mode='max',restore_best_weights=False)
callbacks = [early_stopping]

history = model.fit(
    x_train,np.array(y_train),
    validation_split = 0.2,
    batch_size = 64,
    epochs = 5,
    verbose = 1,
    callbacks=callbacks
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding (Embedding)       (None, 50, 50)            1758950   
                                                                 
 spatial_dropout1d (SpatialD  (None, 50, 50)           0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 50, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 50, 17)           3417      
 ibuted)                                                         
                                                             

In [10]:
model.evaluate(x_test, np.array(y_test))



[0.051496170461177826, 0.9851084351539612]

In [11]:
y_pred_probs = model.predict(np.array(x_test))



In [12]:
y_true = np.argmax(np.array(y_test), axis=-1)
y_true_flat = [item for sublist in y_true for item in sublist]

y_pred = np.argmax(y_pred_probs, axis=-1)
y_pred = [item for sublist in y_pred for item in sublist]

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_true_flat, y_pred, target_names=ner.tags))

              precision    recall  f1-score   support

           O       0.99      1.00      1.00    223746
       B-nat       0.00      0.00      0.00        14
       B-eve       1.00      0.09      0.16        34
       B-tim       0.93      0.85      0.89      2114
       I-geo       0.83      0.74      0.78       760
       I-eve       0.00      0.00      0.00        30
       I-tim       0.84      0.66      0.74       684
       B-gpe       0.95      0.94      0.95      1604
       I-gpe       0.89      0.32      0.47        25
       I-org       0.77      0.78      0.77      1629
       B-org       0.78      0.71      0.75      1948
       I-art       0.00      0.00      0.00        49
       B-per       0.84      0.80      0.82      1651
       B-geo       0.88      0.87      0.87      3765
       I-per       0.88      0.81      0.84      1695
       B-art       0.00      0.00      0.00        48
       I-nat       0.00      0.00      0.00         4

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
