# Named Entity Recognition(NER)

# Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import ast



# Load data

In [2]:
def loading_data(data_path):
    
    data = pd.read_csv(data_path)
    
    data.dropna(inplace=True)
    print("Number of rows : ",data.shape[0]," and the number of columns : ",data.shape[1])
    
    return data

In [3]:
data = loading_data("/kaggle/input/named-entity-recognition-ner-corpus/ner.csv")

data.head()

Number of rows :  47959  and the number of columns :  4


Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [4]:
data['POS'][0]

"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']"

# Data preprocessing

In [5]:
def preprocess_data(data):
    for i in range(len(data)):
        pos = ast.literal_eval(data['POS'][i])
        tags = ast.literal_eval(data['Tag'][i])
        data['POS'][i] = [str(word) for word in pos]
        data['Tag'][i] = [str(word.upper()) for word in tags]
    return data

In [6]:
data = preprocess_data(data)
data.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,..."


In [7]:
import re

def lower_text(text: str):
    return text.lower()

def remove_punctuation(text: str):
    """
    Substitute all punctiations with space in case of
    "hello!nice to meet you"
    
    If subs with '' -> "hellonice to meet you"
    With ' ' -> "hello nice to meet you"
    """
    text_nopunct = re.sub('[^A-Za-z0-9\s]', '', text)
    return text_nopunct

def remove_multiple_spaces(text: str):
    text_no_doublespace = re.sub('\s+', ' ', text)
    return text_no_doublespace

In [8]:
sample_text = data['Sentence'][3]

_lowered = lower_text(sample_text)
_without_punct = remove_punctuation(_lowered)
_single_spaced = remove_multiple_spaces(_without_punct)

print(sample_text)
print('-'*10)
print(_lowered)
print('-'*10)
print(_without_punct)
print('-'*10)
print(_single_spaced)

Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .
----------
police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .
----------
police put the number of marchers at 10000 while organizers claimed it was 100000 
----------
police put the number of marchers at 10000 while organizers claimed it was 100000 


In [9]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk

stopWords = set(stopwords.words('english'))

def tokenize_text(text: str) -> list[str]:
    return word_tokenize(text)

def remove_stop_words(tokenized_text: list[str]) -> list[str]:
    wordsFiltered = [w for w in tokenized_text if w not in stopWords]
    return wordsFiltered

def stem_words(tokenized_text: list[str]) -> list[str]:
    stemmer = WordNetLemmatizer()
    output = [stemmer.lemmatize(text) for text in tokenized_text]
    return output


In [10]:
def preprocessing_stage(text):
    _lowered = lower_text(text)
    _without_punct = remove_punctuation(_lowered)
    _single_spaced = remove_multiple_spaces(_without_punct)
    _tokenized = tokenize_text(_single_spaced)
#     _without_sw = remove_stop_words(_tokenized)
    _stemmed = stem_words(_tokenized)
    _stemmed = ' '.join(_stemmed)
    
    return _stemmed

def clean_text_inplace(df):
    df['Sentence'] = df['Sentence'].apply(preprocessing_stage)
    return df

In [11]:
nltk.download("wordnet")
nltk.download("omw-1.4")
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


data = clean_text_inplace(data)
data.head()

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,thousand of demonstrator have marched through ...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-GEO, O, O, O, O, O, B-GEO..."
1,Sentence: 2,family of soldier killed in the conflict joine...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,they marched from the house of parliament to a...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, I-GEO..."
3,Sentence: 4,police put the number of marcher at 10000 whil...,"[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,the protest come on the eve of the annual conf...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-GEO, O, O,..."


In [12]:
df_final = data[['Sentence','Tag']]

df_train, df_test = train_test_split(df_final,test_size=0.2,random_state=42)
len(df_train), len(df_test)

(38367, 9592)

# Import model libraries and Make RNN model

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping

In [14]:
train_targets = list(df_train.Tag.values)
test_targets = list(df_test.Tag.values)

tokenizer = Tokenizer(lower=True,oov_token="UNK")
tokenizer.fit_on_texts(df_train['Sentence'])

train_inputs = tokenizer.texts_to_sequences(df_train['Sentence'])
test_inputs = tokenizer.texts_to_sequences(df_test['Sentence'])

In [15]:
word2idx = tokenizer.word_index
V = len(word2idx) # Vocab size
print("Found %s unique tokens "%V)

Found 26047 unique tokens 


In [16]:
train_tags = set([val for sublist in train_targets for val in sublist])
test_tags = set([val for sublist in test_targets for val in sublist])

print("Unique NER tags in train set: ",train_tags)
print("Unique NER tags in test set: ",test_tags)

Unique NER tags in train set:  {'B-PER', 'B-GEO', 'B-ART', 'B-ORG', 'I-ORG', 'B-GPE', 'B-NAT', 'I-ART', 'B-TIM', 'I-TIM', 'B-EVE', 'I-EVE', 'I-NAT', 'O', 'I-GEO', 'I-GPE', 'I-PER'}
Unique NER tags in test set:  {'B-PER', 'B-GEO', 'B-ART', 'B-ORG', 'I-ORG', 'B-GPE', 'B-NAT', 'I-ART', 'B-TIM', 'I-TIM', 'I-GPE', 'B-EVE', 'I-EVE', 'O', 'I-GEO', 'I-NAT', 'I-PER'}


In [17]:
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_targets)
train_tgt_int = tag_tokenizer.texts_to_sequences(train_targets)
test_tgt_int = tag_tokenizer.texts_to_sequences(test_targets)

In [18]:
# Max length
max_length_train = max(len(sent) for sent in train_inputs)
max_length_test = max(len(sent) for sent in test_inputs)
max_length = max(max_length_train,max_length_test)

# Pad input sequences
train_inputs_final = pad_sequences(train_inputs, maxlen=max_length, padding="post")
print("Shape of train inputs: ",train_inputs_final.shape)

test_inputs_final = pad_sequences(test_inputs, maxlen=max_length, padding="post")
print("Shape of test inputs: ",test_inputs_final.shape)

train_targets_final = pad_sequences(train_tgt_int, maxlen=max_length, padding="post")
print("Shape of train targets: ",train_targets_final.shape)

test_targets_final = pad_sequences(test_tgt_int, maxlen=max_length, padding="post")
print("Shape of test targets: ",test_targets_final.shape)

Shape of train inputs:  (38367, 77)
Shape of test inputs:  (9592, 77)
Shape of train targets:  (38367, 77)
Shape of test targets:  (9592, 77)


In [19]:
# Number of classes

K = len(tag_tokenizer.word_index)  +1
K

18

In [20]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dropout, LSTM, TimeDistributed, Dense, Bidirectional
from tensorflow.keras.models import Model

# Create a MirroredStrategy for multi-GPU support
strategy = tf.distribute.MirroredStrategy()

# Define the model inside the strategy scope
with strategy.scope():
    vector_size = 128

    i = Input(shape=(max_length,))
    x = Embedding(input_dim=V+1, output_dim=vector_size, mask_zero=True)(i)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(256, return_sequences=True, recurrent_dropout=0.2))(x)
    x = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.2))(x)
    x = TimeDistributed(Dense(K, activation='softmax'))(x)

    model = Model(i, x)
    model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 77)]              0         
                                                                 
 embedding (Embedding)       (None, 77, 128)           3334144   
                                                                 
 dropout (Dropout)           (None, 77, 128)           0         
                                                                 
 bidirectional (Bidirection  (None, 77, 512)           788480    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 77, 256)           656384    
 onal)                                                           
                                                                 
 time_distributed (TimeDist  (None, 77, 18)            4626  

In [21]:
import os
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.utils.class_weight import compute_class_weight

# Set the visible GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Change this to the GPU IDs you want to use

# Limit GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

# Create data pipelines
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs_final, train_targets_final))
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs_final, test_targets_final))

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
lr_scheduler = LearningRateScheduler(lambda epoch: 0.001 * 0.9 ** epoch)

# Compile the model inside the strategy scope
with strategy.scope():
    model.compile(optimizer="adam",
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=["accuracy"])

# Fit the model
model.fit(train_dataset.batch(32),  # Adjust the batch size based on your GPU memory
          epochs=5,
          validation_data=test_dataset.batch(32),
          callbacks=[early_stopping, lr_scheduler])

# Save the model
model.save('ner_model.h5')


Physical devices cannot be modified after being initialized
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


In [22]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
sentence = "Polish Prime Minister Jaroslaw Kaczynski has voiced support for the deployment of 10 U.S. missile interceptors in Poland and guidance technology in the Czech Republic ."
sentence = preprocessing_stage(sentence)
predictions = model.predict(pad_sequences(tokenizer.texts_to_sequences([sentence]),
                                          maxlen=max_length,
                                         padding="post"))
predictions



array([[[1.0427975e-10, 1.0000000e+00, 2.9627755e-11, ...,
         2.7311967e-11, 5.9637273e-12, 1.5243216e-10],
        [4.3046647e-12, 1.0000000e+00, 1.1069937e-12, ...,
         1.1971145e-12, 1.6909662e-13, 7.2540047e-12],
        [2.5362692e-12, 1.0000000e+00, 6.4361217e-13, ...,
         7.1327032e-13, 9.3871010e-14, 4.4317982e-12],
        ...,
        [5.5214286e-02, 5.8033735e-02, 5.5201918e-02, ...,
         5.5236496e-02, 5.5185113e-02, 5.5247698e-02],
        [5.5214286e-02, 5.8033735e-02, 5.5201918e-02, ...,
         5.5236496e-02, 5.5185113e-02, 5.5247698e-02],
        [5.5214286e-02, 5.8033735e-02, 5.5201918e-02, ...,
         5.5236496e-02, 5.5185113e-02, 5.5247698e-02]]], dtype=float32)

In [24]:
prediction_ner = np.argmax(predictions,axis=-1)
prediction_ner

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [25]:
NER_tags = [tag_tokenizer.index_word[num] for num in list(prediction_ner.flatten())]
NER_tags[:len(tokenizer.texts_to_sequences([sentence])[0])], sentence

(['o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o',
  'o'],
 'polish prime minister jaroslaw kaczynski ha voiced support for the deployment of 10 u missile interceptor in poland and guidance technology in the czech republic')