In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense,TimeDistributed, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:

#Sample training sentence (tokenized manually)
#These are example sentences for tarining
#Each sentence is tokenized (split into words)
train_sentences=[
    ["john","lives","in","New","York"],
    ["Alice","is","from","Paris"],
    ["Berlin","is","the","capital","of","Germany"]
]

#Corresponding NER tags for training data'
#Corresponding NER (Named Entity Recognition) tags for each word.
#B-PER : Beginning of a person entity
#B-Loc: Beginning of a location entity
#O : Outside, meaning no entity

train_ner_tags = [
    ["B-PER", "O", "O", "B-LOC", "B-LOC"],
    ["B-PER", "O", "O", "B-LOC"],
    ["B-LOC", "O", "O", "O", "O","B-LOC"]
]


#Vocabulary and tag mappings(for tokenization)

vocab={
    "john":1,"lives":2,"in":3,"New":4,"York":5,"Alice":6,"is":7,
    "from":8,"Paris":9,"Berlin":10,"the":11,"capital":12,"of":13,"Germany":14
}
tags={"O":0,"B-LOC":1,"B-PER":2}
#The vocab dictionary maps eaxh word in the training sentences to a unique integer.
#This is needed because the neural network dosen't operate on words but on umeric values
#The tags dictionary maps each NER tag to the Integer (0 for o,1 for B-LOC,2 for B-PER).


In [3]:
#Tokenize the sentences and labels
tokenized_train_sentences=[[vocab[word] for word in sentence] for sentence in train_sentences]
tokenized_train_ner_tags=[[tags[tag] for tag in ner] for ner in train_ner_tags]
#This converts the Training sentences and NER tags into lists of intergers, using the mapping from vocab and tags
#Sentences:["john","lives","in","New","York"] because [1,2,3,4,5].

In [4]:
#Extend vocabulary to include new test words
vocab.update({
    "Mary":15,"visited":16,"London":17,"Tom":18,"moved":19,"to":20,"statue":21
})

#Here, we  extend the vocabulary to inclued additional words from the test sentence.
# for example," mary" is mapped to 15, "london" to 17 etc.

In [5]:
#Tokenize the test sentences
test_sentences=[
    ["Marry","visisted","London"],
    ["Tom","moved","to","Berlin"],
    ["The","statue","is","in","Paris"]
]
tokenized_train_sentences=[[vocab.get(word,0) for word in sentence] for sentence in test_sentences]
#The 0 in vocab.get(word, 0) serves as the default value returned
#if a word is not found in the vocab dictionary. Here's how it works:
#vocab.get(word, 0) looks up word in the vocab dictionary.
#If the word exists in vocab, it returns its corresponding value (likely a token or index).
#If the word does not exist in vocab, it returns e as a fallback.

In [6]:
#Parameters
vocab_size=len(vocab)+1 # Updated vocab size to account for all words in the vocab
embedding_dim=64 #Dimension of embedding vectors
n_tags=len(tags) #Number of entity tags
max_len=6 #Max sentence length (after padding)

# vocab size: Total number of unique words in the vocabulary (plus 1 for padding).
# embedding din: The size of word embeddings (vectors representing each word).
# n_tags: The number of MER tags (O, B-LOC, B-PER).
# max_len: Maximun sentence length for padding.
# Padding training and test sequences

tokenized_train_sentences=pad_sequences(tokenized_train_sentences,maxlen=max_len,padding="post")
tokenized_train_ner_tags=pad_sequences(tokenized_train_ner_tags,maxlen=max_len,padding="post")
tokenized_test_sentences=pad_sequences(tokenized_train_sentences,maxlen=max_len,padding="post")

In [7]:
#Split the Training data (Train/Test split)
X_train,X_test,y_train,y_test=train_test_split(tokenized_train_sentences,tokenized_train_ner_tags,test_size=0.2)

In [8]:
#Model Creation
model=Sequential()

In [9]:
# Embedding layer adjusted to the current vocab_size
model.add(Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_len))



In [10]:
model.add(Bidirectional(GRU(units=embedding_dim,return_sequences=True)))  #Bidirectional GRU
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
#Optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [11]:
#Compile Model
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])

In [12]:
#Train the model
history=model.fit(X_train,y_train,batch_size=32,epochs=10,validation_split=0.2)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - accuracy: 0.3333 - loss: 1.0931 - val_accuracy: 0.1667 - val_loss: 1.0964
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.8333 - loss: 1.0829 - val_accuracy: 0.6667 - val_loss: 1.0874
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.5000 - loss: 1.0729 - val_accuracy: 0.6667 - val_loss: 1.0785
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.5000 - loss: 1.0630 - val_accuracy: 0.6667 - val_loss: 1.0695
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.5000 - loss: 1.0531 - val_accuracy: 0.6667 - val_loss: 1.0605
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.5000 - loss: 1.0432 - val_accuracy: 0.6667 - val_loss: 1.0513
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━

In [14]:
#Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.6667 - loss: 0.9749
Test Loss: 0.9748923778533936, Test Accuracy: 0.6666666865348816


In [15]:
#Predict on new test sentences
predictions=model.predict(tokenized_test_sentences)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step


In [16]:
# Decoding the Predictions back to NER tags
reverse_tags={i:tag for tag,i in tags.items()}

'''
purpose of reverse_tags:
Original tags dictionary: This typically maps tags(e.g, labels, classes)
to unique numerical IDs or indices.
Example: tags = {"NOUN":1,"VERB":2,"ADJECTIVE":3}
Reversed reverese_tags dictionary: The reverse dictionary swaps the roles,
mapping the numerical ID's for indices back to their corresponding tags.
Example: reverse_tags={1:"NOUN",2:"VERB",3:"ADJECTIVE"}
'''

'\npurpose of reverse_tags:\nOriginal tags dictionary: This typically maps tags(e.g, labels, classes)\nto unique numerical IDs or indices.\nExample: tags = {"NOUN":1,"VERB":2,"ADJECTIVE":3}\nReversed reverese_tags dictionary: The reverse dictionary swaps the roles,\nmapping the numerical ID\'s for indices back to their corresponding tags.\nExample: reverse_tags={1:"NOUN",2:"VERB",3:"ADJECTIVE"}\n'

In [17]:
def decode_predictions(preds, max_len):
    decoded_predictions=[]
    #This list will store the decoded predictions for all sentences.
    #Iterating over each prediction (pred) for every token in each sentence

    for pred in preds:
        decoded_Sentence=[reverse_tags[np.argmax(p)] for p in pred]
        #pred represents the list of predicted probabilities for a token.
#For each token p, we apply np.argmax(p) to get the index of the highest probability,
#This tells us which NEll tag has the highest probability.
#For example, if the probabilities for a token are [8.1, 0.8, 0.1]
#then np.argmax(p) returns 1 (because 6.8 is the highest value),
#which corresponds to the tag B-LOC (fros reverse_tags).
#We use reverse_tags[np.argmax(p) to look up the actual tag (like "0", "B-LOC", or "B-PER")
# based on the index returned by np.argmax(p)
        decoded_predictions.append(decoded_Sentence)
    return decoded_predictions
#Suppose the preds for one sentence look like this (simplified to one token per sentence for clar
#prods - [
#[10.2, 0.7, 8.1], [8.9, 0.05, 0.05], [0.1, 0.2, 0.7]] #Probabilities for three tokens
# ]
#The model output probabilities for three tokens (one for each possible NER tag):
#For token 1: [0.2, 0.7, 0.1] highest probability is at index 1 -> B-LOC.
#For token 2: 10.9, 0.05, 0.05] highest probability is at index 0 ->  O.
#For token 3: [0.1, 0.2, 0.7] highest probability is at Index 2 -> B-PER.
#After decoding, you get
#Decoded_sentence =['B-LOC','O','B-PER']

In [18]:
# Decoding prediction
decode_predictions=decode_predictions(predictions,max_len)

In [19]:
# Show test sentences with predicted tags
for sentence, pred_tags in zip(test_sentences, decode_predictions):
    print(f"Sentence: {' '.join(sentence)}")
    print(f"Predicted NER Tags: {pred_tags}")
    print()

Sentence: Marry visisted London
Predicted NER Tags: ['O', 'O', 'O', 'O', 'O', 'O']

Sentence: Tom moved to Berlin
Predicted NER Tags: ['O', 'O', 'O', 'O', 'O', 'O']

Sentence: The statue is in Paris
Predicted NER Tags: ['O', 'O', 'O', 'O', 'O', 'O']

