## Named Entity Recognition

In [13]:
import zipfile
import pandas as pd
import numpy as np
import ast
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

data = pd.read_csv('/content/ner.csv')

In [21]:
data

Unnamed: 0,Sentence #,Sentence,POS,Tag,Tag_encoded
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[16, 16, 16, 16, 16, 16, 2, 16, 16, 16, 16, 16..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 1..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,...","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2..."
...,...,...,...,...,...
47954,Sentence: 47955,Indian border security forces are accusing the...,"['JJ', 'NN', 'NN', 'NNS', 'VBP', 'VBG', 'PRP$'...","[B-gpe, O, O, O, O, O, O, B-gpe, O, O, O, O, O...","[3, 16, 16, 16, 16, 16, 16, 3, 16, 16, 16, 16,..."
47955,Sentence: 47956,Indian officials said no one was injured in Sa...,"['JJ', 'NNS', 'VBD', 'DT', 'NN', 'VBD', 'VBN',...","[B-gpe, O, O, O, O, O, O, O, B-tim, O, O, O, O...","[3, 16, 16, 16, 16, 16, 16, 16, 7, 16, 16, 16,..."
47956,Sentence: 47957,Two more landed in fields belonging to a nearb...,"['CD', 'JJR', 'VBD', 'IN', 'NNS', 'VBG', 'TO',...","[O, O, O, O, O, O, O, O, O, O, O]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"
47957,Sentence: 47958,They say not all of the rockets exploded upon ...,"['PRP', 'VBP', 'RB', 'DT', 'IN', 'DT', 'NNS', ...","[O, O, O, O, O, O, O, O, O, O, O]","[16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]"


## Data Preprocessing

In [14]:

x = data['Sentence']
x = [str(text) for text in x if text is not None]
tk = Tokenizer()
tk.fit_on_texts(x)
x = tk.texts_to_sequences(x)


if isinstance(data['Tag'].iloc[0], str):
    data['Tag'] = data['Tag'].apply(ast.literal_eval)

all_tags = [tag for sublist in data['Tag'] for tag in sublist]
le = LabelEncoder()
le.fit(all_tags)

def encode_tags(tags):
    return [le.transform([tag])[0] for tag in tags]

data['Tag_encoded'] = data['Tag'].apply(encode_tags)

max_len_x = max(len(seq) for seq in x)
max_len_y = max(len(seq) for seq in data['Tag_encoded'])
max_len = max(max_len_x, max_len_y)


x = pad_sequences(x, maxlen=max_len, padding='post')
y = pad_sequences(data['Tag_encoded'], maxlen=max_len, padding='post')
y = np.array(y).astype(np.int32)

x, x_test, y, y_test = train_test_split(x, y, test_size=0.1, random_state=2)


## Model Building

In [15]:
input_shape = (x.shape[1],)
vocab_size = len(tk.word_index) + 1
embedding_dim = 128
hidden_dim = 64
output_dim =len(le.classes_)


input = Input(shape=input_shape)
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input)
bilstm = Bidirectional(LSTM(units=hidden_dim, return_sequences=True))(embedding)
output = Dense(output_dim, activation='softmax')(bilstm)


model = Model(inputs=input, outputs=output)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


model.summary()




Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 104)]             0         
                                                                 
 embedding (Embedding)       (None, 104, 128)          3578112   
                                                                 
 bidirectional (Bidirection  (None, 104, 128)          98816     
 al)                                                             
                                                                 
 dense (Dense)               (None, 104, 17)           2193      
                                                                 
Total params: 3679121 (14.03 MB)
Trainable params: 3679121 (14.03 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
model.fit(x, y, batch_size=32,validation_data=(x_test,y_test), epochs=2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7a83a4c39ae0>

## Prediction

In [25]:

def my_zip(*args):
    return zip(*args)


def predict_entities(sentence):

    tokenized_sentence = tk.texts_to_sequences([sentence])

    padded_sentence = pad_sequences(tokenized_sentence, maxlen=max_len, padding='post')

    predictions = model.predict(padded_sentence)

    predicted_labels = np.argmax(predictions, axis=-1)

    decoded_labels = le.inverse_transform(predicted_labels[0])

    words = sentence.split()
    labels = decoded_labels[:len(words)]
    return [(word, label) for word, label in my_zip(words, labels)]

test_sentence = "Ali was born in pakistan"
predicted_entities = predict_entities(test_sentence)
print(predicted_entities)


[('Ali', 'B-per'), ('was', 'O'), ('born', 'O'), ('in', 'O'), ('pakistan', 'B-geo')]


## Report

In [26]:
from sklearn.metrics import classification_report
from itertools import chain
y_pred = model.predict(x_test)

y_pred = np.argmax(y_pred, axis=-1)

y_pred_decoded = [le.inverse_transform(y) for y in y_pred]
y_test_decoded = [le.inverse_transform(y) for y in y_test]

y_pred_flat = list(chain.from_iterable(y_pred_decoded))
y_test_flat = list(chain.from_iterable(y_test_decoded))


report = classification_report(y_test_flat, y_pred_flat)
print(report)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-art       1.00      1.00      1.00    394419
       B-eve       0.00      0.00      0.00        38
       B-geo       0.67      0.49      0.57      3794
       B-gpe       0.80      0.61      0.69      1622
       B-nat       0.00      0.00      0.00        19
       B-org       0.60      0.41      0.49      1913
       B-per       0.69      0.52      0.59      1667
       B-tim       0.71      0.52      0.60      2028
       I-art       0.00      0.00      0.00        42
       I-eve       0.00      0.00      0.00        33
       I-geo       0.57      0.40      0.47       751
       I-gpe       1.00      0.09      0.17        11
       I-nat       0.00      0.00      0.00         4
       I-org       0.52      0.52      0.52      1602
       I-per       0.69      0.67      0.68      1679
       I-tim       0.57      0.40      0.47       660
           O       0.92      0.96      0.94     88502

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
