In [None]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard~=2.20.0->tensorflow)
  Downloading tensorboard_data_server-0.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow
from sklearn.model_selection import train_test_split
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
from numpy.random import seed



In [None]:
# Read the token-level NER corpus; malformed rows are dropped rather than raising.
data = pd.read_csv(
    "/content/ner_dataset.csv",
    encoding="latin",
    on_bad_lines='skip',
)
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
def get_dict(data,token_or_tag):
  """Build value<->index lookup tables for the words or the tags of `data`.

  Args:
    data: DataFrame with a 'Word' and a 'Tag' column.
    token_or_tag: 'token' selects the 'Word' column; any other value
      selects the 'Tag' column.

  Returns:
    A `(tok2idx, idx2tok)` pair of dicts mapping each distinct value to an
    integer id and back.
  """
  column = 'Word' if token_or_tag == 'token' else 'Tag'
  # Series.unique() keeps first-appearance order, so the ids are identical on
  # every run. Iterating a set of strings is not stable across interpreter
  # restarts, which made the id assignment (and everything trained on it)
  # non-reproducible.
  vocab = data[column].unique().tolist()
  idx2tok = dict(enumerate(vocab))
  tok2idx = {tok: idx for idx, tok in idx2tok.items()}
  return tok2idx, idx2tok

# Lookup tables for the 'Word' column and for the 'Tag' column.
token2idx, idx2token = get_dict(data, token_or_tag='token')
tag2idx, idx2tag = get_dict(data, token_or_tag='tag')

In [None]:
# Encode every word and tag as its integer id (adds two columns to `data`).
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

# Forward-fill missing cells so each row carries the most recent non-null
# 'Sentence #' above it. DataFrame.ffill replaces the deprecated
# fillna(method='ffill') spelling and, like it, fills every column.
data_fillna = data.ffill(axis=0)

# Collapse the token rows into one row per sentence, with list-valued columns.
data_group = (
    data_fillna
    .groupby(['Sentence #'], as_index=False)[['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx']]
    .agg(list)
)
data.head()

  data_fillna = data.fillna(method='ffill', axis=0)


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,4412,2
1,,of,IN,O,17195,2
2,,demonstrators,NNS,O,17312,2
3,,have,VBP,O,19277,2
4,,marched,VBN,O,5234,2


In [None]:
def get_pad_train_test_val(data_group, data):
    """Pad the encoded sentences, one-hot encode the tags and split the data.

    Args:
        data_group: per-sentence frame whose 'Word_idx' and 'Tag_idx' columns
            hold lists of integer ids.
        data: token-level frame; its 'Word' column determines the padding id.

    Returns:
        `(train_tokens, val_tokens, test_tokens, train_tags, val_tags,
        test_tags)`. 10% of the sentences go to the test split and the
        remainder is split 75/25 into train and validation.

    Relies on the module-level `tag2idx` mapping for the tag padding id and
    the number of classes.
    """
    n_token = len(set(data['Word'].to_list()))

    # Pad every sentence to the length of the longest one.
    # NOTE(review): the padding id `n_token - 1` looks like it is also the id
    # of a real vocabulary word, so padding and that word share an embedding.
    # Left unchanged because anything encoding inputs for a saved model must
    # pad the same way - confirm before switching to a dedicated id.
    tokens = data_group['Word_idx'].tolist()
    maxlen = max(len(s) for s in tokens)
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token - 1)

    # Tags are padded with the id of "O" and then one-hot encoded per sentence.
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    # Fixed random_state keeps both splits reproducible.
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_, tags_, test_size=0.25, train_size=0.75, random_state=2020)

    # The second line used to repeat the token count under the wrong label;
    # it now reports the tag split.
    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tags length:', len(train_tags),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

# Materialise the padded train / validation / test splits.
(
    train_tokens, val_tokens, test_tokens,
    train_tags, val_tags, test_tags,
) = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


In [None]:
# Fix TensorFlow's and NumPy's global random seeds before building the model.
tensorflow.random.set_seed(2)
seed(2)

In [None]:
# Model hyperparameters read by the model-building function.
n_tags = len(tag2idx)  # one output class per distinct tag
output_dim = 64  # embedding width; also used as the LSTM unit count
input_length = max(len(sentence) for sentence in data_group['Word_idx'].tolist())  # longest sentence
input_dim = len(set(data['Word'].to_list())) + 1  # distinct words plus one extra index

In [None]:
# NOTE(review): this cell repeats the assignments of the cell directly above
# it and looks redundant - confirm and delete one of the two.
input_dim = 1 + len(set(data['Word'].to_list()))
output_dim = 64
input_length = max(map(len, data_group['Word_idx'].tolist()))
n_tags = len(tag2idx)

In [None]:
def get_bilstm_lstm_model():
    """Build and compile the Embedding -> BiLSTM -> LSTM -> per-token Dense tagger.

    Reads the module-level hyperparameters `input_dim`, `output_dim`,
    `input_length` and `n_tags`.

    Returns:
        The compiled `Sequential` model. Its summary is printed as a side effect.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input layer instead of the
    # deprecated `input_length` argument of Embedding; the model is then built
    # before summary() runs.
    model.add(Input(shape=(input_length,), dtype="int32"))
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim))
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
    # categorical_crossentropy expects a probability distribution over the
    # tags at every timestep, so the output activation must be softmax. The
    # previous ReLU output is unnormalised and can be zero for every class.
    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [None]:
def train_model(X, y, model, epochs=25, batch_size=1000, validation_split=0.2):
    """Fit `model` on `(X, y)` and return the training loss of every epoch.

    Args:
        X: model inputs, passed straight to `model.fit`.
        y: training targets, passed straight to `model.fit`.
        model: object exposing a Keras-style `fit` method.
        epochs: number of training epochs (default 25, the previous fixed value).
        batch_size: samples per gradient update (default 1000).
        validation_split: fraction of `(X, y)` held out by `fit` for validation
            (default 0.2).

    Returns:
        A list with one training-loss value per epoch.
    """
    # A single fit() call replaces the former loop of one-epoch fits: it
    # yields the same per-epoch loss list without restarting fit() each epoch.
    hist = model.fit(
        X,
        y,
        batch_size=batch_size,
        verbose=1,
        epochs=epochs,
        validation_split=validation_split,
    )
    return list(hist.history['loss'])

In [None]:
# Build the tagger, draw its architecture, then train it and keep the
# per-epoch training loss in `results`.
model_bilstm_lstm = get_bilstm_lstm_model()
model_bilstm_lstm.build((None, input_length))
plot_model(model_bilstm_lstm)
results = pd.DataFrame({
    'with_add_lstm': train_model(train_tokens, np.array(train_tags), model_bilstm_lstm),
})



You must install pydot (`pip install pydot`) for `plot_model` to work.
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 658ms/step - accuracy: 0.8293 - loss: 1.6391 - val_accuracy: 0.9681 - val_loss: 0.4473
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 577ms/step - accuracy: 0.9676 - loss: 0.4214 - val_accuracy: 0.9681 - val_loss: 0.3540
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 570ms/step - accuracy: 0.9677 - loss: 0.3577 - val_accuracy: 0.9681 - val_loss: 0.2927
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 562ms/step - accuracy: 0.9677 - loss: 0.3301 - val_accuracy: 0.9681 - val_loss: 0.2808
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 552ms/step - accuracy: 0.9677 - loss: 0.3024 - val_accuracy: 0.9681 - val_loss: 0.2180
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 553ms/step - accuracy: 0.9677 - loss: 0.2536 - val_accuracy: 0.9682 - val_loss: 0.2021
[1m26/26[0m [3

In [None]:
# Interactive demo: highlight the entities spaCy's pretrained pipeline finds
# in the text typed by the user.
# Load the pipeline once; it used to be reloaded on every iteration.
nlp = spacy.load('en_core_web_sm')
while True:
  try:
    raw_text = input("Enter text: ")
  except (KeyboardInterrupt, EOFError):
    # Interrupting the prompt ends the demo cleanly instead of raising.
    break
  if not raw_text.strip():
    # An empty line also ends the demo, so the cells below can still run.
    break
  text = nlp(raw_text)
  displacy.render(text, style = 'ent', jupyter=True)

Enter text: hey i am sarvesh


Enter text: i am Loop


Enter text: my name is Loop


Enter text: i am working at Google


Enter text: i am the ceo of LOOP-info.tech


KeyboardInterrupt: Interrupted by user

In [None]:
import pickle

# Persist the trained model together with the lookup tables and the padded
# sequence length written alongside it.
model_bilstm_lstm.save('ner_model.h5')

for filename, artifact in (
    ('token2idx.pkl', token2idx),
    ('tag2idx.pkl', tag2idx),
    ('maxlen.pkl', input_length),
):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

