In [55]:
import json

with open("C:/Users/Nitin Mishra/Downloads/expanded_dataset json format.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Sample Data:", data[:2]) 

Sample Data: [{'text': 'Amount:I wish to withdraw 24036 rupees', 'entities': [{'start': 26, 'end': 31, 'label': 'Amount'}]}, {'text': 'Amount:I have to withdraw 83810 rupees', 'entities': [{'start': 26, 'end': 31, 'label': 'Amount'}]}]


In [56]:
len(data)

22169

In [57]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

In [58]:
# Tokenization and entity label mapping
words = set()
tags = {"O": 0, "Name": 1, "Account Number": 2, "Phone Number": 3, "Amount": 4}

In [59]:
def preprocess_data(data):
    sentences = []
    labels = []
    
    for item in data:
        text = item["text"].split()
        label_seq = ["O"] * len(text)
        
        for entity in item["entities"]:
            entity_words = text[entity["start"]: entity["end"]]  # Adjust index if needed
            for i, word in enumerate(entity_words):
                label_seq[text.index(word)] = entity["label"]
        
        words.update(text)
        sentences.append(text)
        labels.append([tags[tag] for tag in label_seq])
    
    return sentences, labels

In [60]:
sentences, labels = preprocess_data(data)
word2idx = {w: i + 1 for i, w in enumerate(words)}
idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: tag for tag, i in tags.items()}

In [61]:
# Save word2idx and idx2tag after training
with open("word2idx2.json", "w") as f:
    json.dump(word2idx, f)

with open("idx2tag2.json", "w") as f:
    json.dump(idx2tag, f)

In [62]:
# Padding sequences
max_len = max(len(s) for s in sentences)
X = pad_sequences([[word2idx[w] for w in s] for s in sentences], maxlen=max_len, padding='post')
y = pad_sequences(labels, maxlen=max_len, padding='post')

In [63]:
max_len

12

In [64]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
from collections import Counter

all_labels = [tag for sample in y for tag in sample]
label_counts = Counter(all_labels)

print("Label Distribution:", label_counts)

Label Distribution: Counter({np.int32(0): 266028})


In [66]:
X_train.shape

(17735, 12)

In [67]:
y_test.shape

(4434, 12)

In [68]:
# Model definition
from tensorflow.keras.layers import Dropout

input_layer = Input(shape=(max_len,))
embedding = Embedding(input_dim=len(word2idx) + 1, output_dim=64, input_length=max_len)(input_layer)
bi_lstm = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(embedding)
out = TimeDistributed(Dense(len(tags), activation="softmax"))(bi_lstm)

model = Model(input_layer, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [None]:
# model = Model(input_layer, out)
# model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [69]:
# Training the model
model.fit(X_train, y_train, batch_size=8, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
[1m2217/2217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 63ms/step - accuracy: 0.9959 - loss: 0.0545 - val_accuracy: 1.0000 - val_loss: 6.3642e-06
Epoch 2/10
[1m2217/2217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 95ms/step - accuracy: 1.0000 - loss: 4.9440e-06 - val_accuracy: 1.0000 - val_loss: 1.9173e-06
Epoch 3/10
[1m2217/2217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 71ms/step - accuracy: 1.0000 - loss: 1.6067e-06 - val_accuracy: 1.0000 - val_loss: 8.4674e-07
Epoch 4/10
[1m2217/2217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 80ms/step - accuracy: 1.0000 - loss: 7.5294e-07 - val_accuracy: 1.0000 - val_loss: 5.2401e-07
Epoch 5/10
[1m   8/2217[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:12[0m 60ms/step - accuracy: 1.0000 - loss: 5.2326e-07

KeyboardInterrupt: 

In [32]:
# Save the model
model.save("NER_Model/bilstm_ner1.keras")
print("Model training complete and saved as bilstm_ner1.keras")


Model training complete and saved as bilstm_ner1.keras


## Importing saved Model

In [33]:
import numpy as np
import json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# Load the model using the recommended format
model = load_model("NER_Model/bilstm_ner1.keras")

In [43]:
with open("NER_Model/word2idx1.json", "r") as f:
    word2idx = json.load(f)

with open("NER_Model/idx2tag1.json", "r") as f:
    idx2tag = json.load(f)

In [44]:
# Ensure indices are converted from string keys (JSON saves dict keys as strings)
word2idx = {k: int(v) for k, v in word2idx.items()}
idx2tag = {int(k): v for k, v in idx2tag.items()}

In [45]:
# Set maximum sequence length (same as used in training)
max_len = 12  # Adjust based on training data

In [46]:
def predict_entities(sentence):
    words = sentence.split()
    print("Words:", words)

    seq = pad_sequences([[word2idx.get(w, 0) for w in words]], maxlen=max_len, padding='post')
    print("Padded sequence:", seq)

    pred = model.predict(seq)
    print("Raw predictions:", pred)

    pred = np.argmax(pred, axis=-1)
    print("Predicted indices:", pred)

    entities = []
    for i, word in enumerate(words):
        tag = idx2tag.get(pred[0][i], "O")  # Ensure index exists in idx2tag
        if tag != "O":
            entities.append((word, tag))

    return entities

In [47]:
# Example usage
sentence = "Name:My name is Nitin Kumar Mshra"
predicted_entities = predict_entities(sentence)
print("Predicted Entities:", predicted_entities)

Words: ['Name:My', 'name', 'is', 'Nitin', 'Kumar', 'Mshra']
Padded sequence: [[ 912 4047 2604    0    0    0    0    0    0    0    0    0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Raw predictions: [[[9.30110931e-01 7.78599596e-03 5.62981986e-05 1.06722828e-05
   6.20361716e-02]
  [6.04669452e-01 1.07440956e-01 3.24686943e-03 1.74729188e-03
   2.82895446e-01]
  [9.73670423e-01 1.01222144e-02 3.43416585e-03 2.69144913e-03
   1.00817652e-02]
  [1.00000000e+00 3.29657031e-11 8.38076386e-10 6.40503189e-12
   5.73187296e-11]
  [1.00000000e+00 4.77501665e-13 5.75845725e-11 3.63407573e-13
   3.74960534e-13]
  [1.00000000e+00 1.01459280e-13 2.68067044e-11 1.96413835e-13
   1.15777573e-13]
  [1.00000000e+00 6.34980146e-14 1.99692138e-11 1.58914914e-13
   7.04198120e-14]
  [1.00000000e+00 5.45231178e-14 1.80912646e-11 1.50943101e-13
   5.93549867e-14]
  [1.00000000e+00 5.16940684e-14 1.76651714e-11 1.51354786e-13
   5.48671724e-14]
  [1.00000000e+00 5.67603182e-14 1

In [54]:
# Example predictions
test_sentences = [
    "Amount:I wish to withdraw 24036 rupees",
    "Amount:I have to withdraw 83810 rupees",
    "Name:If you're wondering, I'm Chaturbhuj Sundararajan",
    "Name:If you're wondering, I'm Chaturbhuj Ramesh"
]

for sentence in test_sentences:
    print(f"Sentence: {sentence}")
    print("Predicted Entities:", predict_entities(sentence))
    print("-" * 50)

Sentence: Amount:I wish to withdraw 24036 rupees
Words: ['Amount:I', 'wish', 'to', 'withdraw', '24036', 'rupees']
Padded sequence: [[3406 5435 4927 2891 5106 5119    0    0    0    0    0    0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
Raw predictions: [[[9.99995708e-01 1.39884037e-07 7.84539154e-08 6.35270991e-09
   4.10617531e-06]
  [9.99802530e-01 1.01882715e-05 1.23370489e-06 8.93285517e-08
   1.86051664e-04]
  [9.88624930e-01 2.58469093e-03 8.32499518e-06 6.39509949e-07
   8.78144708e-03]
  [7.53185749e-01 8.14223289e-02 3.92979186e-04 5.90899217e-05
   1.64939851e-01]
  [9.94273841e-01 3.04160430e-03 4.67890386e-05 4.66243864e-06
   2.63301493e-03]
  [9.25597310e-01 4.29364629e-02 1.11385332e-02 7.90948328e-03
   1.24181667e-02]
  [1.00000000e+00 3.45438747e-11 1.08424947e-09 7.14827373e-12
   6.62479654e-11]
  [1.00000000e+00 4.72789126e-13 5.90765561e-11 3.73413919e-13
   3.59759260e-13]
  [1.00000000e+00 9.99745466e-14 2.72053335e-11 2.02291851e-

In [None]:
# Initialize lists
sentences = []
labels = []

# Process JSON data
# for entry in data:
#     text = entry["text"]
#     words = nltk.word_tokenize(text)  # Tokenize sentence
#     sentence_labels = ["O"] * len(words)  # Default label "O" (no entity)
    
#     # Process entities
#     for entity in entry["entities"]:
#         start, end, label = entity["start"], entity["end"], entity["label"]
        
#         entity_words = nltk.word_tokenize(text[start:end])  # Extract entity words
#         entity_found = False  # Track first word of the entity

#         for i, word in enumerate(words):
#             if word in entity_words:
#                 if not entity_found:
#                     sentence_labels[i] = f"B-{label}"  # First word of entity
#                     entity_found = True
#                 else:
#                     sentence_labels[i] = f"I-{label}"  # Inside the entity

    
#     sentences.append(words)
#     labels.append(sentence_labels)

# # Print results
# print("Tokenized Sentences:", sentences[:2])
# print("Entity Labels:", labels[:2])

Tokenized Sentences: [['You', 'can', 'reach', 'me', 'at', '0646843308'], ['Let', 'me', 'give', 'you', 'my', 'account', 'number', ':', '617547715491']]
Entity Labels: [['O', 'O', 'O', 'O', 'O', 'B-PHONE_NO'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ACCOUNT_NO']]


In [None]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Tokenize words
# word_tokenizer = Tokenizer(lower=False)
# word_tokenizer.fit_on_texts(sentences)
# X = word_tokenizer.texts_to_sequences(sentences)

In [36]:
X

[[15, 20, 23, 6, 24, 153],
 [40, 6, 73, 18, 9, 5, 1, 10, 154],
 [70, 18, 71, 72, 30, 2, 29, 114, 134],
 [7, 16, 22, 1, 3, 155],
 [7, 16, 22, 1, 3, 156],
 [7, 98, 21, 3, 93, 129],
 [2, 35, 74, 5, 36, 1, 157],
 [13, 3, 91, 129, 48],
 [2, 35, 74, 5, 36, 1, 158],
 [2, 12, 46, 19, 105, 134],
 [7, 14, 1, 3, 159],
 [7, 14, 1, 3, 160],
 [15, 64, 65, 9, 5, 1, 19, 161],
 [2, 12, 115, 119],
 [7, 16, 22, 1, 3, 162],
 [106, 28, 6, 122, 107],
 [99, 2, 11, 163, 4, 42],
 [7, 14, 1, 3, 164],
 [7, 16, 22, 1, 3, 165],
 [2, 12, 25, 17, 1, 166],
 [40, 6, 73, 18, 9, 5, 1, 10, 167],
 [2, 12, 25, 17, 1, 168],
 [15, 20, 23, 6, 24, 169],
 [40, 6, 73, 18, 9, 5, 1, 10, 170],
 [7, 16, 22, 1, 3, 171],
 [13, 3, 9, 14, 1, 10, 172],
 [2, 83, 8, 11, 173, 4],
 [13, 3, 9, 14, 1, 10, 174],
 [106, 28, 6, 105, 116],
 [2, 29, 41, 8, 26, 27, 175, 4],
 [7, 14, 1, 3, 176],
 [13, 3, 31, 1, 57, 36, 9, 5, 10, 177],
 [7, 50, 5, 1, 3, 178],
 [7, 14, 1, 3, 179],
 [13, 3, 9, 14, 1, 10, 180],
 [34, 47, 6, 8, 11, 181, 4],
 [7, 16, 22, 1

In [None]:
# # Tokenize labels
# label_encoder = LabelEncoder()
# all_labels = [label for sentence in labels for label in sentence]  # Flatten list
# label_encoder.fit(all_labels)

# # Convert labels to numbers
# y = [[label_encoder.transform([lbl])[0] for lbl in sentence] for sentence in labels]
# y

[[np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(3)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(0)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(2),
  np.int64(4)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(3)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(3)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(2),
  np.int64(4)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(0)],
 [np.int64(5), np.int64(5), np.int64(2), np.int64(4), np.int64(5)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(0)],
 [np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(5),
  np.int64(2),
  np.int6

In [None]:
# # Padding sequences
# max_len = max(len(seq) for seq in X)
# X = pad_sequences(X, maxlen=max_len, padding="post")
# y = pad_sequences(y, maxlen=max_len, padding="post", value=label_encoder.transform(["O"])[0])

# # One-hot encoding for labels
# y = np.array([to_categorical(seq, num_classes=len(label_encoder.classes_)) for seq in y])

# print("Padded Sentences:", X[:2])
# print("Padded Labels:", y[:2])

Padded Sentences: [[ 15  20  23   6  24 153   0   0   0   0   0   0]
 [ 40   6  73  18   9   5   1  10 154   0   0   0]]
Padded Labels: [[[0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]]

 [[0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1.]]]


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional
# from tensorflow.keras.utils import to_categorical

# # Define Model
# model = Sequential([
#     Embedding(input_dim=len(word_tokenizer.word_index) + 1, output_dim=50, input_length=max_len),
#     Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.2)),
#     TimeDistributed(Dense(len(label_encoder.classes_), activation="softmax"))
# ])

# model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# model.fit(X, y, batch_size=8, epochs=30, verbose=1)  # More epochs, larger batch size

Epoch 1/30




[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 23ms/step - accuracy: 0.9360 - loss: 0.3263
Epoch 2/30
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 25ms/step - accuracy: 0.9999 - loss: 0.0010
Epoch 3/30
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 31ms/step - accuracy: 0.9999 - loss: 8.5801e-04
Epoch 4/30
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 31ms/step - accuracy: 1.0000 - loss: 3.3413e-04
Epoch 5/30
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 29ms/step - accuracy: 1.0000 - loss: 3.2895e-04
Epoch 6/30
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 27ms/step - accuracy: 1.0000 - loss: 3.3267e-04
Epoch 7/30
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 32ms/step - accuracy: 0.9999 - loss: 3.7023e-04
Epoch 8/30
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 31ms/step - accuracy: 0.9999 - loss: 2.6793

<keras.src.callbacks.history.History at 0x2750f3b2000>

In [None]:
# def predict_entities(sentence):
#     words = nltk.word_tokenize(sentence)
#     sequence = word_tokenizer.texts_to_sequences([words])
#     sequence = pad_sequences(sequence, maxlen=max_len, padding="post")

#     predictions = model.predict(sequence)[0]
#     predicted_labels = [label_encoder.inverse_transform([np.argmax(p)])[0] for p in predictions]

#     # Extract only words labeled as B- or I-
#     extracted_entities = []
#     current_entity = []
#     current_label = None

#     for word, label in zip(words, predicted_labels):
#         if label.startswith("B-"):
#             if current_entity:
#                 extracted_entities.append((" ".join(current_entity), current_label))  # Store previous entity
#             current_entity = [word]  # Start new entity
#             current_label = label[2:]  # Remove "B-" prefix
#         elif label.startswith("I-") and current_entity:
#             current_entity.append(word)  # Add to current entity

#     # Store last entity if exists
#     if current_entity:
#         extracted_entities.append((" ".join(current_entity), current_label))

#     return extracted_entities

# # Test Example
# test_sentence = "My name is 8797124489"
# predictions = predict_entities(test_sentence)
# print("Predicted Entities:", predictions)



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
Predicted Entities: []
