In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast
import torch
from torch.utils.data import Dataset



In [99]:
import tensorflow as tf
if tf.config.list_physical_devices('GPU'):
    print("GPU is available. Using GPU settings.")
else:
    print("GPU not available. Falling back to CPU settings.")




config = {
          'MAX_LEN': 128,
          'batch_size':8,
          'model_name':'NER_model.h5'
         }



GPU not available. Falling back to CPU settings.


In [3]:
df = pd.read_csv(
    "ner_dataset.csv",
    delimiter=',',
    quoting=3,
    encoding='latin1',
    names=["Sentence #", "Word", "POS", "Tag"],
    skiprows=1,
    on_bad_lines='skip'  # Skips malformed rows
)

print(df.head(10))


    Sentence #           Word  POS    Tag
0  Sentence: 1      Thousands  NNS      O
1          NaN             of   IN      O
2          NaN  demonstrators  NNS      O
3          NaN           have  VBP      O
4          NaN        marched  VBN      O
5          NaN        through   IN      O
6          NaN         London  NNP  B-geo
7          NaN             to   TO      O
8          NaN        protest   VB      O
9          NaN            the   DT      O


In [4]:

# Fill sentence numbers
df['Sentence #'] = df['Sentence #'].fillna(method='ffill')

df["Sentence #"]


  df['Sentence #'] = df['Sentence #'].fillna(method='ffill')


0              Sentence: 1
1              Sentence: 1
2              Sentence: 1
3              Sentence: 1
4              Sentence: 1
                ...       
1014292    Sentence: 47959
1014293    Sentence: 47959
1014294    Sentence: 47959
1014295    Sentence: 47959
1014296    Sentence: 47959
Name: Sentence #, Length: 1014297, dtype: object

In [5]:
# Group sentences
agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(), s["Tag"].values.tolist())]
sentences = df.groupby("Sentence #").apply(agg_func).tolist()

  sentences = df.groupby("Sentence #").apply(agg_func).tolist()


In [6]:
sentences[1]

[('Iranian', 'B-gpe'),
 ('officials', 'O'),
 ('say', 'O'),
 ('they', 'O'),
 ('expect', 'O'),
 ('to', 'O'),
 ('get', 'O'),
 ('access', 'O'),
 ('to', 'O'),
 ('sealed', 'O'),
 ('sensitive', 'O'),
 ('parts', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('plant', 'O'),
 ('Wednesday', 'B-tim'),
 ('after', 'O'),
 ('an', 'O'),
 ('IAEA', 'B-org'),
 ('surveillance', 'O'),
 ('system', 'O'),
 ('begins', 'O'),
 ('functioning', 'O'),
 ('.', 'O')]

In [7]:
# Build tag vocab
unique_tags = list(set(df["Tag"].dropna()))
tag2id = {tag: i+1 for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}
num_labels = len(tag2id)

In [8]:
tag2id

{'I-org': 1,
 'I-gpe': 2,
 'B-tim': 3,
 'I-geo': 4,
 'I-per': 5,
 'I-art': 6,
 'B-nat': 7,
 'I-eve': 8,
 'O': 9,
 'I-nat': 10,
 'B-org': 11,
 'B-eve': 12,
 'B-per': 13,
 'B-gpe': 14,
 'B-art': 15,
 'B-geo': 16,
 'I-tim': 17}

In [9]:
id2tag

{1: 'I-org',
 2: 'I-gpe',
 3: 'B-tim',
 4: 'I-geo',
 5: 'I-per',
 6: 'I-art',
 7: 'B-nat',
 8: 'I-eve',
 9: 'O',
 10: 'I-nat',
 11: 'B-org',
 12: 'B-eve',
 13: 'B-per',
 14: 'B-gpe',
 15: 'B-art',
 16: 'B-geo',
 17: 'I-tim'}

In [10]:
num_labels

17

In [11]:
# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [12]:
unique_tags = sorted(set(tag for sent in sentences for _, tag in sent if pd.notnull(tag)))
label2id = {tag: idx for idx, tag in enumerate(unique_tags)}


In [13]:
label2id

{'B-art': 0,
 'B-eve': 1,
 'B-geo': 2,
 'B-gpe': 3,
 'B-nat': 4,
 'B-org': 5,
 'B-per': 6,
 'B-tim': 7,
 'I-art': 8,
 'I-eve': 9,
 'I-geo': 10,
 'I-gpe': 11,
 'I-nat': 12,
 'I-org': 13,
 'I-per': 14,
 'I-tim': 15,
 'O': 16}

In [98]:
def tokenize_and_align_labels(sentences, label2id, tokenizer):
    tokenized_inputs = []
    label_ids_list = []

    for idx, sentence in enumerate(sentences):
        sentence = [(w, t) for w, t in sentence if pd.notnull(w) and pd.notnull(t)]

        if not sentence:
            continue  # skip empty or all-nan sentences

        words, tags = zip(*sentence)

        # Tokenize the sentence
        encoding = tokenizer(
            list(words),
            is_split_into_words=True,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt' 
        )

        word_ids = encoding.word_ids()
        label_ids = []

        #for word_idx in word_ids:
         #   if word_idx is None:
          #      label_ids.append(-100)
           # else:
            #    label_ids.append(label2id.get(tags[word_idx], 0

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(0)  # Padding position
            else:
                tag = tags[word_idx]
                label_ids.append(label2id.get(tag, 0))  # Unknown tags → PAD (0)



        tokenized_input = {
            "sentence_id": f"Sentence: {idx + 1}",
            "words": list(words),
            "tokens": tokenizer.convert_ids_to_tokens(encoding["input_ids"][0]),
            "input_ids": encoding["input_ids"][0].tolist(),
            "token_type_ids": encoding["token_type_ids"][0].tolist(),
            "attention_mask": encoding["attention_mask"][0].tolist(),
            "offset_mapping": [tuple(x.tolist()) for x in encoding["offset_mapping"][0]],
            "labels": label_ids
        }


        #encoding["labels"] = label_ids
        tokenized_inputs.append(tokenized_input)
        label_ids_list.append(label_ids)

    return tokenized_inputs, label_ids_list

In [15]:
tokenized_inputs, label_ids = tokenize_and_align_labels(sentences, label2id, tokenizer)

In [16]:
tokenized_inputs[0]

{'sentence_id': 'Sentence: 1',
 'words': ['Thousands',
  'of',
  'demonstrators',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.'],
 'tokens': ['[CLS]',
  'Thousands',
  'of',
  'demons',
  '##tra',
  '##tors',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[P

In [17]:
# Custom Dataset
from torch.utils.data import Dataset
import torch
class NERDataset(Dataset):
    
    def __init__(self, tokenized_inputs):
        self.data = tokenized_inputs
    
    def __len__(self):
        return len(self.data)
    

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.data[idx].items()
                if key in ['input_ids', 'attention_mask', 'token_type_ids', 'labels']}
        return item




In [18]:
dataset = NERDataset(tokenized_inputs)

In [19]:
dataset[0]

{'input_ids': tensor([  101, 26159,  1104,  8568,  4487,  5067,  1138,  9639,  1194,  1498,
          1106,  5641,  1103,  1594,  1107,  5008,  1105,  4555,  1103, 10602,
          1104,  1418,  2830,  1121,  1115,  1583,   119,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [20]:
import pandas as pd

# Step 1: Convert each sample into a list of dicts
records = []

for item in dataset:  # Assuming dataset is like a list of samples (dicts)
    record = {
        'input_ids': item['input_ids'].tolist(),
        'token_type_ids': item['token_type_ids'].tolist(),
        'attention_mask': item['attention_mask'].tolist(),
        'labels': item['labels'].tolist()
    }
    records.append(record)

# Step 2: Convert to DataFrame
df = pd.DataFrame(records)

# Show result
print(df.head())


                                           input_ids  \
0  [101, 26159, 1104, 8568, 4487, 5067, 1138, 963...   
1  [101, 7239, 3878, 1474, 1152, 5363, 1106, 1243...   
2  [101, 1124, 8031, 4184, 2083, 3832, 12526, 430...   
3  [101, 1220, 1286, 1170, 170, 8901, 2396, 118, ...   
4  [101, 158, 119, 151, 119, 3893, 13443, 4945, 1...   

                                      token_type_ids  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [21]:
df.head()

Unnamed: 0,input_ids,token_type_ids,attention_mask,labels
0,"[101, 26159, 1104, 8568, 4487, 5067, 1138, 963...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 16, 16, 16, 16, 16, 16, 16, 16, 2, 16, 16,..."
1,"[101, 7239, 3878, 1474, 1152, 5363, 1106, 1243...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,..."
2,"[101, 1124, 8031, 4184, 2083, 3832, 12526, 430...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 16, 16, 16, 16, 16, 16, 7, 16, 16, 16, 16,..."
3,"[101, 1220, 1286, 1170, 170, 8901, 2396, 118, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16..."
4,"[101, 158, 119, 151, 119, 3893, 13443, 4945, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 2, 2, 2, 2, 16, 16, 6, 14, 14, 14, 16, 7, ..."


In [22]:
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=100)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=100)

In [23]:
train_data.shape, val_data.shape , test_data.shape

((38367, 4), (4796, 4), (4796, 4))

In [24]:
train_input_ids = np.array(train_data['input_ids'].tolist())
train_token_type_ids = np.array(train_data['token_type_ids'].tolist())
train_attention_mask = np.array(train_data['attention_mask'].tolist())
train_labels = np.array(train_data['labels'].tolist())

valid_input_ids = np.array(val_data['input_ids'].tolist())
valid_token_type_ids = np.array(val_data['token_type_ids'].tolist())
valid_attention_mask = np.array(val_data['attention_mask'].tolist())
valid_labels = np.array(val_data['labels'].tolist())

test_input_ids = np.array(test_data['input_ids'].tolist())
test_token_type_ids = np.array(test_data['token_type_ids'].tolist())
test_attention_mask = np.array(test_data['attention_mask'].tolist())
test_labels = np.array(test_data['labels'].tolist())

print(train_input_ids.shape, train_token_type_ids.shape, train_attention_mask.shape ,train_labels.shape)
print(valid_input_ids.shape, valid_token_type_ids.shape, valid_attention_mask.shape ,valid_labels.shape)
print(test_input_ids.shape, test_token_type_ids.shape, test_attention_mask.shape ,test_labels.shape)

(38367, 128) (38367, 128) (38367, 128) (38367, 128)
(4796, 128) (4796, 128) (4796, 128) (4796, 128)
(4796, 128) (4796, 128) (4796, 128) (4796, 128)


In [25]:
from tensorflow.keras.utils import to_categorical

num_classes = len(tag2id) + 1  # Including PAD class 0

# One-hot encode the labels
train_labels_cat = to_categorical(train_labels, num_classes=num_classes)
valid_labels_cat = to_categorical(valid_labels, num_classes=num_classes)
test_labels_cat = to_categorical(test_labels, num_classes=num_classes)

In [26]:
import tensorflow.keras.backend as K
K.clear_session()

In [27]:
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf


In [28]:
from transformers import TFBertForTokenClassification
from tensorflow.keras.losses import CategoricalCrossentropy


encoder = TFBertForTokenClassification.from_pretrained("bert-base-cased", output_hidden_states=True)

# NER Model
input_ids = Input(shape=(config['MAX_LEN'],), dtype=tf.int32)
token_type_ids = Input(shape=(config['MAX_LEN'],), dtype=tf.int32)
attention_mask = Input(shape=(config['MAX_LEN'],), dtype=tf.int32)

embedding = encoder(input_ids,
                    token_type_ids=token_type_ids,
                    attention_mask=attention_mask
                   )[0]

output_logits = Dense(len(tag2id)+1, activation='softmax')(embedding)

model = Model(inputs=[input_ids, token_type_ids, attention_mask],
              outputs=[output_logits]
             )
optimizer = Adam(learning_rate=3e-5)
loss =  CategoricalCrossentropy()

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_for_token_classifi  TFTokenClassifierOutput(lo   1077212   ['input_1[0][0]',             
 cation (TFBertForTokenClas  ss=None, logits=(None, 128   18         'input_3[0][0]',         

In [33]:
history = model.fit([train_input_ids,
                     train_token_type_ids,
                     train_attention_mask],
                    train_labels_cat,
                    epochs=1,
                    validation_data=([valid_input_ids,
                                      valid_token_type_ids,
                                      valid_attention_mask],
                                     valid_labels_cat),
                    batch_size=config['batch_size'],
                    callbacks=[save_model, tensorboard_callback]
                   )

Epoch 1: val_loss improved from inf to 0.14653, saving model to .\model_512d_001.h5


In [95]:
import os
os.system("taskkill /F /IM tensorboard.exe")


128

In [92]:
%reload_ext tensorboard



In [93]:
import datetime
import tensorflow as tf

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    profile_batch=5
)

model_name = "./model_512d_{epoch:03d}.h5"
save_model = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_name,
    save_weights_only=True,
    save_best_only=True,
    verbose=1
)

In [94]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 9840), started 1 day, 10:16:21 ago. (Use '!kill 9840' to kill it.)

In [41]:
np.unique(valid_labels)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])

In [37]:
#tokenizer.save_pretrained('./my_tokenizer')

tokenizer.save_pretrained("ner_model_tokenizer/")

('ner_model_tokenizer/tokenizer_config.json',
 'ner_model_tokenizer/special_tokens_map.json',
 'ner_model_tokenizer/vocab.txt',
 'ner_model_tokenizer/added_tokens.json',
 'ner_model_tokenizer/tokenizer.json')

In [42]:
# Save the full model (architecture + weights)
model.save("ner_bert_model")  # saves as a directory


INFO:tensorflow:Assets written to: ner_bert_model\assets


INFO:tensorflow:Assets written to: ner_bert_model\assets


In [44]:
# Load the weights
model.load_weights("model_512d_001.h5")  # or your best checkpoint


In [72]:
from seqeval.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

def evaluate_ner_model(model,
    valid_input_ids,
    valid_token_type_ids,
    valid_attention_mask,
    valid_labels, id2tag):
    
    
    y_pred = model.predict([valid_input_ids, valid_token_type_ids, valid_attention_mask], verbose=1)
    y_pred_ids = np.argmax(y_pred, axis=-1)

    true_tags = []
    pred_tags = []

    for i in range(len(valid_labels)):
        true_seq = []
        pred_seq = []
        for j in range(len(valid_labels[i])):
            true_id = valid_labels[i][j]
            pred_id = y_pred_ids[i][j]
            if true_id != 0 and pred_id != 0:
                true_seq.append(id2tag[true_id])
                pred_seq.append(id2tag[pred_id])

        true_tags.append(true_seq)
        pred_tags.append(pred_seq)

    print("\nClassification Report:\n")
    print(classification_report(true_tags, pred_tags))
    print(f"Accuracy: {accuracy_score(true_tags, pred_tags):.4f}")
    print(f"Precision: {precision_score(true_tags, pred_tags):.4f}")
    print(f"Recall: {recall_score(true_tags, pred_tags):.4f}")
    print(f"F1 Score: {f1_score(true_tags, pred_tags):.4f}")


In [49]:
evaluate_ner_model(
    model,
    valid_input_ids,
    valid_token_type_ids,
    valid_attention_mask,
    valid_labels,
    id2tag
)



Classification Report:



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00      2167
         eve       0.00      0.00      0.00        21
         geo       0.93      0.95      0.94     92545
         gpe       0.80      0.36      0.50      7014
         nat       0.50      0.18      0.26      2749
         org       0.00      0.03      0.00        40
         per       0.64      0.65      0.65      3850
         tim       0.00      0.00      0.00       162

   micro avg       0.89      0.86      0.87    108548
   macro avg       0.36      0.27      0.29    108548
weighted avg       0.88      0.86      0.86    108548

Accuracy: 0.8639
Precision: 0.8903
Recall: 0.8569
F1 Score: 0.8733


In [71]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ner_model_tokenizer/")

def ner_inference(text, model, tokenizer, id2tag):
    
    encoding = tokenizer(
        text.split(), 
        return_tensors="np",
        truncation=True,
        padding='max_length',
        max_length=128,
        is_split_into_words=True
    )

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    token_type_ids = encoding["token_type_ids"]

    
    predictions = model.predict([input_ids, token_type_ids, attention_mask])
    pred_ids = np.argmax(predictions, axis=-1)

    # Map tokens back to words
    word_ids = encoding.word_ids(batch_index=0)

    previous_word_idx = None
    result = []

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == previous_word_idx:
            continue  # skip special tokens and subwords
        predicted_label = id2tag.get(pred_ids[0][idx], "O")
        word = text.split()[word_idx]
        result.append((word, predicted_label))
        previous_word_idx = word_idx

    return result


In [61]:
if 0 not in id2tag:
    id2tag[0] = 'PAD'


In [70]:
text = "dhoni visited America"

result = ner_inference(text, model, tokenizer, id2tag)

for token, tag in result:
    print(f"{token}: {tag}")

dhoni: I-per
visited: B-geo
America: I-geo


if we run more epochs we get more better scores and tags,  but i didn't have that much effecient lap. i have only cpu lap no gpu supported

for single epoch we get this much accuracy if we run 5 to 6 epoch we get 90+ accuracy

TensorBoard is asked me to restart the kernal so if i do that then i lost everything like model training so only i leave it like this.