In [1]:
import json
import spacy
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
doc = nlp("Donald Trump was President of USA")

In [4]:
doc.ents

(Donald Trump, USA)

In [5]:
displacy.render(doc, style="ent", jupyter=True)

In [7]:
doc = nlp('''
The patient was prescribed Aspirin for their heart condition.
The doctor recommended Ibuprofen to alleviate the patient's headache.
The patient is suffering from diabetes, and they need to take Metformin regularly.
After the surgery, the patient experienced some post-operative complications, including infection.
The patient is currently on a regimen of Lisinopril to manage their high blood pressure.
The antibiotic course for treating the bacterial infection should be completed as prescribed.
The patient's insulin dosage needs to be adjusted to better control their blood sugar levels.
The physician suspects that the patient may have pneumonia and has ordered a chest X-ray.
The patient's cholesterol levels are high, and they have been advised to take Atorvastatin.
The allergy to penicillin was noted in the patient's medical history.
''')
displacy.render(doc, style="ent", jupyter=True)

In [13]:
with open("corona2.json") as med:
    data = json.loads(med.read())

In [15]:
training_data = [
        {
            "text": example["content"],
            "entities": [
                (annotation["start"], annotation["end"], annotation["tag_name"].upper())
                for annotation in example["annotations"]
            ],
        }
        for example in data["examples"]
    ]

In [18]:
from spacy.tokens import DocBin
from tqdm import tqdm

In [19]:
c_nlp = spacy.blank("en")
doc_bin = DocBin()

In [20]:
from spacy.util import filter_spans

In [22]:
for training_example in tqdm(training_data):
    text = training_example["text"]
    labels = training_example["entities"]
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.set_ents(filtered_ents)
    doc_bin.add(doc)
doc_bin.to_disk("train.spacy")

100%|██████████| 31/31 [00:00<00:00, 257.61it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





In [None]:
!python -m spacy init fill-config base_config.cfg config.cf

In [None]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./train.spacy --output ./

In [6]:
nlp_ner = spacy.load("model-best")

In [8]:
doc = nlp_ner('''
The patient was prescribed Aspirin for their heart condition.
The doctor recommended Ibuprofen to alleviate the patient's headache.
The patient is suffering from diabetes, and they need to take Metformin regularly.
After the surgery, the patient experienced some post-operative complications, including infection.
The patient is currently on a regimen of Lisinopril to manage their high blood pressure.
The antibiotic course for treating the bacterial infection should be completed as prescribed.
The patient's insulin dosage needs to be adjusted to better control their blood sugar levels.
The physician suspects that the patient may have pneumonia and has ordered a chest X-ray.
The patient's cholesterol levels are high, and they have been advised to take Atorvastatin.
The allergy to penicillin was noted in the patient's medical history.
''')

In [9]:
spacy.displacy.render(doc, style="ent", jupyter=True)

# Custom NER Model using Keras 

In [2]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

# Example dataset
sentences = [
    ["John", "lives", "in", "Paris"],
    ["Steve", "works", "at", "Google"]
]

tags = [
    ["B-PER", "O", "O", "B-LOC"],
    ["B-PER", "O", "O", "B-ORG"]
]

# Word Tokenizer
tokenizer = Tokenizer(lower=False, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Convert words to integer sequences
X = tokenizer.texts_to_sequences(sentences)
print(X)


[[2, 3, 4, 5], [6, 7, 8, 9]]


In [3]:
# Label Encoder for the tags
tag_encoder = LabelEncoder()
tag_encoder.fit([tag for sentence in tags for tag in sentence])
tag_index = tag_encoder.classes_
num_tags = len(tag_index)

# Convert tags to integer sequences
y = [[tag_encoder.transform(tag) for tag in sentence] for sentence in tags]
# print(y)

ValueError: y should be a 1d array, got an array of shape () instead.

In [None]:
# Pad sequences for equal length
max_len = max(len(s) for s in X)
X_padded = pad_sequences(X, maxlen=max_len, padding="post")
y_padded = pad_sequences(y, maxlen=max_len, padding="post")

# Convert y to one-hot encoded format for categorical cross-entropy
y_padded = [to_categorical(i, num_classes=num_tags) for i in y_padded]
y_padded = np.array(y_padded)

print(f"Vocabulary Size: {vocab_size}")
print(f"Number of Tags: {num_tags}")

# Custom NER model using bert based models

Reference: https://medium.com/@pasdan/building-custom-named-entity-recognition-ner-models-transformers-9759f8d547d8

In [25]:
epochs = 15
model_checkpoint = 'bert-base-cased'
model_output_checkpoint = 'transformers/bnk_stmt_token_2022'

entity_groups = [
  'TIME',
  'PERIOD',
  'TEAM',
  'PLAYER',
  'POSITION',
  'FORMATION',
  'EVENT',
  'DIRECTION',
  'QUANTITY',
  'UNITS'
]
entity_groups = [
    "MODE",
    "BANK",
    "VPA",
    "FLAT",
    "REFERENCE",
    "TRANSACTION",
    "BANK_INT",
]

labels = ['O'] + \
  [f'B-{label}' for label in entity_groups] + \
  [f'I-{label}' for label in entity_groups]

label2id = { label:i for i, label in enumerate(labels) }
id2label = { i:label for i, label in enumerate(labels) }

In [26]:
import os
import re
import json
import random
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from extr_ds.manager.utils.filesystem import load_document

def align_labels(tokenized_inputs, label_list):
  labels = []
  for word_idx in tokenized_inputs.word_ids(batch_index=0):
    label_id = -100
    if not word_idx is None:
      label =  re.sub(r'^[BI]-(.+)$', r'I-\g<1>', label_list[word_idx]) \
        if word_idx == previous_word_idx \
        else label_list[word_idx]

      label_id = label2id[label]

    labels.append(label_id)
    previous_word_idx = word_idx

  return labels

def get_dataset(tokenizer, model):
  def tokenize_and_align_labels(record):
    tokenized_inputs = tokenizer(
      record['tokens'],
      truncation=True,
      is_split_into_words=True
    )
  
    tokenized_inputs['labels'] = align_labels(
      tokenized_inputs,
      record['labels']
    )
  
    return tokenized_inputs

  file_nm = "ents-iob.json"
  file_nm = "D:\\Abiz\\Technical\\code\\python\\poc-trial-solution\\src\\account\\stmt-iob.json"
  ents_dataset = json.loads(
    load_document(file_nm)
  )

  random.shuffle(ents_dataset)

  pivot = int(len(ents_dataset) * .8)
  data_collator = DataCollatorForTokenClassification(
    tokenizer,
    return_tensors='tf'
  )
  
  train_dataset = Dataset.from_list(ents_dataset[:pivot])
  tf_train_set = model.prepare_tf_dataset(
    train_dataset.map(
      tokenize_and_align_labels,
      batched=False
    ),
    shuffle=True,
    collate_fn=data_collator,
  )

  test_dataset = Dataset.from_list(ents_dataset[pivot:])
  tf_test_set = model.prepare_tf_dataset(
    test_dataset.map(
      tokenize_and_align_labels,
      batched=False
    ),
    shuffle=True,
    collate_fn=data_collator,
  )

  return tf_train_set, tf_test_set

In [27]:
import numpy
import evaluate
from transformers.keras_callbacks import KerasMetricCallback

seqeval = evaluate.load('seqeval')

def compute_metrics(preds):
  predictions, actuals = preds
  predictions = numpy.argmax(predictions, axis=2)

  results = seqeval.compute(
    predictions=[
      [labels[p] for p, l in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, actuals)
    ],
    references=[
      [labels[l] for p, l in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, actuals)
    ]
  )

  return {
    key: results[f'overall_{key}']
    for key in ['precision', 'recall', 'f1', 'accuracy']
  }


In [28]:
import re
import tensorflow as tf
import tf_keras
from transformers import AutoTokenizer, \
                         TFAutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained(
  model_checkpoint
)

model = TFAutoModelForTokenClassification.from_pretrained(
  model_checkpoint,
  num_labels=len(labels),
  id2label=id2label,
  label2id=label2id
)

tf_train_set, tf_test_set = get_dataset(tokenizer, model)

optimizer = tf_keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer)

callbacks = [
  KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=
    tf_test_set
  ),
  tf_keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
]

model.fit(
  x=tf_train_set,
  validation_data=tf_test_set,
  epochs=epochs,
  callbacks=callbacks
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1420/1420 [00:00<00:00, 1567.35 examples/s]
Map: 100%|██████████| 356/356 [00:00<00:00, 2000.64 examples/s]


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15


<tf_keras.src.callbacks.History at 0x2ab7c979ca0>

In [29]:
for model_to_save in [tokenizer, model]:
  model_to_save.save_pretrained(model_output_checkpoint)

In [32]:
from transformers import pipeline

classifier = pipeline(
    'ner', 
    model=model_output_checkpoint,
    aggregation_strategy='simple'
)

examples = [
  '(6:51 - 1st) (Shotgun) P.Mahomes scrambles right end to LAC 34 for 2 yards (S.Joseph; K.Van Noy). FUMBLES (S.Joseph), and recovers at LAC 34.',
]
examples = [
    "BIL/INFT/DI14574246/407September202/P VIDYA SAGAR /",
    "INF/INFT/037554667241/A3062770ecf4cd9ad5c04ef9b6679c8e11/",
    "INF/INFT/037554196201/A6012769586032fcc6e441818d4d7ef6d4/VIVISHTECHNOLO"
]

responses = classifier(examples)
print(responses)



Some layers from the model checkpoint at transformers/bnk_stmt_token_2022 were not used when initializing TFBertForTokenClassification: ['dropout_417']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at transformers/bnk_stmt_token_2022.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


[[{'entity_group': 'MODE', 'score': 0.9994918, 'word': 'INFT', 'start': 4, 'end': 8}, {'entity_group': 'TRANSACTION', 'score': 0.99929965, 'word': 'DI14574246', 'start': 9, 'end': 19}, {'entity_group': 'FLAT', 'score': 0.9981221, 'word': '407', 'start': 20, 'end': 23}], [{'entity_group': 'MODE', 'score': 0.99952734, 'word': 'INFT', 'start': 4, 'end': 8}, {'entity_group': 'REFERENCE', 'score': 0.99970555, 'word': '037554667241', 'start': 9, 'end': 21}], [{'entity_group': 'MODE', 'score': 0.9995543, 'word': 'INFT', 'start': 4, 'end': 8}, {'entity_group': 'REFERENCE', 'score': 0.9996999, 'word': '037554196201', 'start': 9, 'end': 21}, {'entity_group': 'FLAT', 'score': 0.93533444, 'word': '##60', 'start': 23, 'end': 25}, {'entity_group': 'BANK', 'score': 0.99970233, 'word': 'VIVISHTECHNOLO', 'start': 57, 'end': 71}]]
