In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("tokenizer")
model = AutoModelForTokenClassification.from_pretrained("model")


##### Process text sample (from wikipedia)

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
nlp("Schedule a meeting with mohit on 24th jan at the Lake view hotel at 2pm")

[{'entity_group': 'DATE',
  'score': 0.917465,
  'word': '24th jan',
  'start': 32,
  'end': 41},
 {'entity_group': 'LOC',
  'score': 0.9353217,
  'word': 'Lake view hotel',
  'start': 48,
  'end': 64}]

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("djagatiya/ner-bert-base-cased-ontonotesv5-englishv4")
model = AutoModelForTokenClassification.from_pretrained("djagatiya/ner-bert-base-cased-ontonotesv5-englishv4")

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)

In [29]:
model.save_pretrained("model_date")
tokenizer.save_pretrained("tokenizer_date")

('tokenizer_date\\tokenizer_config.json',
 'tokenizer_date\\special_tokens_map.json',
 'tokenizer_date\\vocab.txt',
 'tokenizer_date\\added_tokens.json',
 'tokenizer_date\\tokenizer.json')

In [15]:
import torch  

# Define a function to process the sentence and return entities
def identify_entities(sentence):
  encoded_input = tokenizer(sentence, return_tensors="pt")  # Tokenize and convert to tensors
  with torch.no_grad():  # Disable gradient calculation for faster inference
    outputs = model(**encoded_input)  # Pass input through the model
    logits = outputs.logits  # Get the logits
    predictions = torch.argmax(logits, dim=-1)  # Get the most likely label for each token (already integers)

  decoded_tokens = tokenizer.convert_ids_to_tokens(encoded_input["input_ids"].squeeze().tolist())

  # Create a list to store entities
  entities = []
  entity_name = ""
  entity_type = ""
  for token, prediction in zip(decoded_tokens, predictions.squeeze().tolist()):
    # Check for B- or I- tags to identify entity start and continuation
    if prediction.startswith("B-"):
      entity_type = prediction[2:]  # Extract entity type from label (e.g., B-LOC)
      entity_name = token  # Start a new entity
    elif prediction.startswith("I-"):
      # If the type matches the current entity, continue adding tokens
      if entity_type == prediction[2:]:
        entity_name += " " + token
      else:
        # Different entity type encountered, save the previous entity and start a new one
        entities.append((entity_name, entity_type))
        entity_type = prediction[2:]
        entity_name = token
    # Check for single-word entities (O tag not required)
    elif token != "[PAD]":  # Ignore padding tokens
      entities.append((token, "O"))  # "O" indicates "Outside" of an entity
  # Add the last entity if it wasn't saved yet
  if entity_name:
    entities.append((entity_name, entity_type))

  return entities

# Example usage
sentence = "Schedule a meeting with mohit on 24th Jan at 2pm"
entities = identify_entities(sentence)

# Print the identified entities
print(entities)


AttributeError: 'int' object has no attribute 'startswith'

In [16]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="djagatiya/ner-bert-base-cased-ontonotesv5-englishv4")

In [28]:


def extract_date_time(entities):
  """
  Extracts date and time entities from the model output.

  Args:
      entities: A list of dictionaries representing model predictions.
          Each dictionary contains information like 'entity' type, 'word', etc.

  Returns:
      A dictionary with keys 'date' and 'time' containing the identified date and time,
      or None if none are found.
  """
  date_entity = None
  time_entity = None
  for entity in entities:
    # Consider only B-DATE or B-TIME entities
    if entity['entity'].startswith('B-'):
      # If B-DATE encountered, reset time_entity and store date
      if entity['entity'] == 'B-DATE':
        time_entity = None
        date_entity = entity['word']
      # If B-TIME encountered, store time only if no previous I-TIME was seen
      elif entity['entity'] == 'B-TIME' and not time_entity:
        time_entity = entity['word']

  # Combine date parts if necessary (assuming '28th' and 'Jan' are separate entities)
#   if date_entity and entities[entities.index(entity) + 1]['entity'] == 'I-DATE':
#     date_entity += " " + entities[entities.index(entity) + 1]['word']
    if date_entity and entities.index(entity) < len(entities) - 1:  # Check if next element exists
        next_entity = entities[entities.index(entity) + 1]
        if next_entity['entity'] == 'I-DATE':
            date_entity += " " + next_entity['word']

  return {'date': date_entity, 'time': time_entity}

# Example usage:
# model_output = [{'entity': 'B-DATE', ...}, ...] 
sentence = pipe("I have a meeting with the CEO on 28th Jan at 2AM")
date_time_info = extract_date_time(sentence)
print(date_time_info)


{'date': '28th Jan', 'time': '2'}
