In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertTokenizerFast, BertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("tokenizer_date")
model = AutoModelForTokenClassification.from_pretrained("model_date")


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)

In [35]:
pipe("I have a meeting on 24th April at 3PM and 25th April at 4PM")

[{'entity': 'B-DATE',
  'score': 0.99277323,
  'index': 6,
  'word': '24th',
  'start': 20,
  'end': 24},
 {'entity': 'I-DATE',
  'score': 0.94976634,
  'index': 7,
  'word': 'April',
  'start': 25,
  'end': 30},
 {'entity': 'B-TIME',
  'score': 0.9579224,
  'index': 9,
  'word': '3',
  'start': 34,
  'end': 35},
 {'entity': 'I-TIME',
  'score': 0.9816777,
  'index': 10,
  'word': '##PM',
  'start': 35,
  'end': 37},
 {'entity': 'B-DATE',
  'score': 0.97738975,
  'index': 12,
  'word': '25th',
  'start': 42,
  'end': 46},
 {'entity': 'I-DATE',
  'score': 0.7439361,
  'index': 13,
  'word': 'April',
  'start': 47,
  'end': 52},
 {'entity': 'I-TIME',
  'score': 0.58697575,
  'index': 14,
  'word': 'at',
  'start': 53,
  'end': 55},
 {'entity': 'B-TIME',
  'score': 0.94118047,
  'index': 15,
  'word': '4',
  'start': 56,
  'end': 57},
 {'entity': 'I-TIME',
  'score': 0.97838575,
  'index': 16,
  'word': '##PM',
  'start': 57,
  'end': 59}]

In [30]:
entities_ = pipe("I have a meeting on monday")

date = None
time = None

for entity in entities:
    if entity['entity'] == 'B-DATE':
        date = entity['word']
    elif entity['entity'] == 'I-DATE':
        date += ' ' + entity['word']
    elif entity['entity'] == 'B-TIME':
        time = entity['word']
    elif entity['entity'] == 'I-TIME':
        time += entity['word'].replace('##', '')

result = {
    'date': date,
    'time': time
}

print(result)

{'date': 'Monday', 'time': '3pm'}


In [34]:
entities = pipe("I have a meeting on 24th April at 3PM and 25th April at 4PM")

date = []
time = []

for entity in entities:
    if entity['entity'] == 'B-DATE':
        word_date = entity['word']
    elif entity['entity'] == 'I-DATE':
        word_date += ' ' + entity['word']
        date.append(word_date)
    elif entity['entity'] == 'B-TIME':
        word_time = entity['word']
    elif entity['entity'] == 'I-TIME':
        word_time += entity['word'].replace('##', '')
        time.append(word_time)

result = {
    'date': date,
    'time': time
}

print(result)

{'date': ['24th April', '25th April'], 'time': ['3PM', '3PMat', '4PM']}


In [1]:
entities = pipe("Meet with John tomorrow at 5PM and on Monday at 3PM")

import datetime

def merge_entities(entities):
    merged_entities = {'date': [], 'time': []}
    current_date = []
    current_time = []
    
    for entity in entities:
        if entity['entity'].startswith('B-DATE'):
            if current_date:
                merged_entities['date'].append(''.join(current_date))
            current_date = [entity['word']]
        elif entity['entity'].startswith('I-DATE'):
            current_date.append(entity['word'])
        elif entity['entity'].startswith('B-TIME'):
            if current_time:
                merged_entities['time'].append(''.join(current_time).replace('##', ''))
            current_time = [entity['word'].replace('##', '')]
        elif entity['entity'].startswith('I-TIME'):
            current_time.append(entity['word'].replace('##', ''))
    
    if current_date:
        merged_entities['date'].append(''.join(current_date))
    if current_time:
        merged_entities['time'].append(''.join(current_time))
    
    
    return merged_entities

merged = merge_entities(entities)

print(merged)

NameError: name 'pipe' is not defined

In [None]:
model = 

In [23]:
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, BertTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset


model = BertForTokenClassification.from_pretrained('model_loc')
tokenizer = BertTokenizerFast.from_pretrained('tokenizer_loc')

class CoNLL2003Dataset(Dataset):
    def __init__(self, split, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        dataset = load_dataset("conll2003")
        self.data = dataset[split]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Extract tokens and NER tags for the current sentence
        words = self.data[idx]['tokens']
        ner_tags = self.data[idx]['ner_tags']  # These are already integers

        # Tokenize words and encode NER tags
        tokenized_input = self.tokenizer(words, is_split_into_words=True, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
        input_ids = tokenized_input['input_ids'].squeeze()
        attention_mask = tokenized_input['attention_mask'].squeeze()

        # Prepare label tensor, initializing with -100 to ignore loss calculation for padding
        labels = torch.full((self.max_len,), fill_value=-100, dtype=torch.long)

        # Update labels with actual NER tags, considering the tokenizer's word-to-token mapping
        token_to_word_map = tokenized_input.word_ids(batch_index=0)
        label_index = 0
        for token_index, word_index in enumerate(token_to_word_map):
            if word_index is not None:  # Tokens corresponding to words carry over the NER tag
                if token_index < self.max_len:  # Ensure we don't exceed max_len due to padding
                    labels[token_index] = ner_tags[word_index]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }


valid_dataset = CoNLL2003Dataset("validation", tokenizer)   

In [24]:
from seqeval.metrics import precision_score, recall_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    true_predictions = [
        [label_list[p] for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(preds, labels)
    ]
    true_labels = [
        [label_list[l] for (l, p) in zip(label, pred) if l != -100]
        for label, pred in zip(labels, preds)
    ]

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [25]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [26]:
from transformers import Trainer, TrainingArguments
import torch 


training_args = TrainingArguments(
    output_dir="path/to/output/dir",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

eval_results = trainer.evaluate(eval_dataset=valid_dataset)
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")






[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A

NameError: name 'label_list' is not defined

In [19]:
import time

# Get the current timezone name (abbreviation)
def get_tz():
    current_zone = time.tzname[time.daylight]
    return current_zone



Current timezone abbreviation: India Standard Time
