In [27]:
!pip install transformers datasets seqeval torch accelerate




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [112]:
import json
from datasets import Dataset

# Load dataset
# with open("datasets/generated_ner_dataset.json", "r", encoding="utf-8") as f:
#     data = json.load(f)
# with open("datasets/refined_ner_dataset.json", "r", encoding="utf-8") as f:
with open("datasets/output.json", "r", encoding="utf-8") as f:
    data = json.load(f)
print(len(data))
with open("datasets/shared/temp.json", "r", encoding="utf-8") as f:
    expanded_data = json.load(f)
# Convert to Hugging Face Dataset format
for item in expanded_data:
    data.append(item)
print(len(data))
dataset = Dataset.from_list(data)

dataset = dataset.train_test_split(test_size=0.1)  # Split into train and test sets
train_dataset = dataset["train"]
test_dataset = dataset["test"]


289
349


In [113]:
# Extract unique tags
unique_tags = list(set(tag for sample in data for tag in sample["ner_tags"]))
print(unique_tags)  # Check unique tags

# Create label mapping
label2id = {label: i for i, label in enumerate(unique_tags)}
id2label = {i: label for label, i in label2id.items()}

print(label2id)  # Check label mappings


['I-ACTION', 'B-COMP_OP', 'I-QUESTION_TYPE', '0', 'B-TITLE', 'B-LOCATION', 'B-ORG', 'I-TIME_FRAME', 'I-LOCATION', 'I-DATE', 'B-ACTION', 'I-COUNT', 'I-ALERT_TYPE', 'O', 'B-ALERT_TYPE', 'B-DATE', 'B-COUNT', 'I-PERSON', 'B-PERSON', 'I-COMP_OP', 'B-TIME_FRAME', 'B-QUESTION_TYPE']
{'I-ACTION': 0, 'B-COMP_OP': 1, 'I-QUESTION_TYPE': 2, '0': 3, 'B-TITLE': 4, 'B-LOCATION': 5, 'B-ORG': 6, 'I-TIME_FRAME': 7, 'I-LOCATION': 8, 'I-DATE': 9, 'B-ACTION': 10, 'I-COUNT': 11, 'I-ALERT_TYPE': 12, 'O': 13, 'B-ALERT_TYPE': 14, 'B-DATE': 15, 'B-COUNT': 16, 'I-PERSON': 17, 'B-PERSON': 18, 'I-COMP_OP': 19, 'B-TIME_FRAME': 20, 'B-QUESTION_TYPE': 21}


In [88]:
from transformers import AutoTokenizer

model_checkpoint = "Davlan/xlm-roberta-base-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [114]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)

    labels = []
    word_ids = tokenized_inputs.word_ids()  # Maps subwords to words
    ner_tags = example["ner_tags"]  # Original labels

    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)  # Ignore special tokens
        elif word_idx != previous_word_idx:
            if word_idx < len(ner_tags):
                labels.append(label2id[ner_tags[word_idx]])  # Assign full label to first subword
            else:
                labels.append(-100)  # Handle rare out-of-bounds cases
        else:
            if word_idx < len(ner_tags):
                labels.append(label2id[ner_tags[word_idx]])  # Keep the same label
            else:
                labels.append(-100)

        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


train_dataset = train_dataset.map(tokenize_and_align_labels)
test_dataset = test_dataset.map(tokenize_and_align_labels)

Map: 100%|██████████| 314/314 [00:00<00:00, 3878.51 examples/s]
Map: 100%|██████████| 35/35 [00:00<00:00, 2927.00 examples/s]


# Checking if the mapping was done successfully:

In [111]:
sample = train_dataset[315]  # Get the first example
tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])  # Convert token IDs to words
labels = sample["labels"]  # Get corresponding labels

for token, label in zip(tokens, labels):
    print(f"{token:15} --> {id2label.get(label, 'IGNORED')}")


IndexError: Invalid key: 315 is out of bounds for size 314

In [118]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True 
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-ner-hrl and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([22]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([22, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
print("Label to ID Mapping:", label2id)
print("ID to Label Mapping:", id2label)
print("Number of Labels:", len(label2id))

Label to ID Mapping: {'I-COUNT': 0, 'B-LOCATION': 1, 'B-ALERT_TYPE': 2, 'I-ALERT_TYPE': 3, 'B-QUESTION_TYPE': 4, 'B-COMP_OP': 5, 'O': 6, 'I-QUESTION_TYPE': 7, 'I-TIME_FRAME': 8, 'I-COMP_OP': 9, 'B-COUNT': 10, 'B-DATE': 11, 'B-TIME_FRAME': 12, 'I-ACTION': 13, 'B-ACTION': 14, 'I-LOCATION': 15, 'I-DATE': 16, '0': 17}
ID to Label Mapping: {0: 'I-COUNT', 1: 'B-LOCATION', 2: 'B-ALERT_TYPE', 3: 'I-ALERT_TYPE', 4: 'B-QUESTION_TYPE', 5: 'B-COMP_OP', 6: 'O', 7: 'I-QUESTION_TYPE', 8: 'I-TIME_FRAME', 9: 'I-COMP_OP', 10: 'B-COUNT', 11: 'B-DATE', 12: 'B-TIME_FRAME', 13: 'I-ACTION', 14: 'B-ACTION', 15: 'I-LOCATION', 16: 'I-DATE', 17: '0'}
Number of Labels: 18


In [119]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./ner_expanded_bert_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,  # Adjust as needed
    weight_decay=0.01,
    push_to_hub=False,
)


In [120]:
from transformers import Trainer, DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)


In [121]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.62886
2,No log,0.254966
3,No log,0.185381
4,No log,0.154395
5,No log,0.137424
6,No log,0.144185
7,No log,0.083427
8,No log,0.157427
9,No log,0.112022
10,No log,0.120749


TrainOutput(global_step=800, training_loss=0.15705700397491454, metrics={'train_runtime': 1621.6009, 'train_samples_per_second': 3.873, 'train_steps_per_second': 0.493, 'total_flos': 69766598593440.0, 'train_loss': 0.15705700397491454, 'epoch': 20.0})

In [122]:
!pip install evaluate




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [123]:
import numpy as np
import evaluate  # ✅ New library for metrics

# Load the seqeval metric for NER
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [[id2label[p] for p, l in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

trainer.compute_metrics = compute_metrics
results = trainer.evaluate()
print(results)


{'eval_loss': 0.140985906124115, 'eval_precision': 0.9487179487179487, 'eval_recall': 0.9487179487179487, 'eval_f1': 0.9487179487179487, 'eval_accuracy': 0.9696969696969697, 'eval_runtime': 0.4339, 'eval_samples_per_second': 80.671, 'eval_steps_per_second': 11.524, 'epoch': 20.0}


In [124]:
trainer.save_model("./ner_expanded_model")
tokenizer.save_pretrained("./ner_expanded_model")


('./ner_expanded_model\\tokenizer_config.json',
 './ner_expanded_model\\special_tokens_map.json',
 './ner_expanded_model\\sentencepiece.bpe.model',
 './ner_expanded_model\\added_tokens.json',
 './ner_expanded_model\\tokenizer.json')

In [151]:
from transformers import pipeline

nlp_ner = pipeline("ner", model="./ner_expanded_model", tokenizer="./ner_expanded_model", aggregation_strategy="simple")

# text = "i would like you to present me with how much ice cream i ate in the last 5 days?"
text = "Missiles and rockets were fired at Tel Aviv yesterday."
text = "is there any ice cream left in the freezer?"
text = "תראה לי את כל התראות טילים במתן."
text = "היו אתמול התראות חדירת כלי טיס עוין בתל אביב?"
# text = "היו גלידות בסופר בנצרת אתמול??"
result = nlp_ner(text)
result
# labels = [entity["entity_group"] for entity in result]
# labels
# if not "ALERT_TYPE" in [entity["entity_group"] for entity in result]:
#     print(result)
# text = "הייתה חדירת מחבלים אתמול??"
# text = "كم عدد إنذارات الصواريخ التي كانت في تل أبيب أمس؟"


Device set to use cpu


[{'entity_group': 'QUESTION_TYPE',
  'score': np.float32(0.85400486),
  'word': 'היו',
  'start': 0,
  'end': 3},
 {'entity_group': 'TIME_FRAME',
  'score': np.float32(0.97834855),
  'word': 'אתמול',
  'start': 3,
  'end': 9},
 {'entity_group': 'ALERT_TYPE',
  'score': np.float32(0.31018448),
  'word': 'ראות',
  'start': 12,
  'end': 16},
 {'entity_group': 'LOCATION',
  'score': np.float32(0.9909981),
  'word': 'בתל אביב',
  'start': 35,
  'end': 44},
 {'entity_group': 'LOCATION',
  'score': np.float32(0.39654586),
  'word': '?',
  'start': 44,
  'end': 45}]

# We need to add more exsamples of questions in different topics to avoid overffiting model

In [73]:
from datasets import load_dataset

# Load the Yahoo Answers dataset
dataset = load_dataset("yahoo_answers_topics")

In [128]:
dataset["train"][1]
text = dataset["train"][4]["question_title"]
text = "Missiles and rockets were fired at Tel Aviv yesterday."
result = nlp_ner(text)
print(result)


KeyError: 'question_title'

In [None]:
def merge_subwords(ner_results):
    merged_entities = []
    current_entity = None

    for entity in ner_results:
        if current_entity and entity["entity_group"] == current_entity["entity_group"]:
            # Ensure words are consecutive, otherwise treat as separate
            if entity["start"] == current_entity["end"]:  
                current_entity["word"] += entity["word"]  # Merge subword
            else:
                merged_entities.append(current_entity)
                current_entity = entity.copy()  # Start a new entity

            current_entity["end"] = entity["end"]  # Update end position
            current_entity["score"] = (current_entity["score"] + entity["score"]) / 2  # Average confidence
        else:
            if current_entity:
                merged_entities.append(current_entity)
            current_entity = entity.copy()

    if current_entity:
        merged_entities.append(current_entity)

    return merged_entities

In [79]:
import json
expanded = []
for i in range(0,150):
    expanded.append(dataset["train"][i]["question_title"])
with open("datasets/expanded.json", "w", encoding="utf-8") as f:
    json.dump(expanded, f, ensure_ascii=False, indent=4)
# print()

In [84]:
with open("datasets/shared/temp.json", "r", encoding="utf-8") as f:
    data = json.load(f)
len(data)

60

In [154]:
!pip install dateparser

Collecting dateparser
  Downloading dateparser-1.2.1-py3-none-any.whl.metadata (29 kB)
Collecting tzlocal>=0.2 (from dateparser)
  Downloading tzlocal-5.3.1-py3-none-any.whl.metadata (7.6 kB)
Downloading dateparser-1.2.1-py3-none-any.whl (295 kB)
   ---------------------------------------- 0.0/295.7 kB ? eta -:--:--
   ---- ----------------------------------- 30.7/295.7 kB 1.3 MB/s eta 0:00:01
   --------- ----------------------------- 71.7/295.7 kB 991.0 kB/s eta 0:00:01
   -------------------- ------------------- 153.6/295.7 kB 1.5 MB/s eta 0:00:01
   --------------------------- ------------ 204.8/295.7 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 295.7/295.7 kB 1.7 MB/s eta 0:00:00
Downloading tzlocal-5.3.1-py3-none-any.whl (18 kB)
Installing collected packages: tzlocal, dateparser
Successfully installed dateparser-1.2.1 tzlocal-5.3.1



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [155]:
import dateparser
from datetime import datetime

def convert_time_frame(time_string):
    """
    Parses natural language time expressions and converts them to a date format.
    """
    parsed_date = dateparser.parse(time_string)
    if parsed_date:
        return parsed_date.strftime("%Y-%m-%d")  # Convert to string for query
    return time_string  # Return original if not recognized

In [165]:
print(convert_time_frame("1 of july 2024"))  # Test the function

2024-07-01


In [167]:
!pip install fuzzywuzzy




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [168]:
from fuzzywuzzy import process

def get_best_match(user_input, category):
    """
    Finds the closest entity match from the database/config.
    """
    options = ENTITY_MAPPINGS.get(category, {}).keys()
    best_match, score = process.extractOne(user_input, options)
    
    # If confidence is high, return the normalized value
    return ENTITY_MAPPINGS[category].get(best_match, user_input) if score > 80 else user_input

