In [27]:
!pip install transformers datasets seqeval torch accelerate




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import json
from datasets import Dataset

# Load dataset
# with open("datasets/generated_ner_dataset.json", "r", encoding="utf-8") as f:
#     data = json.load(f)
# with open("datasets/refined_ner_dataset.json", "r", encoding="utf-8") as f:
with open("datasets/output.json", "r", encoding="utf-8") as f:
    data = json.load(f)
# Convert to Hugging Face Dataset format
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1)  # Split into train and test sets

train_dataset = dataset["train"]
test_dataset = dataset["test"]


In [15]:
# Extract unique tags
unique_tags = list(set(tag for sample in data for tag in sample["ner_tags"]))
print(unique_tags)  # Check unique tags

# Create label mapping
label2id = {label: i for i, label in enumerate(unique_tags)}
id2label = {i: label for label, i in label2id.items()}

print(label2id)  # Check label mappings


['I-COUNT', 'B-LOCATION', 'B-ALERT_TYPE', 'I-ALERT_TYPE', 'B-QUESTION_TYPE', 'B-COMP_OP', 'O', 'I-QUESTION_TYPE', 'I-TIME_FRAME', 'I-COMP_OP', 'B-COUNT', 'B-DATE', 'B-TIME_FRAME', 'I-ACTION', 'B-ACTION', 'I-LOCATION', 'I-DATE', '0']
{'I-COUNT': 0, 'B-LOCATION': 1, 'B-ALERT_TYPE': 2, 'I-ALERT_TYPE': 3, 'B-QUESTION_TYPE': 4, 'B-COMP_OP': 5, 'O': 6, 'I-QUESTION_TYPE': 7, 'I-TIME_FRAME': 8, 'I-COMP_OP': 9, 'B-COUNT': 10, 'B-DATE': 11, 'B-TIME_FRAME': 12, 'I-ACTION': 13, 'B-ACTION': 14, 'I-LOCATION': 15, 'I-DATE': 16, '0': 17}


In [16]:
from transformers import AutoTokenizer

model_checkpoint = "Davlan/xlm-roberta-base-ner-hrl"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)

    labels = []
    word_ids = tokenized_inputs.word_ids()  # Maps subwords to words
    ner_tags = example["ner_tags"]  # Original labels

    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)  # Ignore special tokens
        elif word_idx != previous_word_idx:
            if word_idx < len(ner_tags):
                labels.append(label2id[ner_tags[word_idx]])  # Assign full label to first subword
            else:
                labels.append(-100)  # Handle rare out-of-bounds cases
        else:
            if word_idx < len(ner_tags):
                labels.append(label2id[ner_tags[word_idx]])  # Keep the same label
            else:
                labels.append(-100)

        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


train_dataset = train_dataset.map(tokenize_and_align_labels)
test_dataset = test_dataset.map(tokenize_and_align_labels)

Map: 100%|██████████| 260/260 [00:00<00:00, 3570.67 examples/s]
Map: 100%|██████████| 29/29 [00:00<00:00, 2900.21 examples/s]


# Checking if the mapping was done successfully:

In [18]:
sample = train_dataset[100]  # Get the first example
tokens = tokenizer.convert_ids_to_tokens(sample["input_ids"])  # Convert token IDs to words
labels = sample["labels"]  # Get corresponding labels

for token, label in zip(tokens, labels):
    print(f"{token:15} --> {id2label.get(label, 'IGNORED')}")


<s>             --> IGNORED
▁Retrieve       --> B-ACTION
▁the            --> O
▁number         --> B-COUNT
▁of             --> O
▁drill          --> B-ALERT_TYPE
▁alert          --> I-ALERT_TYPE
s               --> I-ALERT_TYPE
▁on             --> O
▁March          --> B-DATE
▁5              --> I-DATE
▁20             --> I-DATE
24              --> I-DATE
</s>            --> IGNORED


In [19]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True 
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/xlm-roberta-base-ner-hrl and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([18]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([18, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
print("Label to ID Mapping:", label2id)
print("ID to Label Mapping:", id2label)
print("Number of Labels:", len(label2id))

Label to ID Mapping: {'I-COUNT': 0, 'B-LOCATION': 1, 'B-ALERT_TYPE': 2, 'I-ALERT_TYPE': 3, 'B-QUESTION_TYPE': 4, 'B-COMP_OP': 5, 'O': 6, 'I-QUESTION_TYPE': 7, 'I-TIME_FRAME': 8, 'I-COMP_OP': 9, 'B-COUNT': 10, 'B-DATE': 11, 'B-TIME_FRAME': 12, 'I-ACTION': 13, 'B-ACTION': 14, 'I-LOCATION': 15, 'I-DATE': 16, '0': 17}
ID to Label Mapping: {0: 'I-COUNT', 1: 'B-LOCATION', 2: 'B-ALERT_TYPE', 3: 'I-ALERT_TYPE', 4: 'B-QUESTION_TYPE', 5: 'B-COMP_OP', 6: 'O', 7: 'I-QUESTION_TYPE', 8: 'I-TIME_FRAME', 9: 'I-COMP_OP', 10: 'B-COUNT', 11: 'B-DATE', 12: 'B-TIME_FRAME', 13: 'I-ACTION', 14: 'B-ACTION', 15: 'I-LOCATION', 16: 'I-DATE', 17: '0'}
Number of Labels: 18


In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./ner289_bert_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,  # Adjust as needed
    weight_decay=0.01,
    push_to_hub=False,
)




In [22]:
from transformers import Trainer, DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)


In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.5221
2,No log,0.151079
3,No log,0.115601
4,No log,0.121565
5,No log,0.115242
6,No log,0.126951
7,No log,0.126246
8,No log,0.121083
9,No log,0.141645
10,No log,0.133065


TrainOutput(global_step=660, training_loss=0.16136073126937403, metrics={'train_runtime': 1199.3207, 'train_samples_per_second': 4.336, 'train_steps_per_second': 0.55, 'total_flos': 56583014428512.0, 'train_loss': 0.16136073126937403, 'epoch': 20.0})

In [132]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
   ---------------------------------------- 0.0/84.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/84.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/84.0 kB ? eta -:--:--
   ------------------- -------------------- 41.0/84.0 kB 487.6 kB/s eta 0:00:01
   ---------------------------------------- 84.0/84.0 kB 784.8 kB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
import numpy as np
import evaluate  # ✅ New library for metrics

# Load the seqeval metric for NER
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [[id2label[p] for p, l in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

trainer.compute_metrics = compute_metrics
results = trainer.evaluate()
print(results)


{'eval_loss': 0.16159310936927795, 'eval_precision': 0.9784172661870504, 'eval_recall': 0.9444444444444444, 'eval_f1': 0.96113074204947, 'eval_accuracy': 0.9778325123152709, 'eval_runtime': 0.332, 'eval_samples_per_second': 87.342, 'eval_steps_per_second': 12.047, 'epoch': 20.0}


In [25]:
trainer.save_model("./ner289_model")
tokenizer.save_pretrained("./ner289_model")


('./ner289_model\\tokenizer_config.json',
 './ner289_model\\special_tokens_map.json',
 './ner289_model\\sentencepiece.bpe.model',
 './ner289_model\\added_tokens.json',
 './ner289_model\\tokenizer.json')

In [10]:
from transformers import pipeline

nlp_ner = pipeline("ner", model="./ner289_model", tokenizer="./ner289_model", aggregation_strategy="simple")

# text = "i would like you to present me with how much ice cream i ate in the last 5 days?"
text = "Missiles and rockets were fired at Tel Aviv yesterday."
text = "is there any ice cream left in the freezer?"
# text = "תראה לי את כל התראות טילים במתן."
# text = "كم عدد إنذارات الصواريخ التي كانت في تل أبيب أمس؟"
result = nlp_ner(text)
print(result)


Device set to use cpu


[{'entity_group': 'QUESTION_TYPE', 'score': np.float32(0.98894346), 'word': 'is there', 'start': 0, 'end': 8}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.97951037), 'word': '', 'start': 12, 'end': 13}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.9857738), 'word': 'ice', 'start': 13, 'end': 16}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.910088), 'word': 'cream', 'start': 16, 'end': 22}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.5212113), 'word': 'free', 'start': 34, 'end': 39}]


# We need to add more exsamples of questions in different topics to avoid overffiting model

In [73]:
from datasets import load_dataset

# Load the Yahoo Answers dataset
dataset = load_dataset("yahoo_answers_topics")

In [74]:
dataset["train"][1]
text = dataset["train"][4]["question_title"]
text = "Missiles and rockets were fired at Tel Aviv yesterday."
result = nlp_ner(text)
print(result)


[{'entity_group': 'ALERT_TYPE', 'score': np.float32(0.9979), 'word': 'Miss', 'start': 0, 'end': 4}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.9985795), 'word': 'ile', 'start': 4, 'end': 7}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.99788636), 'word': 's', 'start': 7, 'end': 8}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.9987429), 'word': 'rock', 'start': 12, 'end': 17}, {'entity_group': 'ALERT_TYPE', 'score': np.float32(0.99791807), 'word': 'ets', 'start': 17, 'end': 20}, {'entity_group': 'LOCATION', 'score': np.float32(0.99774563), 'word': 'Tel Aviv', 'start': 34, 'end': 43}, {'entity_group': 'TIME_FRAME', 'score': np.float32(0.9790999), 'word': 'yesterday', 'start': 43, 'end': 53}]


In [None]:
def merge_subwords(ner_results):
    merged_entities = []
    current_entity = None

    for entity in ner_results:
        if current_entity and entity["entity_group"] == current_entity["entity_group"]:
            # Ensure words are consecutive, otherwise treat as separate
            if entity["start"] == current_entity["end"]:  
                current_entity["word"] += entity["word"]  # Merge subword
            else:
                merged_entities.append(current_entity)
                current_entity = entity.copy()  # Start a new entity

            current_entity["end"] = entity["end"]  # Update end position
            current_entity["score"] = (current_entity["score"] + entity["score"]) / 2  # Average confidence
        else:
            if current_entity:
                merged_entities.append(current_entity)
            current_entity = entity.copy()

    if current_entity:
        merged_entities.append(current_entity)

    return merged_entities

In [79]:
import json
expanded = []
for i in range(0,150):
    expanded.append(dataset["train"][i]["question_title"])
with open("datasets/expanded.json", "w", encoding="utf-8") as f:
    json.dump(expanded, f, ensure_ascii=False, indent=4)
# print()