In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# torch==2.2.1
# numpy==1.26.4
!pip install datasets==2.18.0 transformers==4.38.2 seqeval accelerate -q
# git+https://github.com/csebuetnlp/normalizer

In [None]:
import torch
import random
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
from datasets import ClassLabel, Sequence, load_metric
from sklearn.model_selection import train_test_split

from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "google-bert/bert-base-uncased"
batch_size = 64

In [None]:
import ast
dataset = pd.read_csv('/content/drive/MyDrive/NER_CS/NER_main_Datas.csv', converters={'tokens': ast.literal_eval, 'tags': ast.literal_eval})

In [None]:
dataset["tags"][0][3]

'I-version'

In [None]:
dataset["tokens"][0]

['ORF', 'aspnet_client', 'casino', '4.8']

In [None]:
dataset.shape

(5126, 2)

In [None]:
dataset = dataset.rename(columns={'tags': 'ner_tags'})
dataset.head()

Unnamed: 0,tokens,ner_tags
0,"[ORF, aspnet_client, casino, 4.8]","[O, O, O, I-version]"
1,"[3.6.3, CVE-2010-3646, Forms]","[B-version, B-cve id, B-application]"
2,"[0091, 1.15, Mandrill, repository/lib.php, HDW...","[B-version, B-version, B-application, B-file, ..."
3,"[do, gfxTextRun::GetUserData, vxveautil.kv_bin...","[O, B-function, B-function, O, I-version]"
4,"[util/doh/runner.html, JSDependentString::unde...","[B-file, B-function, I-version]"


In [None]:
 # Split the data into training, validation, and test sets.
# Step 1: hold out 25% of the rows; the remaining 75% becomes the training set.
train, X_temp = train_test_split(dataset, test_size=0.25, random_state=42)
# Step 2: split the hold-out again. test_size=0.05 is a fraction of X_temp (not of
# the full data), so the test set is roughly 1.25% of all rows and validation the
# other ~23.75% of all rows.
# NOTE(review): a test split this small may be unintentional — confirm.
validation, test = train_test_split(X_temp, test_size=0.05, random_state=42)

In [None]:
train = train.reset_index(drop=True)
validation = validation.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
 # Create a datasets.Dataset object
train = Dataset.from_dict(train)
validation = Dataset.from_dict(validation)
test = Dataset.from_dict(test)
# Print the dataset to check the format
print("Train Dataset : ",train)
print("Validation Dataset : ", validation)
print("Test Dataset : ", test)

Train Dataset :  Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3844
})
Validation Dataset :  Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 1217
})
Test Dataset :  Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 65
})


In [None]:
# Create DatasetDict
datasets = DatasetDict({
    'validation': validation,
    'test': test,
    'train': train
})
print("Dataset: \n", datasets)

Dataset: 
 DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1217
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 65
    })
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3844
    })
})


In [None]:
datasets["test"][0]

{'tokens': ['c_timelimit', 'Japan', '7z', 'iconvert.c'],
 'ner_tags': [23, 23, 14, 23]}

In [None]:
from datasets import ClassLabel, Sequence

# Either provide the label names yourself:
# label_names = [...]
# or derive them from the data, as done here.
# The vocabulary is collected from every split (not only "train"), so a tag that
# happens to occur only in validation/test still gets an id and can be encoded by
# the ClassLabel cast. Sorting makes the tag -> id assignment deterministic.
label_names = sorted(
    {tag for split in datasets.values() for tags in split["ner_tags"] for tag in tags}
)

# Cast the string tags to integer ClassLabel ids in all splits.
datasets = datasets.cast_column("ner_tags", Sequence(ClassLabel(names=label_names)))

Casting the dataset:   0%|          | 0/1217 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/65 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3844 [00:00<?, ? examples/s]

In [None]:
print(datasets["train"].features[f"ner_tags"])

Sequence(feature=ClassLabel(names=['B-PL', 'B-application', 'B-cve id', 'B-edition', 'B-file', 'B-function', 'B-hardware', 'B-language', 'B-method', 'B-os', 'B-parameter', 'B-relevant_term', 'B-update', 'B-vendor', 'B-version', 'I-application', 'I-edition', 'I-hardware', 'I-os', 'I-relevant_term', 'I-update', 'I-vendor', 'I-version', 'O'], id=None), length=-1, id=None)


In [None]:
label_list = datasets["train"].features[f"{task}_tags"].feature.names
print("\nLabel Test: \n", label_list)


Label Test: 
 ['B-PL', 'B-application', 'B-cve id', 'B-edition', 'B-file', 'B-function', 'B-hardware', 'B-language', 'B-method', 'B-os', 'B-parameter', 'B-relevant_term', 'B-update', 'B-vendor', 'B-version', 'I-application', 'I-edition', 'I-hardware', 'I-os', 'I-relevant_term', 'I-update', 'I-vendor', 'I-version', 'O']


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
example = datasets["train"][4]
print(example["tokens"][0])

2.13.2


In [None]:
datasets["train"][4]

{'tokens': ['2.13.2',
  'JOSSO',
  'CVE-2013-2266',
  'PCI_EJ_BASE',
  'Operations',
  'CSCto71445'],
 'ner_tags': [22, 13, 2, 23, 1, 23]}

In [None]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', '2', '.', '13', '.', '2', 'jo', '##sso', 'cv', '##e', '-', '2013', '-', '226', '##6', 'pc', '##i', '_', 'e', '##j', '_', 'base', 'operations', 'cs', '##ct', '##o', '##7', '##14', '##45', '[SEP]']


In [None]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', '2', '.', '13', '.', '2', 'jo', '##sso', 'cv', '##e', '-', '2013', '-', '226', '##6', 'pc', '##i', '_', 'e', '##j', '_', 'base', 'operations', 'cs', '##ct', '##o', '##7', '##14', '##45', '[SEP]']


In [None]:
len(example[f"{task}_tags"]), len(tokenized_input["input_ids"])

(6, 30)

In [None]:
print(tokenized_input.word_ids())

[None, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, None]


In [None]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"{task}_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

30 30


In [None]:
label_all_tokens = True

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and expand word tags to sub-tokens.

    Reads the notebook globals `tokenizer`, `task` and `label_all_tokens`.

    Args:
        examples: batch mapping with a "tokens" column (one list of words per
            example) and a f"{task}_tags" column (one tag id per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry holding
        one label id per sub-token. Positions without a word id (special tokens)
        get -100 so they are ignored by the loss; continuation sub-tokens get the
        word's tag when `label_all_tokens` is truthy and -100 otherwise.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _expand(word_tags, word_ids):
        # Walk the sub-tokens left to right, remembering which word the previous
        # sub-token belonged to so that word starts can be told from continuations.
        expanded = []
        last_word = None
        for current_word in word_ids:
            if current_word is None:
                # Special token: masked out of the loss.
                expanded.append(-100)
            elif current_word != last_word or label_all_tokens:
                # First sub-token of a word, or a continuation that we chose to label.
                expanded.append(word_tags[current_word])
            else:
                # Unlabelled continuation sub-token.
                expanded.append(-100)
            last_word = current_word
        return expanded

    encoded["labels"] = [
        _expand(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples[f"{task}_tags"])
    ]
    return encoded


In [None]:
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[101, 3191, 1035, 3796, 10381, 4948, 2100, 22330, 6761, 6914, 5302, 2094, 1020, 1012, 1014, 1012, 1016, 1012, 13741, 12997, 21246, 1013, 10975, 22199, 16748, 3370, 1012, 2079, 102], [101, 1015, 1012, 1021, 1012, 1017, 4408, 21397, 1013, 4748, 10020, 1013, 9239, 1013, 8816, 1012, 2079, 1054, 7295, 2278, 2546, 2290, 14176, 102], [101, 1021, 1012, 1014, 1012, 1017, 10236, 102], [101, 26226, 2063, 1011, 2230, 1011, 19988, 2509, 4031, 1011, 2862, 1012, 25718, 9927, 4023, 1035, 5549, 1015, 1012, 1018, 1012, 4464, 1012, 1015, 6327, 1013, 18176, 1013, 4520, 7616, 1012, 25718, 1018, 1012, 1018, 1012, 1014, 102], [101, 1016, 1012, 2410, 1012, 1016, 8183, 24137, 26226, 2063, 1011, 2286, 1011, 21035, 2575, 7473, 2072, 1035, 1041, 3501, 1035, 2918, 3136, 20116, 6593, 2080, 2581, 16932, 19961, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0

In [None]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/1217 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/3844 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained encoder with a token-classification head sized to the tag set.
# Only num_labels is given (no id2label/label2id), so the model config carries no tag
# names — the inference cells further down print generic LABEL_<n> names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)
model = model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collator built from the tokenizer; handed to the Trainer below as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer)
# seqeval metric object; compute_metrics and the final evaluation cell call metric.compute.
metric = load_metric("seqeval")

# Sanity check: turn the tag ids of `example` into tag names and score them against
# themselves, which should yield perfect scores for every entity type present.
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])


  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'application': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'cve id': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'vendor': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'version': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Reads the notebook globals `label_list` and `metric`.

    Args:
        p: a (predictions, labels) pair; predictions are per-token class scores
            (argmax is taken over axis 2), labels are the reference ids with -100
            marking positions to ignore.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding "overall_*" entries of the seqeval result.
    """
    scores, references = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        # Keep only positions whose reference label is not the ignore index.
        kept = [
            (pred_id, ref_id)
            for pred_id, ref_id in zip(predicted_row, reference_row)
            if ref_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[ref_id] for _, ref_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [None]:
from transformers import Trainer, TrainingArguments

# Step counts are hard-coded for this run: the training log below ends at global
# step 1550 after 50 epochs, and the inference cells load "checkpoint-1550".
TOTAL_TRAIN_STEPS = 1550
LOG_EVERY_STEPS = TOTAL_TRAIN_STEPS // 5  # 310

training_args = TrainingArguments(
    f"/content/drive/MyDrive/NER_CS/Google-CS-{task}",  # output / checkpoint directory
    num_train_epochs=50,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Log every 310 steps. No eval_steps is given; the results table shows an
    # evaluation at each of these logging steps as well.
    logging_dir='./logs',
    logging_steps=LOG_EVERY_STEPS,
    logging_first_step=False,  # no extra log entry for the very first step
    evaluation_strategy='steps',
    # Write a checkpoint every 1550 steps, i.e. once, at the end of this run.
    save_strategy='steps',
    save_steps=TOTAL_TRAIN_STEPS,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
310,0.6007,0.962583,0.758664,0.717521,0.737519,0.718482
620,0.1616,1.538191,0.746193,0.732151,0.739105,0.715589
930,0.0622,1.773418,0.733418,0.745438,0.739379,0.714625
1240,0.0303,1.922833,0.734386,0.747712,0.740989,0.714005
1550,0.0193,1.978809,0.737492,0.743783,0.740624,0.714832


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1550, training_loss=0.17481418209691202, metrics={'train_runtime': 1750.8114, 'train_samples_per_second': 109.778, 'train_steps_per_second': 0.885, 'total_flos': 5673915613320768.0, 'train_loss': 0.17481418209691202, 'epoch': 50.0})

In [None]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.9788085222244263,
 'eval_precision': 0.7374923108468321,
 'eval_recall': 0.7437832807734064,
 'eval_f1': 0.7406244369514298,
 'eval_accuracy': 0.7148317779537863,
 'eval_runtime': 5.4111,
 'eval_samples_per_second': 224.907,
 'eval_steps_per_second': 1.848,
 'epoch': 50.0}

In [None]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

In [None]:
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_predictions

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'O',
  'O',
  'B-application',
  'B-application',
  'B-application',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function'],
 ['B-vendor',
  'B-vendor',
  'B-vendor',
  'B-vendor',
  'B-vendor',
  'I-application',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-version',
  'B-version',
  'B-version',
  'B-version',
  'B-version',
  'B-version',
  'B-version'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'I-version',
  'I-version',
  'I-version',
  'I-version',
  'I-version',
  'I-version',
  'I-version',
  'I-version',
  'I-version',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B

In [None]:
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels

[['B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'O',
  'O',
  'B-vendor',
  'B-vendor',
  'B-vendor',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-parameter',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function'],
 ['B-vendor',
  'B-vendor',
  'B-vendor',
  'B-vendor',
  'B-vendor',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-version',
  'B-version',
  'B-version',
  'B-version',
  'B-version',
  'B-version',
  'B-version'],
 ['B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-function',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
  'B-file',
 

In [None]:
results = metric.compute(predictions=true_predictions, references=true_labels)
print(results)

{'PL': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 5}, 'application': {'precision': 0.5009331840238895, 'recall': 0.5610367892976589, 'f1': 0.5292841648590021, 'number': 2392}, 'cve id': {'precision': 1.0, 'recall': 0.9985583853916387, 'f1': 0.9992786727578745, 'number': 2081}, 'edition': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 34}, 'file': {'precision': 0.9528392685274302, 'recall': 0.970429668354844, 'f1': 0.961554026709834, 'number': 6121}, 'function': {'precision': 0.7843892194609731, 'recall': 0.8524153670597185, 'f1': 0.8169886985052861, 'number': 2629}, 'hardware': {'precision': 0.3496932515337423, 'recall': 0.3081081081081081, 'f1': 0.32758620689655177, 'number': 185}, 'language': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 4}, 'method': {'precision': 0.22274881516587677, 'recall': 0.23039215686274508, 'f1': 0.22650602409638554, 'number': 204}, 'os': {'precision': 0.041666666666666664, 'recall': 0.024390243902439025, 'f1': 0.0307692307692307

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from pathlib import Path
import torch
# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Directory of the fine-tuned checkpoint (tokenizer and model are both loaded from it).
checkpoint_dir = Path("/content/drive/MyDrive/NER_CS/Google-CS-ner/checkpoint-1550")

# A tokenizer is not placed on a device, so no `device` argument is passed to it;
# only the model is moved.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForTokenClassification.from_pretrained(checkpoint_dir).to(device)

# Create NER pipeline. aggregation_strategy="simple" is the documented replacement
# for the deprecated grouped_entities=True and groups adjacent tokens the same way.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)

# Text to tag. Despite its name, `word_list` is a single raw string (not a list of
# words); the pipeline tokenizes it itself.
word_list = "repository/lib.php"
# Example row from the dataset this string was taken from:
# [0091, 1.15, Mandrill, repository/lib.php, HDW...	[B-version, B-version, B-application, B-file

# Get NER results: a list with one dict per detected entity group.
ner_results = nlp(word_list)
print(ner_results)


[{'entity_group': 'LABEL_4', 'score': 0.99962085, 'word': 'repository / lib. php', 'start': 0, 'end': 18}]




In [None]:
label_list

['B-PL',
 'B-application',
 'B-cve id',
 'B-edition',
 'B-file',
 'B-function',
 'B-hardware',
 'B-language',
 'B-method',
 'B-os',
 'B-parameter',
 'B-relevant_term',
 'B-update',
 'B-vendor',
 'B-version',
 'I-application',
 'I-edition',
 'I-hardware',
 'I-os',
 'I-relevant_term',
 'I-update',
 'I-vendor',
 'I-version',
 'O']

In [None]:
# Map the generic LABEL_<n> names emitted by the pipeline to readable tag names.
# The order follows label_list; note that LABEL_0 is spelled out here as
# 'B-Programming Language' whereas label_list has 'B-PL' (presumably a deliberate
# expansion — confirm).
label_mapping = {
    'LABEL_0': 'B-Programming Language',
    'LABEL_1': 'B-application',
    'LABEL_2': 'B-cve id',
    'LABEL_3': 'B-edition',
    'LABEL_4': 'B-file',
    'LABEL_5': 'B-function',
    'LABEL_6': 'B-hardware',
    'LABEL_7': 'B-language',
    'LABEL_8': 'B-method',
    'LABEL_9': 'B-os',
    'LABEL_10': 'B-parameter',
    'LABEL_11': 'B-relevant_term',
    'LABEL_12': 'B-update',
    'LABEL_13': 'B-vendor',
    'LABEL_14': 'B-version',
    'LABEL_15': 'I-application',
    'LABEL_16': 'I-edition',
    'LABEL_17': 'I-hardware',
    'LABEL_18': 'I-os',
    'LABEL_19': 'I-relevant_term',
    'LABEL_20': 'I-update',
    'LABEL_21': 'I-vendor',
    'LABEL_22': 'I-version',
    'LABEL_23': 'O'
}

# `word_list` is a single string, so it must not be zipped with the results (that
# would pair each entity with one *character* of the text). Iterate over the
# pipeline results instead and report each entity's own span of the input.
if not ner_results:
    # Print message for empty NER result
    print(f"No NER result found for: {word_list}")

for result in ner_results:
    # Extracting the NER label from the result
    ner_label = result['entity_group']
    # Slice the original text by the character offsets the pipeline reports, so the
    # exact input spelling is printed rather than the re-joined word pieces.
    entity_text = word_list[result['start']:result['end']]
    if ner_label in label_mapping:
        # Convert NER label to its corresponding category and print it
        ner_category = label_mapping[ner_label]
        print(f"{entity_text} ---> {ner_category}")
    else:
        print(f"Invalid NER label: {ner_label}")


repository/lib.php ---> B-file


In [None]:
# # Define a function to convert labels to desired format
# def convert_label(label):
#     if label == 'LABEL_0':
#         return 'B-Programming Language'
#     elif label == 'LABEL_1':
#         return 'B-application'
#     elif label == 'LABEL_2':
#         return 'B-cve id'
#     elif label == 'LABEL_3':
#         return 'B-edition'
#     elif label == 'LABEL_4':
#         return 'B-file'
#     elif label == 'LABEL_5':
#         return 'B-function'
#     elif label == 'LABEL_6':
#         return 'B-hardware'
#     elif label == 'LABEL_7':
#         return 'B-language'
#     elif label == 'LABEL_8':
#         return 'B-method'
#     elif label == 'LABEL_9':
#         return 'B-os'
#     elif label == 'LABEL_10':
#         return 'B-parameter'
#     elif label == 'LABEL_11':
#         return 'B-relevant_term'
#     elif label == 'LABEL_12':
#         return 'B-update'
#     elif label == 'LABEL_13':
#         return 'B-vendor'
#     elif label == 'LABEL_14':
#         return 'B-version'
#     elif label == 'LABEL_15':
#         return 'I-application'
#     elif label == 'LABEL_16':
#         return 'I-edition'
#     elif label == 'LABEL_17':
#         return 'I-hardware'
#     elif label == 'LABEL_18':
#         return 'I-os'
#     elif label == 'LABEL_19':
#         return 'I-relevant_term'
#     elif label == 'LABEL_20':
#         return 'I-update'
#     elif label == 'LABEL_21':
#         return 'I-vendor'
#     elif label == 'LABEL_22':
#         return 'I-version'
#     elif label == 'LABEL_23':
#         return 'O'

# # Iterate through each word and its corresponding NER result
# for word, result in zip(word_list, ner_results):
#   # Check if NER result is empty
#   if result:
#     # print(result['entity_group'])
#     # Extracting the NER label from the result
#     ner_label = result['entity_group']
#     # Check if the label is 'LABEL_1' or 'LABEL_5'
#     if ner_label in ('LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4', 'LABEL_5', 'LABEL_6', 'LABEL_7', 'LABEL_8', 'LABEL_9',
#               'LABEL_10', 'LABEL_11', 'LABEL_12', 'LABEL_13', 'LABEL_14', 'LABEL_15', 'LABEL_16', 'LABEL_17', 'LABEL_18',
#               'LABEL_19', 'LABEL_20', 'LABEL_21', 'LABEL_22', 'LABEL_23'):
#         # Convert NER label to its corresponding category
#         ner_category = convert_label(ner_label)
#         # Print the word and its NER category
#         print(f"{word_list} ---> {ner_category}")
#   else:
#     # Print message for empty NER result
#     print(f"No NER result found for: {word_list}")


repository/lib.php ---> B-file


# **To use the model for zero-shot named entity recognition, use the following prompt:**

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

def process(text, prompt, treshold=0.5, nlp=None):
  """Run NER on `prompt` + newline + `text` and return the entities found in `text`.

  Args:
    text: the text whose entities are wanted.
    prompt: instruction text placed before `text` in the model input.
    treshold: minimum score an entity needs to be kept (parameter name kept
      as-is, misspelling included, so existing keyword callers keep working).
    nlp: optional callable taking the full input string and returning a list of
      dicts with 'entity_group', 'score', 'start' and 'end' (e.g. an already
      built "ner" pipeline). When omitted, the fine-tuned checkpoint is loaded
      from Drive and a pipeline is built on the notebook-global `device`;
      passing `nlp` avoids reloading the model on every call.

  Returns:
    A list of dicts, one per kept entity, with:
      'span'  - the matching substring of `text`,
      'start' / 'end' - character offsets relative to `text`,
      'score' - the entity score,
      'Lavel' - the entity's label (key spelling kept for compatibility).
    Entities that start inside the prompt are dropped.
  """
  if nlp is None:
    checkpoint = "/content/drive/MyDrive/NER_CS/Google-CS-ner/checkpoint-1550"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    # Only aggregation_strategy is passed: the deprecated grouped_entities flag
    # conflicts with it and would take precedence over the requested 'first'.
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='first', device=device)

  # Concatenate prompt and text for the full input
  separator = "\n"
  input_ = f"{prompt}{separator}{text}"

  results = nlp(input_) # Run NLP on full input

  # `text` starts after the prompt AND the separator; forgetting the separator
  # shifts every reported span one character to the right.
  text_offset = len(prompt) + len(separator)

  processed_results = []

  for result in results:
    # Skip entities scoring below the threshold
    if result['score'] < treshold:
      continue

    # Convert offsets in the full input to offsets in `text`
    start = result['start'] - text_offset

    # Entity begins inside the prompt (or the separator) - skip it
    if start < 0:
      continue

    end = result['end'] - text_offset

    processed_results.append({
      'span': text[start:end],  # substring of the original text
      'start': start,
      'end': end,
      'score': result['score'],
      'Lavel': result['entity_group'],  # label of THIS entity, not the whole result list
    })

  return processed_results

prompt = """Identify the following entity classes in the text: computer

Text:
"""
text = """Apple was founded as Apple Computer Company on April 1, 1976, by Steve Wozniak, Steve Jobs (1955–2011) and Ronald Wayne to develop and sell Wozniak's Apple I personal computer.
It was incorporated by Jobs and Wozniak as Apple Computer, Inc. in 1977. The company's second computer, the Apple II, became a best seller and one of the first mass-produced microcomputers.
Apple went public in 1980 to instant financial success."""

results = process(text, prompt)

print(results)


In [None]:
prompt = """Identify the following entity classes in the text: computer

Text:
"""
text = """Apple was founded as Apple Computer Company on April 1, 1976, by Steve Wozniak, Steve Jobs (1955–2011) and Ronald Wayne to develop and sell Wozniak's Apple I personal computer.
It was incorporated by Jobs and Wozniak as Apple Computer, Inc. in 1977. The company's second computer, the Apple II, became a best seller and one of the first mass-produced microcomputers.
Apple went public in 1980 to instant financial success."""

results = process(text, prompt)

print(results)

[{'span': 'omputer ', 'start': 28, 'end': 36, 'score': 0.90777117, 'Lavel': [{'entity_group': 'LABEL_23', 'score': 0.95195997, 'word': 'identify the following entity classes in the text : computer text : apple', 'start': 0, 'end': 73}, {'entity_group': 'LABEL_13', 'score': 0.45654166, 'word': 'was founded', 'start': 74, 'end': 85}, {'entity_group': 'LABEL_23', 'score': 0.38399115, 'word': 'as apple', 'start': 86, 'end': 94}, {'entity_group': 'LABEL_1', 'score': 0.90777117, 'word': 'computer', 'start': 95, 'end': 103}, {'entity_group': 'LABEL_23', 'score': 0.43455374, 'word': 'company on', 'start': 104, 'end': 114}, {'entity_group': 'LABEL_22', 'score': 0.6652792, 'word': 'april 1,', 'start': 115, 'end': 123}, {'entity_group': 'LABEL_23', 'score': 0.51163095, 'word': '1976, by steve wozniak, steve jobs ( 1955 – 2011 ) and', 'start': 124, 'end': 174}, {'entity_group': 'LABEL_13', 'score': 0.24162382, 'word': 'ronald', 'start': 175, 'end': 181}, {'entity_group': 'LABEL_23', 'score': 0.360

