<a href="https://colab.research.google.com/github/serdarildercaglar/my-colab-notebooks/blob/main/NuExtract_for_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install llama-cpp-python

In [None]:
from llama_cpp import Llama

# Which GGUF build to load: the repository id, plus a filename pattern (note
# the leading "*" wildcard) that selects the Q5_K_M file of NuExtract-tiny-v1.5.
MODEL_REPO_ID = "RichardErkhov/numind_-_NuExtract-tiny-v1.5-gguf"
MODEL_FILENAME = "*NuExtract-tiny-v1.5.Q5_K_M.gguf"

# Module-level model handle; verbose=False is passed to keep the loader quiet.
llm = Llama.from_pretrained(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME, verbose=False)

In [None]:
import json

def _locate_entities(source_text, entities_by_label):
    """Turn a parsed ``{label: [mention, ...]}`` mapping into span dicts for ``source_text``."""
    spans = []
    for label, mentions in entities_by_label.items():
        # Tolerate a model that answers with null or a bare string for a label
        # instead of the list the template asked for.
        if mentions is None:
            mentions = []
        elif isinstance(mentions, str):
            mentions = [mentions]
        for mention in mentions:
            start = source_text.find(mention)
            # A mention the model altered or invented is not in the text;
            # report -1/-1 rather than an `end` derived from start == -1.
            end = start + len(mention) if start != -1 else -1
            spans.append({"label": label, "text": mention, "start": start, "end": end})
    return spans


def predict(llm, texts, entities, batch_size=1, max_length=10_000, max_new_tokens=4_000):
    """Extract the requested entity labels from each text via a template prompt.

    For every input text a prompt is built from a JSON template (one key per
    entity label, each mapped to an empty list), sent to ``llm``, and the JSON
    found after the ``<|output|>`` marker is parsed into labelled spans.

    Args:
        llm: Callable invoked as ``llm(prompt, max_tokens=..., echo=True)``
            that returns a mapping whose ``["choices"][0]["text"]`` contains
            the prompt followed by the completion.
        texts: Iterable of input strings.
        entities: Iterable of entity labels used as the template keys.
        batch_size: Accepted for backward compatibility only. Prompts are
            sent one at a time and every text is always processed.
        max_length: Accepted for backward compatibility; not used.
        max_new_tokens: Forwarded to ``llm`` as ``max_tokens``.

    Returns:
        A list with one entry per input text, in input order. Each entry is a
        list of dicts with keys ``"label"``, ``"text"``, ``"start"`` and
        ``"end"``. ``start``/``end`` are the character offsets of the first
        occurrence of the entity string in that same text, or ``-1``/``-1``
        when the string does not occur verbatim in it.

    Raises:
        IndexError: If the returned text contains no ``<|output|>`` marker.
        json.JSONDecodeError: If the text after the marker is not valid JSON.
    """
    # Materialise once so generators work and the result lines up with the inputs.
    texts = list(texts)

    # Template: every requested label maps to an empty list, pretty-printed.
    template = json.dumps({label: [] for label in entities}, indent=4)

    outputs = []
    for source_text in texts:
        prompt = f"""<|input|>\n### Template:\n{template}\n### Text:\n{source_text}\n\n<|output|>"""
        response = llm(prompt, max_tokens=max_new_tokens, echo=True)
        # echo=True returns prompt + completion; keep what follows the marker.
        json_str = response["choices"][0]["text"].split("<|output|>")[1]
        # Offsets are computed against the text this prompt was built from,
        # never against a module-level variable.
        outputs.append(_locate_entities(source_text, json.loads(json_str)))
    return outputs

# Sample passage used to demonstrate predict().
text = """Libretto by Marius Petipa, based on the 1822 novella "Trilby, ou Le Lutin d'Argail" by Charles Nodier, first presented by the Ballet of the Moscow Imperial Bolshoi Theatre on January 25/February 6 (Julian/Gregorian calendar dates), 1870, in Moscow with Polina Karpakova as Trilby and Ludiia Geiten as Miranda and restaged by Petipa for the Imperial Ballet at the Imperial Bolshoi Kamenny Theatre on January 17–29, 1871 in St. Petersburg with Adèle Grantzow as Trilby and Lev Ivanov as Count Leopold."""
# Entity labels to extract; predict() uses each one as a key of the prompt template.
entities = ["person", "book", "location", "date", "female actor", "male actor", "character"]
# Last expression of the cell, so the notebook displays the returned list of spans.
predict(llm, [text], entities)


[[{'label': 'person', 'text': 'Polina Karpakova', 'start': 253, 'end': 269},
  {'label': 'person', 'text': 'Ludiia Geiten', 'start': 284, 'end': 297},
  {'label': 'person', 'text': 'Adélét Grantzow', 'start': -1, 'end': 14},
  {'label': 'person', 'text': 'Lev Ivanov', 'start': 471, 'end': 481},
  {'label': 'book',
   'text': "Trilby, ou Le Lutin d'Argail",
   'start': 54,
   'end': 82},
  {'label': 'book',
   'text': 'Libretto by Marius Petipa',
   'start': 0,
   'end': 25},
  {'label': 'location', 'text': 'Moscow', 'start': 140, 'end': 146},
  {'label': 'location', 'text': 'St. Petersburg', 'start': 422, 'end': 436},
  {'label': 'date',
   'text': 'January 25/February 6 (Julian/Gregorian calendar dates)',
   'start': 175,
   'end': 230},
  {'label': 'date',
   'text': 'January 17–29, 1871 in St. Petersburg',
   'start': 399,
   'end': 436},
  {'label': 'female actor',
   'text': 'Polina Karpakova',
   'start': 253,
   'end': 269},
  {'label': 'female actor', 'text': 'Ludiia Geiten', '