In [3]:
!pip install spacy_llm

Collecting spacy_llm
  Downloading spacy_llm-0.7.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading spacy_llm-0.7.3-py2.py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy_llm
Successfully installed spacy_llm-0.7.3


In [4]:
import spacy
import spacy_llm
from spacy_llm.registry import registry
from spacy.tokens import Doc
from typing import Iterable
from transformers import pipeline
import torch

In [5]:
# Register the custom Mistral model with spaCy's registry
# This allows spaCy to find and use our custom model by name

@registry.llm_models("custom.LocalMistral.v1")
def local_mistral_model(model_path):
    """
    Build the model object registered under "custom.LocalMistral.v1".

    Args: model_path: Path to the Mistral model on disk; forwarded unchanged
        to the LocalMistralModel constructor

    Returns: The newly constructed LocalMistralModel instance
    """
    mistral_model = LocalMistralModel(model_path)
    return mistral_model


In [None]:
class LocalMistralModel:
    """
    Wrapper for local Mistral model that works with spaCy LLMs.

    The Hugging Face pipeline is loaded once in the constructor; calling the
    instance runs each prompt through it and returns the generated text.
    """

    # Upper bound on newly generated tokens per prompt. Kept as a class
    # attribute so it can be overridden on an instance or in a subclass.
    max_new_tokens = 256

    def __init__(self, model_path):
        """Load Mistral model from local path using Hugging Face pipeline.

        Args:
            model_path: Passed as ``model`` to ``transformers.pipeline``.
        """
        print(f"Loading Mistral from {model_path}...")
        self.pipe = pipeline(
            "text-generation",
            model=model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )

    def __call__(self, prompts: Iterable[str]):
        """
        Generate responses for given prompts.

        Args:
            prompts: Iterable of prompt strings. An item may also be a list or
                tuple of prompts, in which case only its first element is used
                (an empty list/tuple yields an empty response).

        Returns:
            A list with one stripped response string per item in ``prompts``,
            containing only the generated text without the original prompt.
        """
        # NOTE(review): the return value is a flat list of strings, as before.
        # Confirm against the installed spacy-llm version whether the pipe
        # expects one response per prompt or a nested iterable per document.
        responses = []
        for prompt in prompts:
            if isinstance(prompt, (list, tuple)):  # Handle nested prompts from spacy-llm
                if not prompt:
                    # Nothing to send to the model; keep responses aligned with prompts.
                    responses.append("")
                    continue
                prompt = prompt[0]
            # return_full_text=False makes the pipeline return only the newly
            # generated text, so the prompt no longer has to be sliced off by
            # character count (which breaks if the echoed prompt differs from
            # the input string).
            output = self.pipe(
                prompt,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,
                return_full_text=False,
            )
            responses.append(output[0]["generated_text"].strip())
        return responses

In [None]:
# Register triplet task
@registry.llm_tasks("custom.TripletExtraction.v1")
def make_triplet_task():
    """Factory registered as "custom.TripletExtraction.v1"; returns a new TripletExtractionTask."""
    triplet_task = TripletExtractionTask()
    return triplet_task

class TripletExtractionTask:
    """
    Task that asks the model for role / practice / counterrole triplets.

    One prompt is built per doc from ``doc.text``. Parsed triplets are stored
    on ``doc._.triplets`` as a list of dicts with the keys ``role``,
    ``practice`` and ``counterrole``.
    """

    # Line prefix expected in the model response -> key used in the triplet dict.
    _FIELD_PREFIXES = (
        ("Role:", "role"),
        ("Practice:", "practice"),
        ("Counterrole:", "counterrole"),
    )

    def __init__(self):
        """Register the ``triplets`` Doc extension unless it already exists."""
        if not Doc.has_extension("triplets"):
            Doc.set_extension("triplets", default=[])

    def generate_prompts(self, docs: "Iterable[Doc]") -> Iterable[str]:
        """
        Build one extraction prompt per doc.

        Args:
            docs: Docs whose ``text`` is embedded in the prompt.

        Returns:
            A list of prompt strings, in the same order as ``docs``.
        """
        prompts = []
        for doc in docs:
            prompt = f"""Extract role-practice-counterrole triplets from this text.

IMPORTANT RULES:
- Role: Name of organization or actor (e.g., "EIT Digital", "Microsoft")
- Practice: SINGLE ACTION VERB ONLY (e.g., "helps", "funds", "collaborates", "supports")
- Counterrole: Name of partner or recipient (e.g., "MatchX", "startups")

Format each triplet EXACTLY like this:
Role: [organization name]
Practice: [single verb]
Counterrole: [partner name]

Text: {doc.text}

Triplets:"""
            prompts.append(prompt)
        return prompts

    def parse_responses(self, docs: "Iterable[Doc]", responses: Iterable[str]):
        """
        Parse model responses and store the triplets on the matching docs.

        Docs and responses are paired positionally; if one is shorter, the
        surplus items of the other are left unprocessed.

        Args:
            docs: Docs the prompts were generated from.
            responses: Raw response text, one string per doc.

        Returns:
            An iterator over all input docs, in input order.
        """
        docs_list = list(docs)
        responses_list = list(responses)

        for doc, response in zip(docs_list, responses_list):
            doc._.triplets = self._parse_triplets(response)

        return iter(docs_list)

    def _parse_triplets(self, response: str) -> list:
        """Turn one raw response into a list of complete triplet dicts.

        A ``Role:`` line starts a new triplet, ``Practice:`` fills it in and
        ``Counterrole:`` closes it. Triplets that are missing a field when
        they are closed are dropped, so every returned dict has all three keys.
        """
        triplets = []
        current = {}
        for raw_line in response.strip().split('\n'):
            # Tolerate indentation and trailing "\r" around the field label.
            line = raw_line.strip()
            for prefix, key in self._FIELD_PREFIXES:
                if not line.startswith(prefix):
                    continue
                # Slice off the label only; str.replace would also delete the
                # label text if it happens to occur inside the value.
                value = line[len(prefix):].strip()
                if key == "role":
                    current = {key: value}
                else:
                    current[key] = value
                if key == "counterrole":
                    if len(current) == len(self._FIELD_PREFIXES):
                        triplets.append(current)
                    current = {}
                break
        return triplets

In [None]:
# Start from a blank English pipeline; the "llm" component is added to it in a later cell.
nlp = spacy.blank("en")

In [None]:
# Local snapshot directory of the model weights. Kept as a single named
# constant so the pipeline can be pointed at a different checkout in one place.
MODEL_PATH = "/project/sgona/hf_cache/hub/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/0d4b76e1efeb5eb6f6b5e757c79870472e04bd3a"

# Settings for the "llm" pipeline component: the registered names select the
# custom task and model, and "model_path" is passed to the model factory.
config = {
    "task": {"@llm_tasks": "custom.TripletExtraction.v1"},
    "model": {
        "@llm_models": "custom.LocalMistral.v1",
        "model_path": MODEL_PATH,
    },
    "validate_types": False
}

# Adding a component whose name is already in the pipeline raises, so guard the
# call to keep this cell re-runnable. To apply a changed config, re-create
# `nlp` first and then run this cell again.
if "llm" not in nlp.pipe_names:
    nlp.add_pipe("llm", config=config)

#with open("/project/sgona/data/combined_output_cleaned.txt", "r") as file:
    #text = file.read()


In [None]:
# Input text that is passed through the pipeline in the next cell; left empty here.
text = ""

In [None]:
# Run the pipeline on the input text; the task stores its results on doc._.triplets.
doc = nlp(text)

print("Extracted Triplets:")
for t in doc._.triplets:
    # .get() keeps the report printing when a triplet dict lacks one of the
    # fields (the model does not always emit all three lines per triplet).
    role = t.get('role', 'N/A')
    practice = t.get('practice', 'N/A')
    counterrole = t.get('counterrole', 'N/A')
    print(f"Role: {role}\nPractice: {practice}\nCounterrole: {counterrole}\n")