In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer, util
import torch, json
import re
import time
from tqdm import tqdm

In [9]:
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [12]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,)

Device set to use cuda:0


In [13]:
with open("clean_text", "r", encoding="utf-8") as f:
    text = f.read()

text = re.sub(r"\s+", " ", text).strip()

In [14]:
meaningful_verbs = {
    "fund", "finance", "grant", "invest", "support", "sponsor", "award",
    "partner", "collaborate", "work", "team", "ally", "join",
    "create", "launch", "develop", "establish", "build", "initiate",
    "set up", "start", "found", "open", "introduce",
    "provide", "offer", "deliver", "train", "educate", "enable",
    "accelerate", "incubate", "scale", "facilitate", "connect",
    "select", "mentor", "coach", "coordinate", "manage",
    "operate", "organize", "implement", "run", "mobilize",
    "campaign", "advocate", "host", "innovate", "research", "pilot", "test"
}
verbs_list = ", ".join(sorted(meaningful_verbs))

In [15]:
model_name = "sentence-transformers/all-mpnet-base-v2"
embedder = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
canonical_verbs = sorted(list(meaningful_verbs))
canonical_embeddings = embedder.encode(
    canonical_verbs,
    convert_to_tensor=True,
    normalize_embeddings=True
)

In [17]:
def normalize_verb(phrase, threshold = 0.3):

    new_embedding = embedder.encode(
        phrase,
        convert_to_tensor=True,
        normalize_embeddings=True
    )

    scores = util.cos_sim(new_embedding, canonical_embeddings)

    best_idx = torch.argmax(scores).item()
    best_verb = canonical_verbs[best_idx]
    best_score = scores[0][best_idx].item()


    if best_score < threshold:
      return None, best_score

    return best_verb, best_score

In [18]:
# phrase = "NetApp guided the young enterprenuers to start a tech start-up"
# best_verb, best_score = normalize_verb(phrase)

# print("Original:", phrase)
# print("Normalized:", best_verb)
# print("Similarity score:", best_score * 100)


In [24]:
def chunk_text(text, max_words=900):
    chunks = []
    words = text.split()
    for i in range(0, len(words), max_words):
        chunk = " ".join(words[i:i + max_words])
        chunks.append(chunk)
    return chunks

In [27]:
with open("clean_text", "r", encoding="utf-8") as f:
    clean_text = f.read()

chunks = chunk_text(clean_text, max_words=900)

In [25]:
def make_prompt(text_chunk):
    return f"""
You are an expert in extracting organizational relationship triples.

Focus ONLY on verbs related to institutional actions like:
{', '.join(canonical_verbs)}

Each triple must follow this format:
Role: [organization taking action]
Practice: [main institutional action verb — must come from the above list or its semantic equivalent]
Counterrole: [partner or recipient organization]
Context: [short quote from the text supporting the relation]

Ignore vague or non-relational verbs (e.g. "discusses", "mentions", "focuses on").

TEXT:
{text_chunk}
"""

In [22]:
output_path = "eit_triples_local.json"
all_triples = []

In [29]:
for i, chunk in enumerate(tqdm(chunks, desc="Generating triples")):
    prompt = make_prompt(chunk)

    try:
        result = pipe(
            prompt,
            max_new_tokens=400,
            temperature=0.3,
            top_p=0.9,
            return_full_text=False
        )

        generated = result[0]["generated_text"].strip()

        normalized_output = []
        for line in generated.splitlines():
            if line.lower().startswith("practice:"):
                verb_phrase = line.split(":", 1)[1].strip()
                normalized = normalize_verb(verb_phrase)
                if normalized:
                    line = f"Practice: {normalized}"
            normalized_output.append(line)

        normalized_text = "\n".join(normalized_output)

        all_triples.append({"chunk_id": i + 1, "text": normalized_text})

        if i % 3 == 0:
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(all_triples, f, ensure_ascii=False, indent=2)

        time.sleep(0.5)

    except Exception as e:
        print(f"Error in chunk {i + 1}: {e}")
        continue

Generating triples:   0%|          | 0/82 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating triples:   1%|          | 1/82 [00:14<19:51, 14.71s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating triples:   2%|▏         | 2/82 [00:29<19:34, 14.68s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating triples:   4%|▎         | 3/82 [00:43<19:15, 14.63s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating triples:   5%|▍         | 4/82 [00:58<19:02, 14.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating triples:   6%|▌         | 5/82 [01:13<18:46, 14.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating triples:   7%|▋         | 6/82 [01:27<18:33, 14.66s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Generating triples:   9%|▊         | 7/82 [01:42<18:19, 14.65s/it]Sett

In [30]:
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(all_triples, f, ensure_ascii=False, indent=2)

In [34]:
import spacy

try:
    nlp = spacy.load("en_core_web_trf")
    print("Loaded spaCy transformer model (en_core_web_trf).")
except OSError:
    nlp = spacy.load("en_core_web_sm")
    print("Transformer model not found so using en_core_web_sm instead")

Transformer model not found so using en_core_web_sm instead


In [35]:
with open("eit_triples_local.json", "r", encoding="utf-8") as f:
    raw_triples = json.load(f)

In [36]:
triple_pattern = re.compile(
    r"Role:\s*(.+?)\s*"
    r"Practice:\s*(.+?)\s*"
    r"Counterrole:\s*(.+?)\s*"
    r"Context:\s*(.+?)(?=Role:|$)",
    re.DOTALL | re.IGNORECASE
)

In [37]:
rich_triples = []

for entry in tqdm(raw_triples, desc="Enriching triples"):
    text = entry.get("text", "")
    matches = triple_pattern.findall(text)

    for role, practice, counterrole, context in matches:
        context = context.strip()

        # spacy NER on the context
        doc = nlp(context)
        ner_data = [
            {"text": ent.text, "label": ent.label_}
            for ent in doc.ents
            if ent.label_ in {"ORG", "INSTITUTION", "GPE"}
        ]

        # Clean up whitespace
        rich_triples.append({
            "role": role.strip(),
            "practice": practice.strip(),
            "counterrole": counterrole.strip(),
            "context": context,
            "ner": ner_data
        })


Enriching triples: 100%|██████████| 82/82 [00:03<00:00, 21.01it/s]


In [38]:
output_path = "eit_rich_triples.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(rich_triples, f, ensure_ascii=False, indent=2)

print(f"Created {len(rich_triples)} rich triples with NER.")
print(f"Saved to {output_path}")

Created 386 rich triples with NER.
Saved to eit_rich_triples.json


In [49]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("CEO of EIT Digital A central element of this partnership is the launch of the National Dual-Use Technology Accelerator")
spacy.displacy.serve(doc, style="dep")


Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
