In [3]:
!pip install google-generativeai



In [2]:
!pip install spacy_llm

Collecting spacy_llm
  Downloading spacy_llm-0.7.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading spacy_llm-0.7.3-py2.py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spacy_llm
Successfully installed spacy_llm-0.7.3


In [4]:
import os
import google.generativeai as genai
import spacy
import spacy_llm
from spacy_llm.registry import registry
from spacy.tokens import Doc
from typing import Iterable


In [6]:
from google.colab import userdata
gemini_key = userdata.get('GEMINI_API')

In [15]:
# Custom Gemini Model
class GeminiModel:
    def __init__(self, api_key: str):
        print(f"Initializing Gemini...")
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.5-flash-lite")
        print("Gemini ready")

    def __call__(self, prompts: Iterable[str]):
        responses = []
        for prompt in prompts:
            if isinstance(prompt, list):
                prompt = prompt[0]
            try:
                response = self.model.generate_content(prompt)
                responses.append(response.text.strip())
            except Exception as e:
                print(f"Error generating content: {e}")
                responses.append("Error generating response")
        return responses


In [16]:
# Register Gemini model
@registry.llm_models("custom.Gemini.v1")
def gemini_model(api_key: str):
    return GeminiModel(api_key)

# Custom Triplet Extraction Task
@registry.llm_tasks("custom.TripletExtraction.v1")
def make_triplet_task():
    return TripletExtractionTask()


In [17]:
class TripletExtractionTask:
    def __init__(self):
        if not Doc.has_extension("triplets"):
            Doc.set_extension("triplets", default=[])

        # Load SpaCy model for entity detection
        self.nlp_filter = spacy.load("en_core_web_sm")

        # Action verbs that indicate meaningful organizational relationships
        self.meaningful_verbs = {
            'funds', 'supports', 'partners', 'collaborates', 'establishes', 'creates',
            'develops', 'provides', 'delivers', 'launches', 'invests', 'awards',
            'trains', 'educates', 'connects', 'facilitates', 'enables', 'selects',
            'accelerates', 'incubates', 'scales', 'finances', 'grants', 'coordinates',
            'manages', 'operates', 'builds', 'implements', 'deploys', 'organizes'
        }

    def _has_meaningful_action(self, text):
        """Check if sentence contains meaningful action verbs"""
        doc = self.nlp_filter(text.lower())
        # Check if any meaningful verb is in the sentence
        for token in doc:
            if token.lemma_ in self.meaningful_verbs:
                return True
        return False

    def _has_entities(self, text):
        """Check if text has at least 2 relevant entities"""
        doc = self.nlp_filter(text)
        entities = [ent for ent in doc.ents if ent.label_ in ["ORG", "PERSON", "GPE", "NORP", "PRODUCT"]]
        return len(entities) >= 2

    def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[str]:
        prompts = []
        for doc in docs:
            # Process through nlp_filter
            processed_doc = self.nlp_filter(doc.text)
            sentences = [sent.text.strip() for sent in processed_doc.sents]

            # STRICT filtering: Must have entities AND meaningful action verbs
            candidate_sentences = [
                sent for sent in sentences
                if len(sent) > 40 and
                self._has_entities(sent) and
                self._has_meaningful_action(sent)
            ]

            if not candidate_sentences:
                prompts.append("No relationships found.")
                continue

            # Process up to 10 high-quality sentences
            filtered_text = " ".join(candidate_sentences[:10])

            # Enhanced prompt with better examples
            prompt = f"""Extract ONLY strategic organizational relationships that show ongoing institutional practices.

WHAT TO EXTRACT:
Focus ONLY on relationships that show:
- Funding/Investment: "X funds Y", "X invests in Y"
- Support/Acceleration: "X supports Y", "X accelerates Y"
- Partnership: "X partners with Y", "X collaborates with Y"
- Service Provision: "X provides services to Y", "X trains Y"
- Creation/Launch: "X launches Y", "X establishes Y"
- Selection/Awards: "X selects Y", "X awards Y"

WHAT TO IGNORE:
- Presentations, meetings, speeches ("presented to", "spoke at", "attended")
- Generic updates ("informed", "updated", "announced")
- Internal governance ("appointed", "elected", "thanked")
- Abstract/vague actions ("focus on", "work towards", "aim to")

RULES:
- "role_identity": The organization taking action (be specific: "EIT Digital", not just "EIT")
- "practice": ONE clear action verb or short verb phrase (2-4 words max)
- "counterrole": The specific recipient/beneficiary (not vague terms like "Europe" or "stakeholders")
- Split multiple counterroles into separate triplets

GOOD EXAMPLES:
"EIT InnoEnergy invested €2.5 million in cleantech startup Northvolt to scale battery production."
→ Role: EIT InnoEnergy
→ Practice: invests in
→ Counterrole: Northvolt

"EIT Digital Accelerator supports over 200 scaleups annually through mentorship and market access."
→ Role: EIT Digital Accelerator
→ Practice: supports
→ Counterrole: scaleups

"Climate-KIC partners with major European cities to develop climate adaptation strategies."
→ Role: Climate-KIC
→ Practice: partners with
→ Counterrole: European cities

"EIT Health awards funding to healthcare startups through the Headstart programme."
→ Role: EIT Health
→ Practice: awards funding to
→ Counterrole: healthcare startups

BAD EXAMPLES (DO NOT EXTRACT):
"The Chairman presented the strategy to the Board" ❌ (presentation, not strategic practice)
"EIT focuses on innovation in Europe" ❌ (too vague)
"The Director thanked the participants" ❌ (social courtesy, not institutional practice)

RETURN FORMAT:
Role: [specific organization]
Practice: [action verb phrase]
Counterrole: [specific recipient]
---

TEXT TO ANALYZE:
{filtered_text}

Extract ONLY strategic, ongoing organizational practices:"""
            prompts.append(prompt)
        return prompts

    def parse_responses(self, docs: Iterable[Doc], responses: Iterable[str]):
        docs_list = list(docs)
        responses_list = list(responses)

        # Expanded list of weak/reporting verbs to filter out
        weak_practices = {
            'presents', 'presented', 'presents to', 'present to', 'present',
            'announces', 'announced', 'announces to', 'inform', 'informs', 'informed',
            'updates', 'updated', 'update', 'speaks', 'spoke', 'speak',
            'attends', 'attended', 'attend', 'discusses', 'discussed', 'discuss',
            'meets', 'met', 'meet', 'thanks', 'thanked', 'thank',
            'appoints', 'appointed', 'appoint', 'elects', 'elected', 'elect',
            'is', 'are', 'was', 'were', 'be', 'being', 'been',
            'focuses', 'focused', 'focus', 'aims', 'aimed', 'aim',
            'explains', 'explained', 'explain', 'showcases', 'showcased',
            'assured', 'assures', 'bringing forward', 'making success'
        }

        # Generic/vague counterroles to filter
        vague_counterroles = {
            'europe', 'stakeholders', 'participants', 'audiences', 'parties',
            'community', 'public', 'society', 'world', 'people'
        }

        for doc, response in zip(docs_list, responses_list):
            if "No relationships found" in response or "Error generating" in response:
                doc._.triplets = []
                continue

            triplets = []
            current = {}

            for line in response.strip().split('\n'):
                line = line.strip()
                if line.startswith('Role:'):
                    current = {'role': line.replace('Role:', '').strip()}
                elif line.startswith('Practice:'):
                    current['practice'] = line.replace('Practice:', '').strip()
                elif line.startswith('Counterrole:'):
                    current['counterrole'] = line.replace('Counterrole:', '').strip()

                    # STRICT validation
                    if len(current) == 3:
                        practice_lower = current['practice'].lower().strip()
                        counterrole_lower = current['counterrole'].lower().strip()

                        # Filter out weak practices and vague counterroles
                        if (practice_lower not in weak_practices and
                            counterrole_lower not in vague_counterroles and
                            len(current['role']) > 2 and
                            len(current['counterrole']) > 2 and
                            len(current['practice']) > 2):
                            triplets.append(current)
                    current = {}

            doc._.triplets = triplets

        return iter(docs_list)

In [18]:
config = {
    "task": {"@llm_tasks": "custom.TripletExtraction.v1"},
    "model": {
        "@llm_models": "custom.Gemini.v1",
        "api_key": gemini_key
    },
    "validate_types": False
}


In [12]:
with open("/content/combined_with_dates.txt" ,"r") as file:
  text = file.read()

In [19]:
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.add_pipe("llm", config=config)

Initializing Gemini...
Gemini ready


<spacy_llm.pipeline.llm.LLMWrapper at 0x78ffc65f3610>

In [20]:
def chunk_text(text, chunk_size=2500):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size+300])
        chunks.append(chunk)
    return chunks

print("\nFiltering for high-quality sentences with meaningful actions...")
text_chunks = chunk_text(text, chunk_size=2500)
print(f"Created {len(text_chunks)} chunks to process\n")



Filtering for high-quality sentences with meaningful actions...
Created 30 chunks to process



In [21]:
# Process chunks
all_triplets = []
for i, chunk in enumerate(text_chunks, 1):
    print(f"Processing chunk {i}/{len(text_chunks)}...", end=" ")
    try:
        doc = nlp(chunk)
        chunk_triplets = doc._.triplets
        all_triplets.extend(chunk_triplets)
        print(f"Found {len(chunk_triplets)} quality triplets")
    except Exception as e:
        print(f"✗ Error: {e}")
        continue

# Display results
print("\n" + "=" * 80)
print("EXTRACTED STRATEGIC ORGANIZATIONAL TRIPLETS")
print("=" * 80)

for i, t in enumerate(all_triplets, 1):
    print(f"\n{i}.")
    print(f"  Role Identity:  {t['role']}")
    print(f"  Practice:       {t['practice']}")
    print(f"  Counterrole:    {t['counterrole']}")

print("\n" + "=" * 80)
print(f"TOTAL HIGH-QUALITY TRIPLETS: {len(all_triplets)}")
print("=" * 80)

# Save outputs
with open("/content/extracted_triplets.txt", "w") as f:
    for i, t in enumerate(all_triplets, 1):
        f.write(f"{i}. {t['role']} -> {t['practice']} -> {t['counterrole']}\n")

print("\n Files saved:")
print("   - extracted_triplets.txt")

Processing chunk 1/30... Found 0 quality triplets
Processing chunk 2/30... Found 0 quality triplets
Processing chunk 3/30... Found 0 quality triplets
Processing chunk 4/30... Found 2 quality triplets
Processing chunk 5/30... Found 6 quality triplets
Processing chunk 6/30... Found 6 quality triplets
Processing chunk 7/30... Found 3 quality triplets
Processing chunk 8/30... Found 0 quality triplets
Processing chunk 9/30... Found 0 quality triplets
Processing chunk 10/30... Found 0 quality triplets
Processing chunk 11/30... Found 3 quality triplets
Processing chunk 12/30... Found 0 quality triplets
Processing chunk 13/30... Found 0 quality triplets
Processing chunk 14/30... Found 0 quality triplets
Processing chunk 15/30... Found 0 quality triplets
Processing chunk 16/30... 



Error generating content: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 15
Please retry in 14.027285886s.
Found 0 quality triplets
Processing chunk 17/30... Found 14 quality triplets
Processing chunk 18/30... Found 5 quality triplets
Processing chunk 19/30... Found 0 quality triplets
Processing chunk 20/30... Found 0 quality triplets
Processing chunk 21/30... Found 0 quality triplets
Processing chunk 22/30... Found 0 quality triplets
Processing chunk 23/30... Found 0 quality triplets
Processing chunk 24/30... Found 0 quality triplets
Processing chunk 25/30... Found 0 quality triplets
Processing chunk 26/30... Found 0 qua