In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import re
import spacy
from spacy.pipeline import EntityRuler
import json

In [3]:
nlp = spacy.load("en_core_web_sm") # load English NLP model

In [4]:
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([
    {"label": "ORG", "pattern": "EIT"},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "digital"}]},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "health"}]},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "rawmaterials"}]},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "climate"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "kic"}]},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "urban"}, {"LOWER": "mobility"}]},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "innoenergy"}]},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "food"}]},
    {"label": "ORG", "pattern": [{"LOWER": "eit"}, {"LOWER": "manufacturing"}]},
])

In [5]:
# normalize org name variants and pronouns to one canonical form

ORG_MAP = {
  "eit": "EIT",
  "eit alumni": "EIT Alumni",
  "eitalumni": "EIT Alumni",
  "eit alumni board": "EIT Alumni Board",
  "eitalumniboard": "EIT Alumni Board",
  "eit alumni connect": "EIT Alumni CONNECT",
  "eitalumniconnect": "EIT Alumni CONNECT",
  "eit alumni president": "EIT Alumni President",
  "eitalumnipresident": "EIT Alumni President",
  "eit award": "EIT Award",
  "eitaward": "EIT Award",
  "eit chairman": "EIT Chairman",
  "eitchairman": "EIT Chairman",
  "eit change award": "EIT Change Award",
  "eitchangeaward": "EIT Change Award",
  "eit climate": "EIT Climate",
  "eitclimate": "EIT Climate",
  "eit climate kic": "EIT Climate-KIC",
  "eit climate-kic": "EIT Climate-KIC",
  "eitclimate-kic": "EIT Climate-KIC",
  "eit culture": "EIT Culture",
  "eitculture": "EIT Culture",
  "eit deep tech talent initiative": "EIT Deep Tech Talent Initiative",
  "eitdeeptechtalentinitiative": "EIT Deep Tech Talent Initiative",
  "eit deep tech talent initiative pledger": "EIT Deep Tech Talent Initiative Pledger",
  "eitdeeptechtalentinitiativepledger": "EIT Deep Tech Talent Initiative Pledger",
  "eit digital": "EIT Digital",
  "eitdigital": "EIT Digital",
  "eit digital accelerator": "EIT Digital Accelerator",
  "eitdigitalaccelerator": "EIT Digital Accelerator",
  "eit digital alumni": "EIT Digital Alumni",
  "eitdigitalalumni": "EIT Digital Alumni",
  "eit digital challenge": "EIT Digital Challenge",
  "eitdigitalchallenge": "EIT Digital Challenge",
  "eit digital master school": "EIT Digital Master School",
  "eitdigitalmasterschool": "EIT Digital Master School",
  "eit digital venture program": "EIT Digital Venture Program",
  "eitdigitalventureprogram": "EIT Digital Venture Program",
  "eit director": "EIT Director",
  "eitdirector": "EIT Director",
  "eit entrepreneurship award": "EIT Entrepreneurship Award",
  "eitentrepreneurshipaward": "EIT Entrepreneurship Award",
  "eit entrepreneurship seminar": "EIT Entrepreneurship Seminar",
  "eitentrepreneurshipseminar": "EIT Entrepreneurship Seminar",
  "eit fan": "EIT FAN",
  "eitfan": "EIT FAN",
  "eit food": "EIT Food",
  "eitfood": "EIT Food",
  "eit food accelerator network": "EIT Food Accelerator Network",
  "eitfoodacceleratornetwork": "EIT Food Accelerator Network",
  "eit girls go circular": "EIT Girls Go Circular",
  "eitgirlsgocircular": "EIT Girls Go Circular",
  "eit governing board": "EIT Governing Board",
  "eitgoverningboard": "EIT Governing Board",
  "eit hei initiative": "EIT HEI Initiative",
  "eitheiinitiative": "EIT HEI Initiative",
  "eit headquarters": "EIT Headquarters",
  "eitheadquarters": "EIT Headquarters",
  "eit health": "EIT Health",
  "eithealth": "EIT Health",
  "eit health accelerator": "EIT Health Accelerator",
  "eithealthaccelerator": "EIT Health Accelerator",
  "eit health bridgehead europe": "EIT Health Bridgehead Europe",
  "eithealthbridgeheadeurope": "EIT Health Bridgehead Europe",
  "eit health high value care forum": "EIT Health High Value Care Forum",
  "eithealthhighvaluecareforum": "EIT Health High Value Care Forum",
  "eit health hubs": "EIT Health Hubs",
  "eithealthhubs": "EIT Health Hubs",
  "eit health innostars": "EIT Health InnoStars",
  "eithealthinnostars": "EIT Health InnoStars",
  "eit health partners": "EIT Health Partners",
  "eithealthpartners": "EIT Health Partners",
  "eit health summer schools": "EIT Health Summer Schools",
  "eithealthsummerschools": "EIT Health Summer Schools",
  "eit health wild card": "EIT Health Wild Card",
  "eithealthwildcard": "EIT Health Wild Card",
  "eit health supported": "EIT Health-supported",
  "eit health-supported": "EIT Health-supported",
  "eithealth-supported": "EIT Health-supported",
  "eit higher education initiative": "EIT Higher Education Initiative",
  "eithighereducationinitiative": "EIT Higher Education Initiative",
  "eit house": "EIT House",
  "eithouse": "EIT House",
  "eit ict labs": "EIT ICT Labs",
  "eitictlabs": "EIT ICT Labs",
  "eit ict labs master school": "EIT ICT Labs Master School",
  "eitictlabsmasterschool": "EIT ICT Labs Master School",
  "eit ict labs summer schools": "EIT ICT Labs Summer Schools",
  "eitictlabssummerschools": "EIT ICT Labs Summer Schools",
  "eit information day": "EIT Information Day",
  "eitinformationday": "EIT Information Day",
  "eit innoenergy": "EIT InnoEnergy",
  "eitinnoenergy": "EIT InnoEnergy",
  "eit innoenergy master": "EIT InnoEnergy Master",
  "eitinnoenergymaster": "EIT InnoEnergy Master",
  "eit innoenergy scandinavia": "EIT InnoEnergy Scandinavia",
  "eitinnoenergyscandinavia": "EIT InnoEnergy Scandinavia",
  "eit innovation communities": "EIT Innovation Communities",
  "eitinnovationcommunities": "EIT Innovation Communities",
  "eit innovators award": "EIT Innovators Award",
  "eitinnovatorsaward": "EIT Innovators Award",
  "eit interim director": "EIT Interim Director",
  "eitinterimdirector": "EIT Interim Director",
  "eit jumpstarter": "EIT Jumpstarter",
  "eitjumpstarter": "EIT Jumpstarter",
  "eit kic": "EIT KIC",
  "eitkic": "EIT KIC",
  "eit kic factsheets": "EIT KIC Factsheets",
  "eitkicfactsheets": "EIT KIC Factsheets",
  "eit knowledge": "EIT Knowledge",
  "eitknowledge": "EIT Knowledge",
  "eit label": "EIT Label",
  "eitlabel": "EIT Label",
  "eit manufacturing": "EIT Manufacturing",
  "eitmanufacturing": "EIT Manufacturing",
  "eit model": "EIT Model",
  "eitmodel": "EIT Model",
  "eit public award": "EIT PUBLIC Award",
  "eitpublicaward": "EIT PUBLIC Award",
  "eit rib": "EIT RIB",
  "eitrib": "EIT RIB",
  "eit ris": "EIT RIS",
  "eitris": "EIT RIS",
  "eit raw materials": "EIT RawMaterials",
  "eit rawmaterials": "EIT RawMaterials",
  "eitrawmaterials": "EIT RawMaterials",
  "eit regional innovation booster": "EIT Regional Innovation Booster",
  "eitregionalinnovationbooster": "EIT Regional Innovation Booster",
  "eit regional innovation scheme": "EIT Regional Innovation Scheme",
  "eitregionalinnovationscheme": "EIT Regional Innovation Scheme",
  "eit regulation": "EIT Regulation",
  "eitregulation": "EIT Regulation",
  "eit roundtable": "EIT Roundtable",
  "eitroundtable": "EIT Roundtable",
  "eit stakeholder forum": "EIT Stakeholder Forum",
  "eitstakeholderforum": "EIT Stakeholder Forum",
  "eit strategic innovation agenda": "EIT Strategic Innovation Agenda",
  "eitstrategicinnovationagenda": "EIT Strategic Innovation Agenda",
  "eit supernovas": "EIT Supernovas",
  "eitsupernovas": "EIT Supernovas",
  "eit urban mobility": "EIT Urban Mobility",
  "eiturbanmobility": "EIT Urban Mobility",
  "eit urban mobility supported": "EIT Urban Mobility-supported",
  "eit urban mobility-supported": "EIT Urban Mobility-supported",
  "eiturbanmobility-supported": "EIT Urban Mobility-supported",
  "eit venture award": "EIT Venture Award",
  "eitventureaward": "EIT Venture Award",
  "eit water": "EIT Water",
  "eitwater": "EIT Water",
  "eit women leadership": "EIT Women Leadership",
  "eitwomenleadership": "EIT Women Leadership"
}

In [6]:
def clean_text_thoroughly(text):
    import re

    # Removes special characters
    text = re.sub(r'\[tweetable[^]]*\]', '', text)
    text = re.sub(r'\[/tweetable\]', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'["\\\n\r\t]', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [7]:
def normalize_entity(text):
    normalized = text.lower().strip()
    return ORG_MAP.get(normalized, text)

In [8]:
def get_entity_with_context(token, sent):
    """Expanding noun phrases and normalizing"""

    # Skip pronouns
    skip_words = {'that', 'which', 'who', 'this', 'we', 'it', 'they'}
    if token.text.lower() in skip_words:
        # Try to find nearby named entities in the sentence
        for ent in sent.ents:
            if ent.label_ in ["ORG"]:
                return normalize_entity(ent.text)
        return None

    # If it's part of a named entity, get the full entity
    if token.ent_type_:
        for ent in sent.ents:
            if token.i >= ent.start and token.i < ent.end:
                return normalize_entity(ent.text)

    expanded_text = expand_noun_phrase(token)

    # Normalize the expanded text
    normalized = normalize_entity(expanded_text)

    # Filter out common words
    if len(normalized) < 3 or normalized.lower() in {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}:
        return None

    return normalized

In [9]:
def expand_noun_phrase(token):
    start_idx = token.i
    end_idx = token.i + 1

    for left_token in token.lefts:
        if left_token.dep_ in ["compound", "amod", "det", "nummod"]:
            start_idx = min(start_idx, left_token.i)

    for right_token in token.rights:
        if right_token.dep_ in ["compound", "amod"]:
            end_idx = max(end_idx, right_token.i + 1)

    expanded = token.doc[start_idx:end_idx].text
    return expanded.strip()

In [10]:
def extract_triplets_integrated(text):

    doc = nlp(text)
    triplets = []

    # Define meaningful organizational verbs
    meaningful_verbs = {
        'support', 'provide', 'offer', 'create', 'develop', 'establish', 'fund',
        'manage', 'coordinate', 'collaborate', 'train', 'educate', 'connect',
        'enable', 'facilitate', 'invest', 'partner', 'award', 'grant', 'deliver',
        'serve', 'help', 'assist', 'foster', 'promote', 'build', 'strengthen'
    }

    # Process each sentence
    for sent in doc.sents:
        # Find all verbs
        for token in sent:
            if token.pos_ == "VERB" and token.lemma_.lower() in meaningful_verbs:

                subject = None
                obj = None

                # Find subject and object for this verb
                for child in token.children:
                    # Subject relationships
                    if child.dep_ in ["nsubj", "nsubjpass"]:
                        subject = get_entity_with_context(child, sent)
                    # Object relationships
                    elif child.dep_ in ["dobj"]:
                        obj = get_entity_with_context(child, sent)
                    # Prepositional objects
                    elif child.dep_ == "prep":
                        for grandchild in child.children:
                            if grandchild.dep_ == "pobj":
                                obj = get_entity_with_context(grandchild, sent)

                # If we found both subject and object, create triplet
                if subject and obj and subject != obj:
                    triplet = {
                        "role": subject,
                        "practice": token.lemma_,  # Using lemma for consistency
                        "counterrole": obj,
                        "context": sent.text.strip()
                    }
                    triplets.append(triplet)

    return triplets

In [11]:
import json

if __name__ == "__main__":

    data_path = "/content/combined_output_cleaned.txt"
    with open(data_path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    text = clean_text_thoroughly(text)
    rows = extract_triplets_integrated(text)

    out_path = "triples.json"

    # Save as JSON
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)

    print(f"\nSaved {len(rows)} triples to {out_path}")



Saved 603 triples to triples.json
