In [1]:
"""
This notebook scrapes EIT (European Institute of Innovation & Technology) news articles and uses a Large Language Model (LLM) to extract triples.

Goal: Generate structured triplets capturing organizational relationships: (Role Identity -> Practice -> Counterrole)

Workflow:
 1. Collect and scrape EIT news articles by year (2008–2025)
 2. Save all raw text content into organized folders
 3. Clean and combine article texts into a single dataset
 4. Use an LLM (via HuggingFace inference API) to extract semantic triplets
 5. Store and print results as JSON objects for later analysis

"""

'\nThis notebook scrapes EIT (European Institute of Innovation & Technology) news articles and uses a Large Language Model (LLM) to extract triples.\n\nGoal: Generate structured triplets capturing organizational relationships: (Role Identity -> Practice -> Counterrole) \n\nWorkflow: \n 1. Collect and scrape EIT news articles by year (2008–2025)\n 2. Save all raw text content into organized folders\n 3. Clean and combine article texts into a single dataset\n 4. Use an LLM (via HuggingFace inference API) to extract semantic triplets\n 5. Store and print results as JSON objects for later analysis\n\n'

In [2]:
import os
from openai import OpenAI
import requests
from bs4 import BeautifulSoup
import time
from google.colab import userdata
from urllib.parse import urljoin, urlparse
import transformers
import torch
import re
from pathlib import Path
import shutil

In [3]:
url = "https://www.eit.europa.eu/news-events/news?f%5B0%5D=kic%3A11&f%5B1%5D=kic%3A12&f%5B2%5D=kic%3A13&f%5B3%5D=kic%3A14&f%5B4%5D=kic%3A111&f%5B5%5D=kic%3A112&f%5B6%5D=kic%3A113&f%5B7%5D=kic%3A165&f%5B8%5D=kic%3A247&f%5B9%5D=kic%3A248&f%5B10%5D=kic%3A1172&f%5B11%5D=year%3A2008&f%5B12%5D=year%3A2009&f%5B13%5D=year%3A2010&f%5B14%5D=year%3A2011&f%5B15%5D=year%3A2012&f%5B16%5D=year%3A2013&f%5B17%5D=year%3A2014&f%5B18%5D=year%3A2015&f%5B19%5D=year%3A2016&f%5B20%5D=year%3A2017&f%5B21%5D=year%3A2018&f%5B22%5D=year%3A2019&f%5B23%5D=year%3A2020&f%5B24%5D=year%3A2021&f%5B25%5D=year%3A2022&f%5B26%5D=year%3A2023&f%5B27%5D=year%3A2024&f%5B28%5D=year%3A2025"

In [4]:
url = url
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"} # Set User-Agent
response = requests.get(url, headers=headers) # Send a GET Request
print(response.status_code) # HTTP status code: 200

200


In [5]:
# Set your token here
hf_token = userdata.get('HF_TOKEN4')
os.environ["HF_TOKEN"] = hf_token

In [6]:
# Initialize the client
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key= hf_token # Your HF token
    )

In [7]:
# This script scrapes EIT (European Institute of Innovation & Technology) news articles
# Collect all news article links for each year (2008–2025)
# Visit each link, extract the article text and save it into output folder

BASE_URL = "https://www.eit.europa.eu"
FILTERS = (
    "f%5B0%5D=kic%3A11&f%5B1%5D=kic%3A12&f%5B2%5D=kic%3A13&f%5B3%5D=kic%3A14&"
    "f%5B4%5D=kic%3A111&f%5B5%5D=kic%3A112&f%5B6%5D=kic%3A113&f%5B7%5D=kic%3A165&"
    "f%5B8%5D=kic%3A247&f%5B9%5D=kic%3A248&f%5B10%5D=kic%3A1172"
)
HEADERS = {"User-Agent": "Mozilla/5.0"}
OUTPUT_DIR = Path("output")
YEARS = list(range(2008, 2026))

all_links_by_year = {}

for year in YEARS:
    print(f"\n Collecting links for year {year}")
    article_links = []
    page = 0

    while True:
        url = f"https://www.eit.europa.eu/news-events/news?{FILTERS}&f%5B{11+(year-2008)}%5D=year%3A{year}&page={page}"
        print(f" Page {page}: {url}")

        try:
            response = requests.get(url, headers=HEADERS)
            if response.status_code != 200:
                print(f"Failed to fetch page {page}")
                break
        except Exception as e:
            print(f"Exception: {e}")
            break

        soup = BeautifulSoup(response.text, "html.parser")


        links_found = soup.find_all("a", href=True)
        page_article_count = 0

        for link in links_found:
            href = link.get("href")
            if href and "/news/" in href:  # Only get news article links
                if href.startswith("/"):
                    article_url = BASE_URL + href
                else:
                    article_url = href

                if article_url not in article_links:  # Avoid duplicates
                    article_links.append(article_url)
                    page_article_count += 1

        print(f"Found {page_article_count} new articles on page {page}")

        # Stop if no articles found on this page
        if page_article_count == 0:
            print(f"No more articles on page {page}")
            break

        page += 1
        time.sleep(5.0)

    all_links_by_year[year] = tuple(article_links)
    print(f"Collected {len(article_links)} links for {year}")

# Scrape articles using stored tuples
for year in YEARS:
    links = all_links_by_year[year]
    year_dir = OUTPUT_DIR / str(year)
    year_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nScraping {len(links)} articles for {year}")
    for i, link in enumerate(links, start=1):
        print(f"[{year}] Article {i}: {link}")

        try:
            resp = requests.get(link, headers=HEADERS)
            if resp.status_code != 200:
                print(f"Failed to fetch article: {resp.status_code}")
                continue
        except Exception as e:
            print(f"Exception: {e}")
            continue

        soup = BeautifulSoup(resp.text, "html.parser")
        content_div = soup.select_one("div.node__content") or soup.select_one("article")

        if content_div:
            text = content_div.get_text(separator="\n", strip=True)
        else:
            text = "No content found."

        filename = year_dir / f"news_{i}.txt"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"Saved: {filename}")

        time.sleep(5.0)


 Collecting links for year 2008
 Page 0: https://www.eit.europa.eu/news-events/news?f%5B0%5D=kic%3A11&f%5B1%5D=kic%3A12&f%5B2%5D=kic%3A13&f%5B3%5D=kic%3A14&f%5B4%5D=kic%3A111&f%5B5%5D=kic%3A112&f%5B6%5D=kic%3A113&f%5B7%5D=kic%3A165&f%5B8%5D=kic%3A247&f%5B9%5D=kic%3A248&f%5B10%5D=kic%3A1172&f%5B11%5D=year%3A2008&page=0
Found 6 new articles on page 0
 Page 1: https://www.eit.europa.eu/news-events/news?f%5B0%5D=kic%3A11&f%5B1%5D=kic%3A12&f%5B2%5D=kic%3A13&f%5B3%5D=kic%3A14&f%5B4%5D=kic%3A111&f%5B5%5D=kic%3A112&f%5B6%5D=kic%3A113&f%5B7%5D=kic%3A165&f%5B8%5D=kic%3A247&f%5B9%5D=kic%3A248&f%5B10%5D=kic%3A1172&f%5B11%5D=year%3A2008&page=1
Found 0 new articles on page 1
No more articles on page 1
Collected 6 links for 2008

 Collecting links for year 2009
 Page 0: https://www.eit.europa.eu/news-events/news?f%5B0%5D=kic%3A11&f%5B1%5D=kic%3A12&f%5B2%5D=kic%3A13&f%5B3%5D=kic%3A14&f%5B4%5D=kic%3A111&f%5B5%5D=kic%3A112&f%5B6%5D=kic%3A113&f%5B7%5D=kic%3A165&f%5B8%5D=kic%3A247&f%5B9%5D=kic%3A248&f%5B

KeyboardInterrupt: 

In [None]:
# Zip the entire output folder
shutil.make_archive("output", 'zip', "output")

# Download the zip file
from google.colab import files
files.download("output.zip")

In [None]:
'path = "/content/output"

year_folders = os.listdir(path)

for year_folder in year_folders:
  year_folder_path = f"{path}/{year_folder}"

  if not os.listdir(year_folder_path):
    continue

  for file_name in os.listdir(year_folder_path):
    if not file_name.endswith(".txt"):
      continue

    text_file_path = f"{path}/{year_folder}/{file_name}"

    with open(text_file_path, "r") as f:
      content = f.read()'


    prompt = f"""
    Extract semantic triplets that show relationships between an organization and external actors.

    ACADEMIC METHODOLOGY:
    1. Identify sentences that communicate relationships between at least one organizational actor and one external actor
    2. Convert all relationships to active voice
    3. Focus on role identity-practice-counterrole patterns

    RULES FOR EXTRACTION:
    1. **Role Identity**: The organizational actor (can be implicit "we/our organization" or explicit)
    2. **Practice**: The action/verb that creates a link (what the organization DOES)
    3. **Counterrole**: The external actor who receives or is affected by the practice
    4. **Date**: Mention the date of the News article. If no date available then fill NA


    EXAMPLE PATTERNS:
    - "The university teaches students" → Role: university, Practice: teaches, Counterrole: students
    - "We support entrepreneurs" → Role: organization, Practice: support, Counterrole: entrepreneurs
    - "The city provides services to residents" → Role: city, Practice: provides services, Counterrole: residents

    FOCUS ON:
    - Actions that show organizational relationships
    - Services, support, regulation, collaboration, provision
    - Clear organizational role performance toward constituencies

    EXTRACT ONLY:
    - Relationships where the organization is the actor (not recipient)
    - Concrete actions (not abstract concepts)
    - Identifiable counterroles (specific groups/categories)

    Return as JSON array with objects containing:
    - "role_identity": the organizational role being enacted
    - "practice": the specific action/service/relationship
    - "counterrole": the recipient/target of the practice
    - "sentence": the original sentence (for validation)

    TEXT TO ANALYZE:
    {content}

    JSON OUTPUT:
    """

    completion = client.chat.completions.create(
    model="openai/gpt-oss-120b", # Chosen model for triplet extraction
    messages=[ { "role": "user", "content": prompt }],
    temperature=0.1)

    print(str(completion.choices[0].message.content) + "\n")

In [None]:
# This script merges all scraped article text files into a single file (combined_output.txt)

import os

path = "/content/output"
output_file_path = "/content/combined_output.txt"

year_folders = os.listdir(path)

with open(output_file_path, "w") as output_file:
    for year_folder in year_folders:
        year_folder_path = os.path.join(path, year_folder)

        if not os.path.isdir(year_folder_path):
            continue

        files = os.listdir(year_folder_path)
        if not files:
            continue

        for file_name in files:
            if not file_name.endswith(".txt"):
                continue

            text_file_path = os.path.join(year_folder_path, file_name)

            with open(text_file_path, "r") as f:
                content = f.read()
                output_file.write(f"\n--- {year_folder}/{file_name} ---\n")
                output_file.write(content)
                output_file.write("\n")


In [None]:
"""
This script cleans the combined news text file by:
1. Removing boilerplate phrases (social media links, press release notes, etc.)
2. Removing empty lines
3. Removing very short lines (< 25 characters)
The cleaned text is saved into combined_output_cleaned.txt
"""

import re

input_file = "combined_output.txt"
output_file = "combined_output_cleaned.txt"

with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

cleaned_lines = []
boilerplate_keywords = [
    "Find us on Facebook", "Subscribe to the EIT Newsletter",
    "Follow us on Twitter", "Linked in", "YouTube", "Instagram",
    "Image", "Catch up with the latest news", "Flickr album",
    "For photos from the event:", "Download the press release"
]

for line in lines:
    # Skip boilerplate
    if any(bad in line for bad in boilerplate_keywords):
        continue
    # Skip empty lines
    if len(line.strip()) == 0:
        continue
    # Skip very short lines
    if len(line.strip()) < 25:
        continue
    cleaned_lines.append(line)

with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(cleaned_lines)

print(f"Original lines: {len(lines)}, Cleaned lines: {len(cleaned_lines)}")


In [9]:
"""
This script sends the cleaned news text to an LLM for semantic triple extraction.

Process:
1. Load the cleaned combined text (combined_output_cleaned.txt)
2. Construct a prompt with academic methodology and rules for extracting role practice counterrole patterns
3. Pass the prompt (first 2000 characters) to the model via Hugging Face API
4. Print the JSON-formatted triples returned by the model
"""

with open("combined_output_cleaned.txt", "r", encoding="utf-8") as f:
    content = f.readlines()


prompt = f"""
Extract semantic triplets that show relationships between an organization and external actors.

ACADEMIC METHODOLOGY:
1. Identify sentences that communicate relationships between at least one organizational actor and one external actor
2. Convert all relationships to active voice
3. Focus on role identity–practice–counterrole patterns

RULES FOR EXTRACTION:
- "role_identity": The organizational actor (explicit or implicit "we/our organization")
- "practice": Use a single, concise verb phrase that expresses ONE clear action.
   * Do not include long compound descriptions.
   * If a sentence contains multiple actions, split them into multiple triplets.
- "counterrole": The external actor who receives or is affected by the practice.
   * If multiple counterroles are mentioned (e.g., "researchers and businesses"),
     create a separate triplet for each.
   * Do not merge multiple counterroles into one field.
- Keep all practices in active voice.

EXAMPLES:
Sentence: "The university teaches students and supports faculty."
Triplets:
  {{"role_identity": "university", "practice": "teaches", "counterrole": "students", "sentence": "..."}}
  {{"role_identity": "university", "practice": "supports", "counterrole": "faculty", "sentence": "..."}}

Sentence: "The AI-on-Demand Platform serves researchers and businesses."
Triplets:
  {{"role_identity": "AI-on-Demand Platform", "practice": "serves", "counterrole": "researchers", "sentence": "..."}}
  {{"role_identity": "AI-on-Demand Platform", "practice": "serves", "counterrole": "businesses", "sentence": "..."}}

Sentence: "EIT Climate-KIC launches a forestry innovation programme to deliver European mitigation targets."
Triplet:
  {{"role_identity": "EIT Climate-KIC", "practice": "launches forestry innovation programme", "counterrole": "European mitigation targets", "sentence": "..."}}

FOCUS ON:
- Short, simple practices
- Separate triplets for each counterrole
- Concrete organizational actions (not abstract concepts)
- Relationships where the organization is the actor

RETURN FORMAT:
JSON array of objects with keys:
- "role_identity"
- "practice"
- "counterrole"
- "sentence" (original sentence for validation)

TEXT TO ANALYZE:
{content[:2000]}

JSON OUTPUT:
"""


# prompt = f"""
# Extract semantic triplets that show relationships between an organization and external actors.

# ACADEMIC METHODOLOGY:
# 1. Identify sentences that communicate relationships between at least one organizational actor and one external actor
# 2. Convert all relationships to active voice
# 3. Focus on role identity-practice-counterrole patterns

#         RULES FOR EXTRACTION:
#         1. **Role Identity**: The organizational actor (can be implicit "we/our organization" or explicit)
#         2. **Practice**: The action/verb that creates a link (what the organization DOES)
#         3. **Counterrole**: The external actor who receives or is affected by the practice
#         4. **Date**: Mention the date of the News article. If no date available then fill NA


#         EXAMPLE PATTERNS:
#         - "The university teaches students" → Role: university, Practice: teaches, Counterrole: students
#         - "We support entrepreneurs" → Role: organization, Practice: support, Counterrole: entrepreneurs
#         - "The city provides services to residents" → Role: city, Practice: provides services, Counterrole: residents

#         FOCUS ON:
#         - Actions that show organizational relationships
#         - Services, support, regulation, collaboration, provision
#         - Clear organizational role performance toward constituencies

#         EXTRACT ONLY:
#         - Relationships where the organization is the actor (not recipient)
#         - Concrete actions (not abstract concepts)
#         - Identifiable counterroles (specific groups/categories)

#         Return as JSON array with objects containing:
#         - "role_identity": the organizational role being enacted
#         - "practice": the specific action/service/relationship
#         - "counterrole": the recipient/target of the practice
#         - "sentence": the original sentence (for validation)

#         TEXT TO ANALYZE:
#         {content[:2000]}

#         JSON OUTPUT:
#         """

completion = client.chat.completions.create(
model="openai/gpt-oss-120b", # Chosen model for triplet extraction
messages=[ { "role": "user", "content": prompt }],
temperature=0.1)

print(str(completion.choices[0].message.content) + "\n")

[
  {
    "role_identity": "Baxter",
    "practice": "sells",
    "counterrole": "hospitals",
    "sentence": "Baxter operates a global manufacturing network and its products are sold in more than 100 countries around the world, and they are used in a variety of care settings, including hospitals."
  },
  {
    "role_identity": "Baxter",
    "practice": "sells",
    "counterrole": "kidney dialysis centres",
    "sentence": "Baxter operates a global manufacturing network and its products are sold in more than 100 countries around the world, and they are used in a variety of care settings, including ... kidney dialysis centres ..."
  },
  {
    "role_identity": "Baxter",
    "practice": "sells",
    "counterrole": "nursing homes",
    "sentence": "Baxter operates a global manufacturing network and its products are sold in more than 100 countries around the world, and they are used in a variety of care settings, including ... nursing homes ..."
  },
  {
    "role_identity": "Baxter",
    