In [None]:
!pip install spacy



In [None]:
# Download spaCy's small English pipeline, which is loaded further down with
# spacy.load("en_core_web_sm"). The installer advises restarting the runtime
# afterwards if the freshly installed package cannot be loaded.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re
from openai import OpenAI
import os
import json
import gradio as gr
import requests
from bs4 import BeautifulSoup
import spacy
import pandas as pd
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from google.colab import userdata

In [2]:
# Page to scrape for each actor, keyed by the actor's display name.
# The key is interpolated into the LLM prompt, the value is the URL fetched.
website_list = {
    "EIT Health": "https://eithealth.eu/who-we-are/",
    "EIT Digital": "https://www.eitdigital.eu/our-community/purpose/",
    # TODO(check): the trailing "?" looks unintended - confirm this URL.
    "EIT Food": "https://www.eitfood.eu/about-us?",
    "Climate-KIC": "https://www.climate-kic.org/who-we-are/about-climate-kic/",
    "EIT RawMaterials": "https://eitrawmaterials.eu/about-us/our-mission",
    "EIT Culture & Creativity": "https://eit-culture-creativity.eu/about-us/",
    "EIT Urban Mobility": "https://www.eiturbanmobility.eu/who-we-are/about-us/",
    "EIT Manufacturing": "https://www.eitmanufacturing.eu/about-us/",
    "EIT InnoEnergy": "https://www.innoenergy.com/discover-innoenergy/",
}

In [None]:
# Request headers sent with every page fetch: a desktop-Chrome User-Agent
# string, split across lines only for readability.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

In [None]:
# Sanity check: confirm website_list is a dict before it is iterated with .items().
print(type(website_list))

<class 'dict'>


In [None]:
# Read the Hugging Face access token from Colab's `userdata` store (the entry
# must be named HF_TOKEN) rather than hardcoding it in the notebook, then
# export it as an environment variable for this process.
hf_token = userdata.get('HF_TOKEN')
os.environ["HF_TOKEN"] = hf_token

In [None]:
# OpenAI-compatible client pointed at the Hugging Face router endpoint and
# authenticated with the Hugging Face token read above.
client = OpenAI(base_url="https://router.huggingface.co/v1", api_key=hf_token)

In [None]:
# Load the small English spaCy pipeline; its named entities (DATE, GPE, LOC)
# are used below to collect candidate dates and locations from each page.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Two independent, initially empty accumulators.
# `results` receives one raw model reply per processed actor in the scraping loop.
# `actor_data` is not written to by any cell visible in this notebook.
actor_data, results = [], []

In [None]:
def _fetch_page_text(url):
    """Download ``url`` and return the text of its paragraph-like tags.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text of every ``p``, ``li``, ``h1``, ``h2``, ``h3`` and ``span``
        element, each stripped and joined with single spaces.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status (via ``raise_for_status``).
    """
    # NOTE(review): verify=False switches off TLS certificate validation for
    # these fetches. Kept because the notebook deliberately silences the
    # matching urllib3 warning, but the scraped text is then not authenticated.
    response = requests.get(url, headers=headers, timeout=15, verify=False)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # NOTE: find_all returns nested matches too, so the text of a <span> that
    # sits inside a <p> is contributed twice.
    return ' '.join(
        tag.get_text(separator=' ', strip=True)
        for tag in soup.find_all(['p', 'li', 'h1', 'h2', 'h3', 'span'])
    )


# Start from an empty list so that re-running this cell does not append a
# second copy of every reply.
results = []

for actor, url in website_list.items():
    # A single unreachable or failing site must not abort the whole run:
    # report it and move on to the next actor.
    try:
        page_text = _fetch_page_text(url)
    except requests.RequestException as exc:
        print(f"Skipping {actor}: could not fetch {url} ({exc})")
        continue

    # Named entities give the model a short list of candidates to choose from.
    doc = nlp(page_text)
    dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    locations = [ent.text for ent in doc.ents if ent.label_ in ["GPE", "LOC"]]

    # NOTE(review): task 3 asks for a role description, but the requested
    # return list has no slot for it - confirm which of the two is intended.
    prompt = f"""
    You are analyzing the About page of {actor}.

    Candidate dates: {dates}
    Candidate locations: {locations}

    Task:
    1. Identify the official launch/founding date of {actor}.
    2. Identify the primary location or headquarters of {actor}.
    3. Provide a short role description of what {actor} is.

    Return result as list:
    ["actor", "date_launched", "location"]

    Context text (may contain noise, but focus on dates/locations that answer the task):
    {page_text}
    """

    # temperature=0 keeps the answers as repeatable as the backend allows.
    completion = client.chat.completions.create(
        model="openai/gpt-oss-120b",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )

    # Keep the raw reply text; it is decoded into rows in a later cell.
    results.append(completion.choices[0].message.content)

In [None]:
# Show the raw model replies collected by the loop (one string per processed actor).
print(results)

['["EIT Health", "2015", "Munich, Germany"]', '["EIT Digital", "2010", "Europe"]', '["EIT Food", "2016", "Europe"]', '["Climate-KIC", "2010", "Europe"]', '["EIT RawMaterials", "2015", "Europe"]', '["EIT Culture & Creativity", "2023", "Cologne, Germany"]', '["EIT Urban Mobility", "2019", "Barcelona"]']


In [None]:
def _parse_model_reply(reply):
    """Extract and decode the JSON array contained in one model reply.

    A reply that is exactly a JSON array decodes as before. Text around the
    array (a Markdown code fence, an introductory sentence) is ignored, so one
    chatty reply does not abort the whole cell.

    Args:
        reply: Raw text returned by the chat model.

    Returns:
        The decoded JSON array as a Python list.

    Raises:
        ValueError: if the reply contains no bracketed array, or the bracketed
            text is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # Greedy match from the first "[" to the last "]" so nested arrays survive.
    found = re.search(r"\[.*\]", reply, flags=re.DOTALL)
    if found is None:
        raise ValueError(f"No JSON array found in model reply: {reply!r}")
    return json.loads(found.group(0))


# One decoded list per raw reply, in the same order as `results`.
rows = [_parse_model_reply(r) for r in results]

In [None]:
# Show the decoded rows (one list per actor) that feed the Gradio table below.
print(rows)

[['EIT Health', '2015', 'Munich, Germany'], ['EIT Digital', '2010', 'Europe'], ['EIT Food', '2016', 'Europe'], ['Climate-KIC', '2010', 'Europe'], ['EIT RawMaterials', '2015', 'Europe'], ['EIT Culture & Creativity', '2023', 'Cologne, Germany'], ['EIT Urban Mobility', '2019', 'Barcelona']]


In [None]:
# Column titles of the catalog table, in the order the values appear in each row.
catalog_headers = ["Actor", "Date Launched", "Location"]

# Minimal Gradio app: a heading plus a table showing the decoded rows.
with gr.Blocks() as demo:
    gr.Markdown("EIT Actor Catalog")
    table = gr.Dataframe(
        value=rows,
        headers=catalog_headers,
        # One datatype per column, derived from the headers so the two lists
        # can never disagree in length.
        datatype=["str"] * len(catalog_headers),
    )
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://de3e764777e206f7a6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import gradio as gr

# Actor names grouped by category. The keys are offered as dropdown choices
# and the values are the names shown for the chosen category.
actor_lists = {
    "EIT Organizations": [
        "EIT Health",
        "EIT Digital",
        "EIT RawMaterials",
        "EIT Manufacturing",
        "EIT Urban Mobility",
        "EIT Food",
        "EIT InnoEnergy",
        "EIT Health Belgium-Netherlands",
        "EIT Health Germany",
        "EIT Digital Accelerator",
    ],
    "Universities / Research Institutes": [
        "Delft University of Technology",
        "Eötvös Lorand University",
        "KTH Institute",
        "University of Maastricht",
        "LEITAT Technology Center",
        "University of Luxembourg",
        "University of Pécs",
        "University of Porto",
        "University of Debrecen",
        "University of Cambridge",
    ],
    "Companies / Corporates": [
        "Microsoft",
        "GE Healthcare",
        "Philips",
        "Siemens Healthineers",
        "Sanofi",
        "Bosch",
        "IBM",
        "Bayer",
        "Johnson & Johnson",
        "Roche",
    ],
    "Start-ups": [
        "Tracegrow",
        "Entremo",
        "Recycleye",
        "AMEN New Technologies",
        "InnoTractor",
        "LMAD",
        "SeizeIT",
        "NanoRacks",
        "Feno",
        "OvaExpert",
    ],
    "Government / Public Sector": [
        "European Commission",
        "Hungarian Ministry for Innovation and Technology",
        "National Research, Development and Innovation Office (Hungary)",
        "Ministry of Human Capacities (Hungary)",
        "German Federal Ministry of Education and Research",
        "City of Debrecen",
        "Central, Eastern and Southern Europe",
        "National Health Service (UK)",
        "Health Ministry",
    ],
}

def show_list(category):
    """Return the actors of ``category`` as newline-separated text.

    Args:
        category: A key of ``actor_lists``. May be ``None`` when the callback
            runs without a dropdown selection.

    Returns:
        The category's actor names joined with newlines, or an empty string
        when ``category`` is ``None`` or not a known category.
    """
    # .get() instead of [] so a missing selection yields empty text, not a KeyError.
    return "\n".join(actor_lists.get(category, []))

# One-input, one-output app: choosing a category in the dropdown shows that
# category's actors in the text box.
category_picker = gr.Dropdown(choices=list(actor_lists.keys()), label="Select List")
actor_box = gr.Textbox(label="Output", lines=12)

viewer = gr.Interface(
    fn=show_list,
    inputs=category_picker,
    outputs=actor_box,
    title="Actor List Viewer",
    allow_flagging="never",
)
viewer.launch()




It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://870e38ed04ef069d6d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


