In [189]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [190]:
# Load the project list from the CSV file and print a summary of its columns
# (non-null counts and dtypes) to spot missing values early.
df = pd.read_csv("../data/cordis.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Project Title  21 non-null     object 
 1   Short Title    23 non-null     object 
 2   Project ID     21 non-null     float64
 3   Program        21 non-null     object 
 4   Language       21 non-null     object 
 5   URL            22 non-null     object 
 6   Abstract       21 non-null     object 
dtypes: float64(1), object(6)
memory usage: 1.4+ KB


In [191]:
# Lower-case every column name (e.g. "Project Title" becomes "project title").
df.columns = df.columns.str.lower()

In [192]:
# Print the column summary again to confirm the names are now lower-case.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   project title  21 non-null     object 
 1   short title    23 non-null     object 
 2   project id     21 non-null     float64
 3   program        21 non-null     object 
 4   language       21 non-null     object 
 5   url            22 non-null     object 
 6   abstract       21 non-null     object 
dtypes: float64(1), object(6)
memory usage: 1.4+ KB


In [193]:
# Keep only the rows that have a URL; work on an independent copy so that later
# column assignments do not touch the original frame.
has_url = df["url"].notnull()
df2 = df.loc[has_url].copy()
df2.head()

Unnamed: 0,project title,short title,project id,program,language,url,abstract
0,Collective Awareness Platform for Tropospheric...,CAPTOR,688110.0,H2020,en,https://cordis.europa.eu/project/id/688110,Air pollution is the environmental topic that ...
1,Citizens' observatory for coast and ocean opti...,CITCLOPS,308469.0,FP7,es,https://cordis.europa.eu/project/id/308469/es,"In the marine environment, anthropogenic press..."
2,Development of sensor-based Citizens' Observat...,CITI-SENSE,308524.0,FP7,en,https://cordis.europa.eu/project/id/308524,"CITI-SENSE will develop ""citizens' observatori..."
3,Citizen Science for Urban Environment and Health,CitieS-Health,824484.0,H2020,en,https://cordis.europa.eu/project/id/824484,Scientific evidence about the negative health ...
4,Citizen Observatory Web,COBWEB,308513.0,FP7,en,https://cordis.europa.eu/project/id/308513,COBWEB will leverage the UNESCO World Network ...


In [194]:
# Unify url format: drop the "/es" language segment so every URL has the same form.
# The pattern only matches "/es" when it is a complete path segment (followed by "/"
# or by the end of the URL), so a segment that merely starts with "es" is left intact.
# regex=True is passed explicitly because the pandas default changed between versions.
df2['url'] = df2['url'].str.replace(r"/es(?=/|$)", "", regex=True)

In [200]:
# Functions to extract information from cordis urls

def safe_select(soup, selector, attr=None, default=None, split=None, index=None, clean=True):
    """Fetch the text or an attribute of the first node matching a CSS selector.

    Args:
        soup: Object exposing ``select_one(selector)``.
        selector: CSS selector handed to ``soup.select_one``.
        attr: Attribute name to read from the node; when falsy the node's
            ``.text`` is used instead.
        default: Value returned when nothing matches or any step raises.
        split: Optional separator; when truthy the value is split on it and
            the piece at position ``index`` is returned.
        index: Position of the piece to keep after splitting.
        clean: When true and the value is a string, normalise its whitespace
            and remove the "See on map" label.

    Returns:
        The extracted (and optionally cleaned / split) value, or ``default``.
    """
    # Ordered (old, new) substitutions applied to the stripped value.
    substitutions = (
        ("  ", ""),
        ("\n", " "),
        ("\t", " "),
        ("  ", " "),
        ("See on map", ""),
    )
    try:
        node = soup.select_one(selector)
        if not node:
            return default

        if attr:
            result = node[attr]
        else:
            result = node.text

        if clean and isinstance(result, str):
            result = result.strip()
            for old, new in substitutions:
                result = result.replace(old, new)

        if split:
            pieces = result.split(split)
            result = pieces[index]
        return result
    except Exception:
        # Best-effort extraction: any failure falls back to the default.
        return default


def get_second_href_in_container(soup):
    """Return the ``href`` of the second link inside ``div.t-margin-bottom-20``.

    Returns ``None`` when the container is missing, holds fewer than two
    ``a[href]`` links, or any lookup raises.
    """
    try:
        box = soup.select_one("div.t-margin-bottom-20")
        anchors = box.select("a[href]") if box else []
        return anchors[1]['href'] if len(anchors) >= 2 else None
    except Exception:
        return None

def get_signature_date(soup):
    """Return the text following the "EC signature date" label, or ``None``.

    Scans every ``div`` in document order and uses the first one whose stripped
    text starts with the label; the label is removed and the rest is stripped.
    """
    label = "EC signature date"
    texts = (div.get_text(strip=True) for div in soup.find_all("div"))
    first_match = next((text for text in texts if text.startswith(label)), None)
    if first_match is None:
        return None
    return first_match.replace(label, "").strip()

def _count_participants(soup):
    """Return the number announced in the first parsable "Participants (N)" h3 heading, or 0."""
    for heading in soup.find_all("h3"):
        text = heading.get_text(strip=True)
        if text.startswith("Participants ("):
            try:
                return int(text.split("(")[-1].replace(")", ""))
            except ValueError:
                # The heading did not end in a plain integer; keep looking.
                continue
    return 0


def _text_after(soup, selector):
    """Return the stripped text of the second sibling after the node matching ``selector``.

    Returns ``None`` when the node, or the sibling two positions after it, is
    missing, so one absent section does not abort the whole extraction.
    """
    try:
        return soup.select_one(selector).next_sibling.next_sibling.text.strip()
    except AttributeError:
        return None


# Function to extract the information from each URL
def extract_project_info(url):
    """Download a project page and scrape its fields into a dictionary.

    Args:
        url: Address of the project page to download.

    Returns:
        A dict with the scraped project fields plus a ``"participants"`` list
        holding one dict per participant. Fields that cannot be located are
        ``None``. If the download fails or an unexpected error occurs, the
        error is printed and an empty dict is returned.
    """
    try:
        # A timeout keeps one unresponsive page from blocking the whole run;
        # HTTP error statuses are reported instead of parsing an error page.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Participants
        number_participants = _count_participants(soup)

        keywords = safe_select(soup, "div.c-factsheet__section:nth-child(3)")

        programme = _text_after(soup, "#fundedunderprogrammes")
        if programme is not None:
            programme = programme.replace("\t", "").replace("\n", " ").replace("  ", " ")

        # Common prefix of the selectors that target the coordinator card.
        coordinator_card = "div.t-margin-bottom-20 > div:nth-child(1) > div:nth-child(1) > div:nth-child(3)"

        data = {
            "project_website": safe_select(soup, ".ppas-project_weblink", attr="data-url"),
            "doi": safe_select(soup, "div.p-col-6:nth-child(1) > div:nth-child(6) > a:nth-child(3)", attr="href"),
            "status": safe_select(soup, ".c-project-info__status"),
            "signature_date": get_signature_date(soup),
            "start_date": safe_select(soup, ".c-project-info__timeline > div:nth-child(1) > div:nth-child(1)"),
            "end_date": safe_select(soup, ".c-project-info__timeline > div:nth-child(1) > div:nth-child(2)"),
            "funded_under": safe_select(soup, ".c-project-info__fund-list > li:nth-child(1)"),
            "total_cost": safe_select(soup, ".c-project-info__overall", split="€", index=1),
            "eu_contribution": safe_select(soup, ".c-project-info__eu", split="€", index=1),
            "coordinated_by": safe_select(soup, ".coordinated"),
            "objective": safe_select(soup, ".c-article__text"),
            "keywords": keywords.replace("Keywords", "") if keywords else None,
            "programme": programme,
            "topic": _text_after(soup, "#topicslist"),
            "coordinator": safe_select(soup, coordinator_card + " > div:nth-child(1) > div:nth-child(1) > div:nth-child(1)"),
            "coordinator_eu_contribution": safe_select(soup, coordinator_card + " > div:nth-child(1) > div:nth-child(2)", split="\n", index=-1),
            "coordinator_adress": safe_select(soup, coordinator_card + " > div:nth-child(2) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2)"),
            "coordinator_website": get_second_href_in_container(soup),
            "number_participants": number_participants,
        }

        participants = []
        for n in range(1, number_participants + 1):
            # Common prefix of the selectors that target the n-th participant card.
            card = f"div.c-organizations-list__item:nth-child({n}) > div > div > div:nth-child(3)"
            participants.append({
                "participant_no": n,
                "participant_name": safe_select(soup, card + " > div:nth-child(1) > div:nth-child(1) > div"),
                "participant_address": safe_select(soup, card + " > div:nth-child(2) > div > div > div > div:nth-child(2)"),
                "participant_eu_contribution": safe_select(soup, card + " > div:nth-child(1) > div:nth-child(2) > div:nth-child(2)"),
                "participant_total_cost": safe_select(soup, card + " > div:nth-child(2) > div:nth-child(1) > div:nth-child(2) > div:nth-child(2)"),
                "participant_website": safe_select(soup, card + " > div:nth-child(2) > div > div > div:nth-child(5) > span:nth-child(2) > a", attr="href"),
            })

        data["participants"] = participants
        return data

    except Exception as e:
        # Best effort: report the failing URL and let the caller continue.
        print(f"Error in URL {url}: {e}")
        return {}

In [201]:
# Apply function to every url
df_info = df2["url"].apply(extract_project_info).apply(pd.Series)

# Join with original dataframe
df_full = pd.concat([df2, df_info], axis=1)

# Clean columns and save result in file with one row by project
# (regex=False: every pattern below is a literal string, whatever the pandas version)
df_full['start_date'] = df_full['start_date'].str.replace("Start date ", "", regex=False).str.strip()
df_full['end_date'] = df_full['end_date'].str.replace("End date ", "", regex=False).str.strip()
df_full["total_cost"] = df_full["total_cost"].str.replace(" ", "", regex=False)
df_full["eu_contribution"] = df_full["eu_contribution"].str.replace(" ", "", regex=False)
df_full['coordinator_eu_contribution'] = df_full['coordinator_eu_contribution'].str.split(" € ").str[1].str.replace(" ", "", regex=False)
# na=False: projects without a coordinator website produce a missing value here,
# and a boolean mask that contains missing values cannot be used with .loc.
is_dashboard_link = df_full['coordinator_website'].str.startswith("https://dashboard.tech.ec.europa.eu", na=False)
df_full.loc[is_dashboard_link, "coordinator_website"] = None
df_full.to_csv("../data/cordis_projects.csv", index=False)

# Explode participants list into rows and save result in file with one row by participant
participants_df = df_full.explode("participants").reset_index(drop=True)
participants_df = pd.concat(
    [
        participants_df.drop("participants", axis=1),
        participants_df["participants"].apply(pd.Series),
    ],
    axis=1,
)
participants_df["participant_eu_contribution"] = participants_df["participant_eu_contribution"].str.replace("€ ", "", regex=False).str.replace(" ", "", regex=False).str.replace("Nodata", "No data", regex=False)
participants_df["participant_total_cost"] = participants_df["participant_total_cost"].str.replace("€ ", "", regex=False).str.replace(" ", "", regex=False).str.replace("Nodata", "No data", regex=False)
participants_df.to_csv("../data/cordis_projects_and_participants.csv", index=False)