In [149]:
# Import additional necessary libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import random
import re

Process tested with platforms list:

In [150]:
platform_list = [
    "observation.org",
    "seasearch.org.uk",
    "natuurpunt.be",
    "slu.se/artdatabanken",
    "theroadlab.co.uk",
    "exploreyourshore.ie",
    "eyeonwater.org",
    "iseahorse.org",
    "redpromar.org",
    "coastwards.org",
]

In [151]:
df_platforms = pd.DataFrame(platform_list)
df_platforms.columns = ['platform_url']
df_platforms

Unnamed: 0,platform_url
0,observation.org
1,seasearch.org.uk
2,natuurpunt.be
3,slu.se/artdatabanken
4,theroadlab.co.uk
5,exploreyourshore.ie
6,eyeonwater.org
7,iseahorse.org
8,redpromar.org
9,coastwards.org


# active

In [152]:
def check_website(url):
    try:
        # Ensure URL has protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        session = requests.Session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.google.com/"
        })
        response = session.get(url, timeout=5)
        return f"Status code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        return f"Fallo: {str(e)}"

In [153]:
df_platforms['active'] = df_platforms['platform_url'].apply(check_website)

In [154]:
df_platforms['active'] 

0                                     Status code: 200
1                                     Status code: 200
2                                     Status code: 200
3                                     Status code: 200
4                                     Status code: 200
5                                     Status code: 200
6                                     Status code: 200
7                                     Status code: 200
8                                     Status code: 200
9    Fallo: HTTPSConnectionPool(host='coastwards.or...
Name: active, dtype: object

# platform_about

In [155]:
def extract_about_link(base_url):
    """
    Extracts the 'About' link from a website.
    Searches for common variations of 'About' links on the site.
    
    Args:
        base_url (str): Base URL of the platform
        
    Returns:
        str: URL of the About page if found, None if not
    """
    try:
        # Ensure URL has protocol
        if not base_url.startswith(('http://', 'https://')):
            base_url = 'https://' + base_url
        
        # Configure headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        # Make HTTP request
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Possible 'About' link texts
        about_texts = [
            'about', 'about us', 'about this site', 'who we are', 
            'quienes somos', 'acerca de', 'sobre nosotros', 'over ons',
            'chi siamo', 'wer sind wir', 'à propos', 'om oss'
        ]
        
        # Search for links containing 'About' related texts
        for link in soup.find_all('a', href=True):
            link_text = link.get_text(strip=True).lower()
            href = link.get('href')
            
            # Check if link text contains any variation of 'about'
            if any(about_text in link_text for about_text in about_texts):
                # Convert to absolute URL if relative
                if href.startswith('/'):
                    about_url = urljoin(base_url, href)
                elif href.startswith('http'):
                    about_url = href
                else:
                    about_url = urljoin(base_url, '/' + href)
                
                return about_url
        
        # If not found, search common 'about' page URLs
        common_about_paths = ['/about', '/about-us', '/about.html', '/who-we-are', '/info']
        for path in common_about_paths:
            test_url = urljoin(base_url, path)
            try:
                test_response = requests.head(test_url, headers=headers, timeout=5)
                if test_response.status_code == 200:
                    return test_url
            except:
                continue
                
        return None
        
    except Exception as e:
        print(f"Error procesando {base_url}: {e}")
        return None

In [156]:
df_platforms['platform_about'] = df_platforms['platform_url'].apply(extract_about_link)

Error procesando https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c5511290>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [157]:
df_platforms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   platform_url    10 non-null     object
 1   active          10 non-null     object
 2   platform_about  7 non-null      object
dtypes: object(3)
memory usage: 372.0+ bytes


In [158]:
df_platforms

Unnamed: 0,platform_url,active,platform_about
0,observation.org,Status code: 200,https://observation.org/about
1,seasearch.org.uk,Status code: 200,https://www.seasearch.org.uk/about
2,natuurpunt.be,Status code: 200,https://natuurpunt.be/dit-is-natuurpunt
3,slu.se/artdatabanken,Status code: 200,
4,theroadlab.co.uk,Status code: 200,https://www.theroadlab.co.uk/about
5,exploreyourshore.ie,Status code: 200,https://exploreyourshore.ie/about-explore-your...
6,eyeonwater.org,Status code: 200,https://eyeonwater.org/about-us
7,iseahorse.org,Status code: 200,https://projectseahorse.org/about-us/
8,redpromar.org,Status code: 200,
9,coastwards.org,Fallo: HTTPSConnectionPool(host='coastwards.or...,


# year_creation

In [159]:
import whois

def get_domain_creation_year(url):
    # Ensure URL has protocol
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    try:
        domain = whois.whois(url)
        creation_date = domain.creation_date
        if isinstance(creation_date, list):  # Some WHOIS return multiple dates
            creation_date = creation_date[0]
        return creation_date.year
    except Exception as e:
        return None

# Test function
year = get_domain_creation_year("minka-sdg.org")
print(f"Year of creation of the domain: {year}")

Year of creation of the domain: 2021


In [160]:
df_platforms['year_creation'] = df_platforms['platform_url'].apply(get_domain_creation_year)

In [161]:
df_platforms['year_creation']

0    2002.0
1    1999.0
2       NaN
3    1989.0
4    2022.0
5    2019.0
6    2015.0
7    2012.0
8    2020.0
9    2016.0
Name: year_creation, dtype: float64

# terms_use_link

Domain slu.se is a very old universitary domain, but artdatabanken can be created later. The function get_domain_creation_year gives just the year when the main domain was registered. But we can't check anything else.

In [162]:
def extract_terms_link(url):
    """
    Extracts the link to a web page's terms of use/conditions.
    Searches for common variations of terms links in multiple languages.

    Args:
        url (str): Platform URL

    Returns:
        str: URL of the terms page if found, None otherwise
    """
    try:
        # Ensure the URL has a protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        # Configure headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        # Make a request
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Possible term link texts (in multiple languages)
        terms_texts = [
            # English
            'terms', 'terms of use', 'terms of service', 'terms and conditions', 
            'conditions of use', 'user terms', 'service terms', 'legal terms',
            'terms & conditions', 'tos', 'terms of service agreement',
            
            # Spanish
            'términos', 'términos de uso', 'términos y condiciones', 
            'condiciones de uso', 'términos del servicio', 'condiciones',
            'términos legales', 'condiciones legales',
            
            # French
            'conditions', 'conditions d\'utilisation', 'termes', 
            'conditions générales', 'cgu', 'mentions légales',
            
            # German
            'nutzungsbedingungen', 'geschäftsbedingungen', 'bedingungen',
            'agb', 'nutzungsbestimmungen',
            
            # Dutch
            'gebruiksvoorwaarden', 'voorwaarden', 'algemene voorwaarden',
            
            # Italian
            'termini', 'condizioni', 'termini di utilizzo', 'condizioni d\'uso',
            
            # Other common patterns
            'legal', 'legal notice', 'disclaimer'
        ]
        
        # Search for links that contain texts related to terms
        for link in soup.find_all('a', href=True):
            link_text = link.get_text(strip=True).lower()
            href = link.get('href')
            
            # Check if the link text contains any term variations
            if any(term_text in link_text for term_text in terms_texts):
                # Convert to absolute URL if relative
                if href.startswith('/'):
                    terms_url = urljoin(url, href)
                elif href.startswith('http'):
                    terms_url = href
                else:
                    terms_url = urljoin(url, '/' + href)
                
                return terms_url
        
        # If not found in links, search in footer or common areas
        footer_elements = soup.find_all(['footer', 'div'], class_=re.compile(r'footer|legal|terms', re.I))
        for footer in footer_elements:
            for link in footer.find_all('a', href=True):
                link_text = link.get_text(strip=True).lower()
                href = link.get('href')
                
                if any(term_text in link_text for term_text in terms_texts):
                    if href.startswith('/'):
                        terms_url = urljoin(url, href)
                    elif href.startswith('http'):
                        terms_url = href
                    else:
                        terms_url = urljoin(url, '/' + href)
                    
                    return terms_url
        
        # If not found, search for common URLs of term pages
        common_terms_paths = [
            '/terms', '/terms-of-use', '/terms-of-service', '/terms-and-conditions',
            '/tos', '/legal', '/conditions', '/user-terms', '/service-terms',
            '/terms.html', '/terms.php', '/legal.html', '/conditions.html',
            '/privacy-policy', '/legal-notice', '/disclaimer'
        ]
        
        for path in common_terms_paths:
            test_url = urljoin(url, path)
            try:
                test_response = requests.head(test_url, headers=headers, timeout=5)
                if test_response.status_code == 200:
                    return test_url
            except:
                continue
                
        return None
        
    except Exception as e:
        print(f"Error processing terms in {url}: {e}")
        return None

In [163]:
df_platforms['terms_link'] = df_platforms['platform_url'].apply(extract_terms_link)

Error processing terms in https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c45ad790>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [164]:
df_platforms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   platform_url    10 non-null     object 
 1   active          10 non-null     object 
 2   platform_about  7 non-null      object 
 3   year_creation   9 non-null      float64
 4   terms_link      7 non-null      object 
dtypes: float64(1), object(4)
memory usage: 532.0+ bytes


# privacy_policy

In [165]:
def extract_privacy_policy_link(url):
    """
    Extracts the privacy policy link from a web page.
    Finds common variations of privacy links in multiple languages.

    Args:
        url (str): Platform URL

    Returns:
        str: URL of the privacy policy page if found, None otherwise
    """
    try:
        # Ensure the URL has a protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        # Configure headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        # Making request
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parsing HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Possible privacy link texts (in multiple languages)
        privacy_texts = [
            # English
            'privacy', 'privacy policy', 'privacy notice', 'privacy statement',
            'data protection', 'data policy', 'privacy terms', 'privacy agreement',
            'data privacy', 'privacy & cookies', 'cookie policy',
            
            # Spanish
            'privacidad', 'política de privacidad', 'aviso de privacidad',
            'política de datos', 'protección de datos', 'términos de privacidad',
            'política de cookies', 'aviso legal', 'protección de la privacidad',
            
            # French
            'confidentialité', 'politique de confidentialité', 
            'protection des données', 'vie privée', 'données personnelles',
            'politique de cookies', 'rgpd', 'gdpr',
            
            # German
            'datenschutz', 'datenschutzerklärung', 'datenschutzrichtlinie',
            'datenschutzbestimmungen', 'privatsphäre', 'dsgvo',
            
            # Dutch
            'privacy', 'privacybeleid', 'gegevensbescherming', 'avg',
            'privacyverklaring', 'cookiebeleid',
            
            # Italian
            'privacy', 'politica privacy', 'protezione dati', 'riservatezza',
            'gdpr', 'informativa privacy', 'politica dei cookie',
            
            # Other patterns
            'gdpr', 'rgpd', 'cookies', 'cookie notice', 'data handling'
        ]
        
        # Search for links that contain privacy-related texts
        for link in soup.find_all('a', href=True):
            link_text = link.get_text(strip=True).lower()
            href = link.get('href')
            
            # Check if the link text contains any privacy variations
            if any(privacy_text in link_text for privacy_text in privacy_texts):
                # Convert to absolute URL if relative
                if href.startswith('/'):
                    privacy_url = urljoin(url, href)
                elif href.startswith('http'):
                    privacy_url = href
                else:
                    privacy_url = urljoin(url, '/' + href)
                
                return privacy_url
        
        # If not found in main links, search in footer or common areas
        footer_elements = soup.find_all(['footer', 'div'], class_=re.compile(r'footer|legal|privacy|cookie', re.I))
        for footer in footer_elements:
            for link in footer.find_all('a', href=True):
                link_text = link.get_text(strip=True).lower()
                href = link.get('href')
                
                if any(privacy_text in link_text for privacy_text in privacy_texts):
                    if href.startswith('/'):
                        privacy_url = urljoin(url, href)
                    elif href.startswith('http'):
                        privacy_url = href
                    else:
                        privacy_url = urljoin(url, '/' + href)
                    
                    return privacy_url
        
        # If not found, search for common privacy page URLs
        common_privacy_paths = [
            '/privacy', '/privacy-policy', '/privacy-notice', '/privacy-statement',
            '/data-protection', '/data-policy', '/cookies', '/cookie-policy',
            '/privacy.html', '/privacy.php', '/datenschutz', '/confidentialite',
            '/privacidad', '/gdpr', '/data-privacy', '/cookie-notice'
        ]
        
        for path in common_privacy_paths:
            test_url = urljoin(url, path)
            try:
                test_response = requests.head(test_url, headers=headers, timeout=5)
                if test_response.status_code == 200:
                    return test_url
            except:
                continue
                
        return None
        
    except Exception as e:
        print(f"Error procesando privacidad en {url}: {e}")
        return None

In [166]:
df_platforms['privacy_link'] = df_platforms['platform_url'].apply(extract_privacy_policy_link)

Error procesando privacidad en https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c3af0050>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [167]:
df_platforms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   platform_url    10 non-null     object 
 1   active          10 non-null     object 
 2   platform_about  7 non-null      object 
 3   year_creation   9 non-null      float64
 4   terms_link      7 non-null      object 
 5   privacy_link    7 non-null      object 
dtypes: float64(1), object(5)
memory usage: 612.0+ bytes


In [168]:
df_platforms

Unnamed: 0,platform_url,active,platform_about,year_creation,terms_link,privacy_link
0,observation.org,Status code: 200,https://observation.org/about,2002.0,https://observation.org/terms,https://observation.org/privacy
1,seasearch.org.uk,Status code: 200,https://www.seasearch.org.uk/about,1999.0,https://www.seasearch.org.uk/website-terms-con...,https://www.seasearch.org.uk/privacy
2,natuurpunt.be,Status code: 200,https://natuurpunt.be/dit-is-natuurpunt,,https://natuurpunt.be/algemene-voorwaarden-en-...,https://natuurpunt.be/algemene-voorwaarden-en-...
3,slu.se/artdatabanken,Status code: 200,,1989.0,,
4,theroadlab.co.uk,Status code: 200,https://www.theroadlab.co.uk/about,2022.0,,
5,exploreyourshore.ie,Status code: 200,https://exploreyourshore.ie/about-explore-your...,2019.0,https://exploreyourshore.ie/legals/,https://exploreyourshore.ie/cookie-policy/
6,eyeonwater.org,Status code: 200,https://eyeonwater.org/about-us,2015.0,https://eyeonwater.org/privacy-policy,https://eyeonwater.org/privacy-policy
7,iseahorse.org,Status code: 200,https://projectseahorse.org/about-us/,2012.0,https://projectseahorse.org/regulating-trade/c...,https://projectseahorse.org/privacy-statement-ca/
8,redpromar.org,Status code: 200,,2020.0,https://redpromar.org/sightings,https://redpromar.org/legal-notice
9,coastwards.org,Fallo: HTTPSConnectionPool(host='coastwards.or...,,2016.0,,


# code_repository

In [169]:
def extract_code_repository_link(url):
    """
    Extracts the link to a web platform's code repository.
    Searches for links to GitHub, GitLab, Bitbucket, and other code repositories.

    Args:
        url (str): Platform URL

    Returns:
        str: Code repository URL if found, None otherwise
    """
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Known code repository domains
        repo_domains = [
            'github.com', 'gitlab.com', 'bitbucket.org', 'sourceforge.net',
            'codeberg.org', 'git.sr.ht', 'gitea.com', 'gitee.com'
        ]
        
        # HIGH PRIORITY: Find direct links to known repositories
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            
            # Check if the link points to a known repository
            for domain in repo_domains:
                if domain in href:
                    # Verify that it is not just the main domain and that it is a valid repo
                    if href.count('/') >= 2 and not href.endswith(domain):
                        # Check that it does not end in extensions that are not repos
                        if not any(ext in href.lower() for ext in ['.png', '.jpg', '.gif', '.svg', '.css', '.js']):
                            return href
        
        # Search for very specific texts in source code
        specific_repo_texts = [
            'source code', 'github', 'gitlab', 'view source', 'download source',
            'fork on github', 'clone', 'git repository', 'código fuente'
        ]
        
        # Search for links with VERY specific text from repositories
        for link in soup.find_all('a', href=True):
            link_text = link.get_text(strip=True).lower()
            href = link.get('href')
            
            # Only search for very specific texts that point to known domains
            if any(specific_text in link_text for specific_text in specific_repo_texts):
                for domain in repo_domains:
                    if domain in href:
                        return href
        
        # Search developer/footer areas with strict criteria
        developer_areas = soup.find_all(['footer', 'div'], 
                                      class_=re.compile(r'developer|code|source|footer', re.I))
        
        for area in developer_areas:
            for link in area.find_all('a', href=True):
                href = link.get('href')
                link_text = link.get_text(strip=True).lower()
                
                # Only search known domains with very specific text
                for domain in repo_domains:
                    if domain in href and any(text in link_text for text in specific_repo_texts):
                        return href
        
        # Search in meta tags (known domains only)
        meta_tags = soup.find_all('meta')
        for meta in meta_tags:
            content = meta.get('content', '')
            for domain in repo_domains:
                if domain in content and content.count('/') >= 2:
                    return content
        
        # 5. Search GitHub badges (very specific)
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if 'shields.io' in src and 'github' in src:
                parent_link = img.find_parent('a')
                if parent_link and parent_link.get('href'):
                    href = parent_link.get('href')
                    if 'github.com' in href:
                        return href
                
        return None
        
    except Exception as e:
        print(f"Error processing repository in {url}: {e}")
        return None

In [170]:
df_platforms['code_repository'] = df_platforms['platform_url'].apply(extract_code_repository_link)

df_platforms

Error processing repository in https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c45ae590>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


Unnamed: 0,platform_url,active,platform_about,year_creation,terms_link,privacy_link,code_repository
0,observation.org,Status code: 200,https://observation.org/about,2002.0,https://observation.org/terms,https://observation.org/privacy,https://github.com/TecharoHQ/anubis
1,seasearch.org.uk,Status code: 200,https://www.seasearch.org.uk/about,1999.0,https://www.seasearch.org.uk/website-terms-con...,https://www.seasearch.org.uk/privacy,
2,natuurpunt.be,Status code: 200,https://natuurpunt.be/dit-is-natuurpunt,,https://natuurpunt.be/algemene-voorwaarden-en-...,https://natuurpunt.be/algemene-voorwaarden-en-...,
3,slu.se/artdatabanken,Status code: 200,,1989.0,,,
4,theroadlab.co.uk,Status code: 200,https://www.theroadlab.co.uk/about,2022.0,,,
5,exploreyourshore.ie,Status code: 200,https://exploreyourshore.ie/about-explore-your...,2019.0,https://exploreyourshore.ie/legals/,https://exploreyourshore.ie/cookie-policy/,
6,eyeonwater.org,Status code: 200,https://eyeonwater.org/about-us,2015.0,https://eyeonwater.org/privacy-policy,https://eyeonwater.org/privacy-policy,
7,iseahorse.org,Status code: 200,https://projectseahorse.org/about-us/,2012.0,https://projectseahorse.org/regulating-trade/c...,https://projectseahorse.org/privacy-statement-ca/,
8,redpromar.org,Status code: 200,,2020.0,https://redpromar.org/sightings,https://redpromar.org/legal-notice,
9,coastwards.org,Fallo: HTTPSConnectionPool(host='coastwards.or...,,2016.0,,,


In [171]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# language_code

In [172]:
df_platforms.drop(columns=['programming_language2'], inplace=True)

KeyError: "['programming_language2'] not found in axis"

In [None]:
def analyze_website_tech(url, timeout=10):
    """
    Analyzes a URL to detect web technologies used    
    """
    result = {
        'url': url,
        'technologies': [],
        'language': None,
        'framework': None,
        'server': None,
        'headers': {},
        'status_code': None,
        'error': None
    }
    
    try:
        # Asegurar que la URL tenga protocolo
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
            
        # Make HTTP request
        response = requests.get(url, timeout=timeout, allow_redirects=True)
        result['status_code'] = response.status_code
        
        # Analyze headers
        headers = response.headers
        result['headers'] = dict(headers)
        
        # Get HTML content
        html_content = response.text.lower()
        
        # Detect technologies by headers
        tech_info = detect_tech_from_headers(headers)
        result.update(tech_info)
        
        # Detect technologies by HTML content
        html_tech = detect_tech_from_html(html_content)
        result['technologies'].extend(html_tech)
        
        # Detect by URL patterns
        url_tech = detect_tech_from_url(url)
        result['technologies'].extend(url_tech)
        
        # Determine primary language
        result['language'] = determine_primary_language(result)
        
        # Drop duplicates
        result['technologies'] = list(set(result['technologies']))
        
    except requests.exceptions.RequestException as e:
        result['error'] = str(e)
    except Exception as e:
        result['error'] = f"Unexpected error: {str(e)}"
    
    return result

def detect_tech_from_headers(headers):
    """Detects technologies based on HTTP headers"""
    tech_info = {
        'technologies': [],
        'language': None,
        'framework': None,
        'server': None
    }
    
    # Convert headers to lowercase for search
    headers_lower = {k.lower(): v for k, v in headers.items()}
    
    # Detect web server
    if 'server' in headers_lower:
        server = headers_lower['server']
        tech_info['server'] = server
        
        if 'apache' in server.lower():
            tech_info['technologies'].append('Apache')
        if 'nginx' in server.lower():
            tech_info['technologies'].append('Nginx')
        if 'iis' in server.lower():
            tech_info['technologies'].append('IIS')
    
    # Detect language by specific headers
    if 'x-powered-by' in headers_lower:
        powered_by = headers_lower['x-powered-by'].lower()
        
        if 'php' in powered_by:
            tech_info['language'] = 'PHP'
            tech_info['technologies'].append('PHP')
        elif 'asp.net' in powered_by:
            tech_info['language'] = 'ASP.NET'
            tech_info['technologies'].append('ASP.NET')
        elif 'express' in powered_by:
            tech_info['language'] = 'Node.js'
            tech_info['technologies'].append('Node.js')
            tech_info['technologies'].append('Express')
    
    # Detect Ruby on Rails
    if 'x-runtime' in headers_lower:
        tech_info['language'] = 'Ruby'
        tech_info['framework'] = 'Ruby on Rails'
        tech_info['technologies'].append('Ruby on Rails')
    
    # Detect by cookies
    if 'set-cookie' in headers_lower:
        cookies = headers_lower['set-cookie'].lower()
        
        if 'phpsessid' in cookies:
            tech_info['language'] = 'PHP'
            tech_info['technologies'].append('PHP')
        elif 'jsessionid' in cookies:
            tech_info['language'] = 'Java'
            tech_info['technologies'].append('Java')
        elif '_session_id' in cookies and 'rails' in cookies:
            tech_info['language'] = 'Ruby'
            tech_info['technologies'].append('Ruby on Rails')
    
    # Detect CloudFlare
    if 'cf-ray' in headers_lower:
        tech_info['technologies'].append('CloudFlare')
    
    return tech_info

def detect_tech_from_html(html_content):
    """Detects technologies by analyzing HTML content"""
    technologies = []
    
    # WordPress
    if any(indicator in html_content for indicator in [
        'wp-content', 'wp-includes', 'wordpress', '/wp-json/'
    ]):
        technologies.append('WordPress')
    
    # Drupal
    if any(indicator in html_content for indicator in [
        'drupal', 'sites/default/files', 'misc/drupal.js'
    ]):
        technologies.append('Drupal')
    
    # Joomla
    if any(indicator in html_content for indicator in [
        'joomla', 'option=com_', 'joomla.org'
    ]):
        technologies.append('Joomla')
    
    # React
    if any(indicator in html_content for indicator in [
        'react', 'data-reactroot', '__react', 'react-dom'
    ]):
        technologies.append('React')
    
    # Vue.js
    if any(indicator in html_content for indicator in [
        'vue.js', 'vue.min.js', 'v-if=', 'v-for='
    ]):
        technologies.append('Vue.js')
    
    # Angular
    if any(indicator in html_content for indicator in [
        'angular', 'ng-app', 'ng-controller', 'angular.min.js'
    ]):
        technologies.append('Angular')
    
    # jQuery
    if any(indicator in html_content for indicator in [
        'jquery', 'jquery.min.js', '$.fn.jquery'
    ]):
        technologies.append('jQuery')
    
    # Bootstrap
    if any(indicator in html_content for indicator in [
        'bootstrap', 'bootstrap.min.css', 'bootstrap.css'
    ]):
        technologies.append('Bootstrap')
    
    # Django (Python)
    if any(indicator in html_content for indicator in [
        'csrfmiddlewaretoken', 'django', '__admin_media_prefix__'
    ]):
        technologies.append('Django')
    
    # Laravel (PHP)
    if any(indicator in html_content for indicator in [
        'laravel_session', 'laravel', 'csrf-token'
    ]):
        technologies.append('Laravel')
    
    # Ruby on Rails
    if any(indicator in html_content for indicator in [
        'csrf-param', 'csrf-token', 'rails', 'data-method='
    ]):
        technologies.append('Ruby on Rails')
    
    # Google Analytics
    if any(indicator in html_content for indicator in [
        'google-analytics', 'gtag(', 'ga('
    ]):
        technologies.append('Google Analytics')
    
    return technologies

def detect_tech_from_url(url):
    """Detects technologies based on URL patterns"""
    technologies = []
    
    # Analyze file extensions
    if '.php' in url:
        technologies.append('PHP')
    elif '.asp' in url or '.aspx' in url:
        technologies.append('ASP.NET')
    elif '.jsp' in url:
        technologies.append('Java')
    elif '.py' in url:
        technologies.append('Python')
    elif '.rb' in url:
        technologies.append('Ruby')
    elif '.cfm' in url:
        technologies.append('ColdFusion')
    
    # WordPress specific patterns
    if any(pattern in url for pattern in ['/wp-content/', '/wp-admin/', '/wp-includes/']):
        technologies.append('WordPress')
    
    # Drupal patterns
    if any(pattern in url for pattern in ['/node/', '/admin/config/']):
        technologies.append('Drupal')
    
    return technologies

def determine_primary_language(result):
    """Determine the primary language based on all the information"""
    technologies = result['technologies']
    
    # Prioridad por frameworks específicos
    if 'Ruby on Rails' in technologies:
        return 'Ruby'
    elif 'Django' in technologies:
        return 'Python'
    elif 'Laravel' in technologies:
        return 'PHP'
    elif 'ASP.NET' in technologies:
        return 'ASP.NET'
    elif 'PHP' in technologies or 'WordPress' in technologies:
        return 'PHP'
    elif 'Java' in technologies:
        return 'Java'
    elif 'Node.js' in technologies:
        return 'JavaScript (Node.js)'
    elif any(js_tech in technologies for js_tech in ['React', 'Vue.js', 'Angular']):
        return 'JavaScript'
    elif result['language'] is not None:
        return result['language']
    else:
        return None

In [None]:
# Función que retorna solo el lenguaje
def get_language(url):
    try:
        result = analyze_website_tech(url)
        return result['language']
    except:
        return None
    
df_platforms['programming_language'] = df_platforms['platform_url'].apply(get_language)

In [None]:
df_platforms.programming_language

0        Python
1          Ruby
2          None
3       ASP.NET
4    JavaScript
5           PHP
6          None
7           PHP
8          Ruby
9          None
Name: programming_language, dtype: object

**NOTE:**
* **PHP**: frameworks like Laravel, Symfony, CodeIgniter... and CMS like WordPress, Drupal, Joomla...
* **JavaScript**: frontends like React, Vue.js, Angular, jQuery... and backends like Node.js or Express.js.

In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# governance_explicit

In [None]:
df_platforms = pd.read_csv("../data/df_platforms.csv")
df_platforms

Unnamed: 0,platform_url,active,platform_about,year_creation,terms_link,privacy_link,code_repository,programming_language
0,observation.org,Status code: 200,https://observation.org/about,2002,https://observation.org/terms,https://observation.org/privacy,https://github.com/TecharoHQ/anubis,Python
1,seasearch.org.uk,Status code: 200,https://www.seasearch.org.uk/about,1999,https://www.seasearch.org.uk/website-terms-con...,https://www.seasearch.org.uk/privacy,,Ruby
2,natuurpunt.be,Status code: 200,https://natuurpunt.be/dit-is-natuurpunt,2001,https://natuurpunt.be/algemene-voorwaarden-en-...,https://natuurpunt.be/algemene-voorwaarden-en-...,,
3,slu.se/artdatabanken,Status code: 200,,1989,,,,ASP.NET
4,theroadlab.co.uk,Status code: 200,https://www.theroadlab.co.uk/about,2022,,,,JavaScript
5,exploreyourshore.ie,Status code: 200,https://exploreyourshore.ie/about-explore-your...,2019,https://exploreyourshore.ie/legals/,https://exploreyourshore.ie/cookie-policy/,,PHP
6,eyeonwater.org,Status code: 200,https://eyeonwater.org/about-us,2015,https://eyeonwater.org/privacy-policy,https://eyeonwater.org/privacy-policy,,
7,iseahorse.org,Status code: 200,https://projectseahorse.org/about-us/,2012,https://projectseahorse.org/regulating-trade/c...,https://projectseahorse.org/privacy-statement-ca/,,PHP
8,redpromar.org,Status code: 200,,2020,https://redpromar.org/sightings,https://redpromar.org/legal-notice,,Ruby
9,coastwards.org,Fallo: HTTPSConnectionPool(host='coastwards.or...,,2016,,,,


In [None]:
def extract_data_governance_policy_link(url):
    """
    Extracts the link to data governance policies from a web platform.
    Searches for terms related to data governance, data management, 
    scientific data policies and data management frameworks.
    
    Args:
        url (str): Platform URL
        
    Returns:
        str: URL of the data governance page if found, None otherwise
    """
    try:
        # Ensure URL has protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        # Configure headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        # Make HTTP request
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Terms related to data governance (multiple languages)
        governance_texts = [
            # English - Data governance
            'data governance', 'data policy', 'data management', 'data stewardship',
            'data standards', 'data quality', 'data principles', 'data framework',
            'scientific data policy', 'research data policy', 'data sharing policy',
            'data use policy', 'data access policy', 'data ethics', 'data guidelines',
            'information governance', 'data management plan', 'fair data', 'open data policy',
            
            # English - Scientific specific terms
            'research guidelines', 'scientific standards', 'data citation', 'metadata standards',
            'data repository policy', 'data licensing', 'data attribution', 'data provenance',
            'scientific integrity', 'research ethics', 'data transparency', 'reproducible research',
            
            # Spanish - Data governance
            'gobernanza de datos', 'política de datos', 'gestión de datos', 'administración de datos',
            'estándares de datos', 'calidad de datos', 'principios de datos', 'marco de datos',
            'política científica de datos', 'política de investigación', 'política de intercambio',
            'ética de datos', 'directrices de datos', 'gobernanza de información',
            
            # French
            'gouvernance des données', 'politique des données', 'gestion des données',
            'éthique des données', 'données ouvertes', 'intégrité scientifique',
            
            # German
            'datenverwaltung', 'datenrichtlinien', 'datenethik', 'forschungsrichtlinien',
            
            # Dutch
            'data governance', 'gegevensbeheer', 'onderzoeksrichtlijnen',
            
            # Universal technical terms
            'fair principles', 'dmp', 'orcid', 'doi policy', 'creative commons',
            'research data management', 'rdm', 'data management framework'
        ]
        
        found_links = []
        
        # Search for direct links with governance-related text
        for link in soup.find_all('a', href=True):
            link_text = link.get_text(strip=True).lower()
            href = link.get('href')
            
            # Check if text contains governance terms
            for governance_text in governance_texts:
                if governance_text in link_text:
                    # Convert to absolute URL if relative
                    if href.startswith('/'):
                        governance_url = urljoin(url, href)
                    elif href.startswith('http'):
                        governance_url = href
                    else:
                        governance_url = urljoin(url, '/' + href)
                    
                    found_links.append((governance_url, governance_text, link_text))
        
        # Search in specific sections (footer, navigation, menus)
        specific_areas = soup.find_all(['footer', 'nav', 'div'], 
                                     class_=re.compile(r'footer|nav|menu|policy|legal|research|science|data', re.I))
        
        for area in specific_areas:
            for link in area.find_all('a', href=True):
                link_text = link.get_text(strip=True).lower()
                href = link.get('href')
                
                for governance_text in governance_texts:
                    if governance_text in link_text:
                        if href.startswith('/'):
                            governance_url = urljoin(url, href)
                        elif href.startswith('http'):
                            governance_url = href
                        else:
                            governance_url = urljoin(url, '/' + href)
                        
                        found_links.append((governance_url, governance_text, link_text))
        
        # Search common governance policy URLs
        common_governance_paths = [
            '/data-governance', '/data-policy', '/data-management', '/research-policy',
            '/scientific-policy', '/data-standards', '/data-guidelines', '/research-guidelines',
            '/fair-data', '/open-data', '/data-ethics', '/research-ethics', '/data-use-policy',
            '/data-sharing', '/data-access', '/scientific-standards', '/research-integrity',
            '/data-management-plan', '/rdm', '/metadata-policy', '/data-citation',
            '/governance.html', '/policy.html', '/guidelines.html', '/standards.html'
        ]
        
        for path in common_governance_paths:
            test_url = urljoin(url, path)
            try:
                test_response = requests.head(test_url, headers=headers, timeout=5)
                if test_response.status_code == 200:
                    found_links.append((test_url, 'common_path', path))
            except:
                continue
        
        # Filter and prioritize results
        if found_links:
            # Prioritize by term relevance
            priority_terms = ['data governance', 'data policy', 'research policy', 'scientific policy']
            
            # First search for high priority terms
            for link_url, term, text in found_links:
                if any(priority_term in term for priority_term in priority_terms):
                    return link_url
            
            # If no high priority terms, return first found
            return found_links[0][0]
        
        return None
        
    except Exception as e:
        print(f"Error processing data governance in {url}: {e}")
        return None

In [None]:
df_platforms['governance_explicit'] = df_platforms['platform_url'].apply(extract_data_governance_policy_link)

Error processing data governance in https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c6cfa7d0>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [None]:
df_platforms[['platform_url', 'governance_explicit']]

Unnamed: 0,platform_url,governance_explicit
0,observation.org,https://observation.org/data-governance
1,seasearch.org.uk,
2,natuurpunt.be,
3,slu.se/artdatabanken,
4,theroadlab.co.uk,
5,exploreyourshore.ie,
6,eyeonwater.org,
7,iseahorse.org,
8,redpromar.org,
9,coastwards.org,


In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# api_availability

In [None]:
def extract_api_url(url):
    """
    Extracts the API URL or documentation URL from a platform.
    Searches for API documentation, endpoints, and common API indicators.
    
    Args:
        url (str): Platform URL
        
    Returns:
        str: URL to API documentation/endpoint if found, None otherwise
    """
    try:
        # Ensure URL has protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        # Configure headers to avoid blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        # Make HTTP request
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # API-related terms to search for
        api_terms = [
            # High priority terms
            'api documentation', 'api docs', 'api reference', 'api guide',
            'developer documentation', 'developers', 'rest api',
            
            # Medium priority terms
            'api', 'web api', 'json api', 'restful api', 'graphql',
            'api endpoint', 'api access', 'api integration', 'sdk',
            
            # Spanish
            'documentación api', 'guía api', 'desarrolladores',
            
            # French
            'documentation api', 'guide api', 'développeurs',
            
            # German
            'api dokumentation', 'entwickler',
            
            # Dutch
            'api documentatie', 'ontwikkelaars'
        ]
        
        found_api_links = []
        
        # 1. Search for direct API links in navigation and content
        for link in soup.find_all('a', href=True):
            link_text = link.get_text(strip=True).lower()
            href = link.get('href')
            
            # Check if link text contains API terms
            for api_term in api_terms:
                if api_term in link_text:
                    # Convert to absolute URL if relative
                    if href.startswith('/'):
                        api_url = urljoin(url, href)
                    elif href.startswith('http'):
                        api_url = href
                    else:
                        api_url = urljoin(url, '/' + href)
                    
                    # Add priority score based on term importance
                    priority = 3 if api_term in ['api documentation', 'api docs', 'developers', 'api reference'] else 2
                    found_api_links.append((api_url, api_term, link_text, priority))
        
        # 2. Check common API endpoint paths
        common_api_paths = [
            # High priority paths
            '/api/docs', '/api-docs', '/docs/api', '/documentation/api',
            '/developers', '/developer',
            
            # Medium priority paths
            '/api', '/api/v1', '/api/v2', '/swagger', '/docs',
            '/documentation', '/rest', '/restapi', '/graphql',
            
            # Lower priority paths
            '/api.html', '/api.php', '/help/api', '/support/api',
            '/reference/api'
        ]
        
        for i, path in enumerate(common_api_paths):
            test_url = urljoin(url, path)
            try:
                test_response = requests.head(test_url, headers=headers, timeout=5)
                if test_response.status_code == 200:
                    # Higher priority for paths at the beginning of the list
                    priority = 3 if i < 5 else (2 if i < 10 else 1)
                    found_api_links.append((test_url, 'common_path', path, priority))
            except:
                continue
        
        # 3. Sort by priority and return the best match
        if found_api_links:
            # Sort by priority (highest first) and then by term quality
            found_api_links.sort(key=lambda x: (-x[3], x[1] != 'common_path'))
            return found_api_links[0][0]
        
        return None
        
    except Exception as e:
        print(f"Error extracting API URL from {url}: {e}")
        return None

In [None]:
# Apply the API extraction function to all platforms
df_platforms['api'] = df_platforms['platform_url'].apply(extract_api_url)

Error extracting API URL from https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c6da2590>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [None]:
df_platforms

Unnamed: 0,platform_url,active,platform_about,year_creation,terms_link,privacy_link,code_repository,programming_language,governance_explicit,api
0,observation.org,Status code: 200,https://observation.org/about,2002,https://observation.org/terms,https://observation.org/privacy,https://github.com/TecharoHQ/anubis,Python,https://observation.org/data-governance,https://observation.org/api-docs
1,seasearch.org.uk,Status code: 200,https://www.seasearch.org.uk/about,1999,https://www.seasearch.org.uk/website-terms-con...,https://www.seasearch.org.uk/privacy,,Ruby,,
2,natuurpunt.be,Status code: 200,https://natuurpunt.be/dit-is-natuurpunt,2001,https://natuurpunt.be/algemene-voorwaarden-en-...,https://natuurpunt.be/algemene-voorwaarden-en-...,,,,
3,slu.se/artdatabanken,Status code: 200,,1989,,,,ASP.NET,,
4,theroadlab.co.uk,Status code: 200,https://www.theroadlab.co.uk/about,2022,,,,JavaScript,,
5,exploreyourshore.ie,Status code: 200,https://exploreyourshore.ie/about-explore-your...,2019,https://exploreyourshore.ie/legals/,https://exploreyourshore.ie/cookie-policy/,,PHP,,
6,eyeonwater.org,Status code: 200,https://eyeonwater.org/about-us,2015,https://eyeonwater.org/privacy-policy,https://eyeonwater.org/privacy-policy,,,,
7,iseahorse.org,Status code: 200,https://projectseahorse.org/about-us/,2012,https://projectseahorse.org/regulating-trade/c...,https://projectseahorse.org/privacy-statement-ca/,,PHP,,
8,redpromar.org,Status code: 200,,2020,https://redpromar.org/sightings,https://redpromar.org/legal-notice,,Ruby,,https://redpromar.org/api/v1
9,coastwards.org,Fallo: HTTPSConnectionPool(host='coastwards.or...,,2016,,,,,,


In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

In [None]:
import time
from urllib.parse import parse_qs, unquote, urlparse
import requests
from bs4 import BeautifulSoup

def duckduckgo_search(query):
    search = f"{query} api"
    headers = {"User-Agent": "Mozilla/5.0"}
    search_url = f"https://lite.duckduckgo.com/lite/?q={search.replace(' ', '+')}"

    response = requests.get(search_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    # Buscar el primer enlace resultante
    first_link = soup.find("a", class_="result-link")
    if first_link and "href" in first_link.attrs:
        raw_href = first_link["href"]
        parsed = urlparse(raw_href)
        query_params = parse_qs(parsed.query)
        if "uddg" in query_params:
            time.sleep(5)
            return unquote(query_params["uddg"][0])  # Enlace decodificado
        else:
            time.sleep(5)
            return raw_href  # Por si acaso ya es directo
    time.sleep(5)

In [None]:
df_platforms['api_ddg'] = df_platforms['platform_url'].apply(duckduckgo_search)

In [None]:
df_platforms[["platform_url", 'api_ddg']]

Unnamed: 0,platform_url,api_ddg
0,observation.org,https://observation.org/api/docs/
1,seasearch.org.uk,https://www.seasearch.org.uk/
2,natuurpunt.be,https://www.natuurpunt.be/snuit/aanspreekpunt-...
3,slu.se/artdatabanken,https://www.slu.se/artdatabanken/rapportering-...
4,theroadlab.co.uk,https://www.theroadlab.co.uk/
5,exploreyourshore.ie,https://exploreyourshore.ie/
6,eyeonwater.org,https://www.eyeonwater.org/api/
7,iseahorse.org,https://github.com/zeke/seahorse
8,redpromar.org,https://redpromar.org/
9,coastwards.org,https://github.com/maureentsakiris/coastwards


In [None]:
api_dict = {
    "eyeonwater.org": "https://www.eyeonwater.org/api/",
    "iseahorse.org": "https://github.com/zeke/seahorse",
    "coastwards.org": "https://github.com/maureentsakiris/coastwards",
}
for k,v in api_dict.items():
    df_platforms.loc[df_platforms['platform_url'] == k, "api"] = v

In [None]:
df_platforms["api"]

0                 https://observation.org/api-docs
1                                             None
2                                             None
3                                             None
4                                             None
5                                             None
6                  https://www.eyeonwater.org/api/
7                 https://github.com/zeke/seahorse
8                     https://redpromar.org/api/v1
9    https://github.com/maureentsakiris/coastwards
Name: api, dtype: object

# organization_of_managers

In [None]:
def extract_organization_from_whois(url):
    """
    Extracts the organization name from WHOIS data of the domain.
    This is more reliable than parsing website content.
    
    Args:
        url (str): Platform URL
        
    Returns:
        str: Organization name from WHOIS data if found, None otherwise
    """
    try:
        import whois
        
        # Ensure URL has protocol and extract domain
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        # Parse domain from URL
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        
        # Remove www prefix if present
        if domain.startswith('www.'):
            domain = domain[4:]
        
        # Get WHOIS information
        domain_info = whois.whois(domain)
        
        # Try different fields that might contain organization info
        org_fields = ['org', 'organization', 'registrant_organization', 'registrant', 'admin_organization']
        
        for field in org_fields:
            if hasattr(domain_info, field):
                org_value = getattr(domain_info, field)
                if org_value:
                    # Handle case where value might be a list
                    if isinstance(org_value, list):
                        org_value = org_value[0] if org_value else None
                    
                    if org_value and isinstance(org_value, str):
                        # Clean the organization name
                        org_clean = org_value.strip()
                        
                        # Skip if it's just privacy protection or empty
                        privacy_indicators = [
                            'privacy', 'protection', 'private', 'whoisguard', 'proxy',
                            'redacted', 'data protected', 'contact privacy'
                        ]
                        
                        if (org_clean and 
                            len(org_clean) > 2 and 
                            not any(indicator in org_clean.lower() for indicator in privacy_indicators)):
                            return org_clean
        
        # If no organization found in standard fields, try registrant name
        if hasattr(domain_info, 'registrant_name') and domain_info.registrant_name:
            registrant = domain_info.registrant_name
            if isinstance(registrant, list):
                registrant = registrant[0] if registrant else None
            
            if registrant and isinstance(registrant, str):
                registrant_clean = registrant.strip()
                privacy_indicators = [
                    'privacy', 'protection', 'private', 'whoisguard', 'proxy',
                    'redacted', 'data protected', 'contact privacy'
                ]
                
                if (registrant_clean and 
                    len(registrant_clean) > 2 and 
                    not any(indicator in registrant_clean.lower() for indicator in privacy_indicators)):
                    return registrant_clean
        
        return None
        
    except Exception as e:
        print(f"Error extracting organization from WHOIS for {url}: {e}")
        return None

In [None]:
# Apply the WHOIS-based organization extraction function to all platforms
df_platforms['organization_of_managers'] = df_platforms['platform_url'].apply(extract_organization_from_whois)

In [None]:
df_platforms['organization_of_managers']

0                        None
1                        None
2                        None
3            fresve9742-00001
4                        None
5                        None
6                    MARIS BV
7                        None
8    CARTOGRAFICA+CANARIAS+SA
9                        None
Name: organization_of_managers, dtype: object

In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

Search for another strategy.

# platform_license

In [None]:
import re
import requests
from bs4 import BeautifulSoup

def detect_license(text):
    text = text.replace('\n', ' ').replace('\r', ' ').lower()

    patterns = [
        # Common copyright (symbol or world)
        r'©\s*\d{4}[^\.\n]*',
        r'copyright\s*©?\s*\d{4}[^\.\n]*',
        r'copyright\s+[\w\s\.\-&,]*\d{4}[^\.\n]*',
        r'[^\.]*©\s*\d{4}',  # Match text before and after © symbol   
        r'powered by.*?©\s*\d{4}',
        r'todos los derechos reservados',
        r'all rights reserved',
        r'tous droits réservés',
        r'alle rechte vorbehalten',
        r'tutti i diritti riservati',

        # Creative Commons
        r'creative commons[^<\n]{0,100}',
        r'cc\s*(by|by-sa|by-nc|by-nd|0)[\s\-0-9\.]*',

        # Other licenses
        r'mit license',
        r'licensed under the mit license',
        r'gpl license',
        r'licensed under the gpl',
        r'bsd license',
        r'apache license',
        r'european union public licence',
        r'licencia.*mit',
        r'licencia.*gpl',
        r'licencia.*apache',
        r'licencia.*bsd',
        r'licenza.*',
        r'lizenz.*',
        r'licence.*'
    ]

    for pat in patterns:
        match = re.search(pat, text, re.IGNORECASE)
        if match:
            return match.group(0).strip()

    return None

def extract_license(url):
    try:
        # Ensure URL has protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        resp = requests.get(url, timeout=10, headers=headers)
        soup = BeautifulSoup(resp.text, 'html.parser')
        texto = soup.get_text(separator=' ', strip=True)
        return detect_license(texto)
    except Exception as e:
        return None

In [None]:
df_platforms['platform_license'] = df_platforms['platform_url'].apply(extract_license)

In [None]:
df_platforms[["platform_url", 'platform_license']]

Unnamed: 0,platform_url,platform_license
0,observation.org,
1,seasearch.org.uk,© 2025 seasearch/mcs uk join our free trial ge...
2,natuurpunt.be,© 2025 natuurpunt footer menu cookieverklaring...
3,slu.se/artdatabanken,
4,theroadlab.co.uk,© 2022 by roadlab
5,exploreyourshore.ie,
6,eyeonwater.org,© 2025 maris b
7,iseahorse.org,© 2025 project seahorse
8,redpromar.org,© 2025 redpromar aviso legal política de priva...
9,coastwards.org,


observation.org is working with anti-bot technology that deny capturing information using requests. Selenium or playwright would be necessary.

In [None]:
# Test for web with not copyright license
extract_license("https://minka-sdg.org")

'creative commons attribution 4.0 license. minka incorporates consent commons icons, allowing for easy visual compreh'

In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# platform_email

In [None]:
# Search email in home
def get_email(url):
    try:
        # Ensure URL has protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, timeout=10, headers=headers)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        emails = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)
        return emails[0] if emails else None
    except Exception as e:
        return None

In [None]:
df_platforms['platform_email'] = df_platforms['platform_url'].apply(get_email)
df_platforms.platform_email

0                           None
1                           None
2                           None
3                           None
4                           None
5    info@biodiversityireland.ie
6                           None
7       info@projectseahorse.org
8                           None
9                           None
Name: platform_email, dtype: object

In [None]:
def get_email_from_possible_pages(base_url):
    contact_paths = [
        '', '/contact', '/contact-us', '/contacto', '/a-propos', '/kontakt',
        '/contatti', '/contato', '/assistance', '/chi-siamo', '/uber-uns',
        '/soporte', '/support', '/supporto', '/ajuda', '/acerca-de', '/sobre',
        '/hilfe', '/steun', '/over-ons'
    ]
    for path in contact_paths:
        email = get_email(base_url.rstrip('/') + path)
        if email:
            return email
    return None

df_platforms['platform_email'] = df_platforms['platform_url'].apply(get_email_from_possible_pages)

In [None]:
df_platforms['platform_email']

0                                              None
1                                              None
2                                              None
3                                              None
4                                              None
5                       info@biodiversityireland.ie
6                                              None
7                          info@projectseahorse.org
8    redpromar.medioambiente@gobiernodecanarias.org
9                                              None
Name: platform_email, dtype: object

In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# country_of_managers

In [None]:
import whois

def get_country_from_whois(domain):
    try:
        w = whois.whois(domain)
        return w.get('country')
    except:
        return None

df_platforms['country_from_whois'] = df_platforms['platform_url'].apply(get_country_from_whois)

In [None]:
df_platforms['country_from_whois']

0      GB
1    None
2    None
3    None
4    None
5    None
6      NL
7      CA
8      ES
9      US
Name: country_from_whois, dtype: object

- Determine the country from which a website operates based on the server's IP address, using a geolocation API. This allows us to know where the servers (the data) are physically hosted. Example: a French organization may have its servers in the US.

In [None]:
df_platforms.platform_url

0         observation.org
1        seasearch.org.uk
2           natuurpunt.be
3    slu.se/artdatabanken
4        theroadlab.co.uk
5     exploreyourshore.ie
6          eyeonwater.org
7           iseahorse.org
8           redpromar.org
9          coastwards.org
Name: platform_url, dtype: object

In [None]:
# Example with ip-api.com
from urllib.parse import urlparse
import socket
import requests

def country_from_ip(url):
    try:
        if url == "slu.se/artdatabanken":
            url = "slu.se"
        # Extract domain (without http/https or routes)
        parsed = urlparse(url)
        domain = parsed.netloc or parsed.path  # in case the URL doesn't have a scheme

        # Remove www if necessary
        if domain.startswith('www.'):
            domain = domain[4:]

        # Resolve IP
        ip = socket.gethostbyname(domain)

        # Query geolocalization API
        r = requests.get(f"http://ip-api.com/json/{ip}", timeout=10)
        if r.status_code == 200:
            time.sleep(1.4) # limit of 45 requests per minute
            return r.json().get("country")
    except Exception as e:
        return None
    
df_platforms['country_servers'] = df_platforms['platform_url'].apply(country_from_ip)

In [None]:
df_platforms['country_servers']

0    The Netherlands
1             Canada
2             Canada
3             Sweden
4      United States
5            Ireland
6    The Netherlands
7             France
8              Spain
9            Germany
Name: country_servers, dtype: object

In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# data_download

In [None]:
def extract_data_download_options(url):
    """
    Detects if a platform offers data download options.
    Searches for download links, export options, and data access features.
    
    Args:
        url (str): Platform URL
        
    Returns:
        str: Description of download options if found, None otherwise
    """
    try:
        # Ensure URL has protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        page_text = soup.get_text().lower()
        
        # Check for anti-bot protection
        if 'botstopper' in page_text or 'checking if you are not a bot' in page_text:
            return "Protected by anti-bot"
        
        download_options = []
        
        # Terms related to data download/export
        download_terms = [
            # English
            'download data', 'export data', 'data download', 'data export',
            'download dataset', 'export dataset', 'bulk download', 'batch download',
            'csv download', 'excel download', 'json download', 'xml download',
            'download observations', 'export observations', 'data access',
            'download records', 'export records', 'get data', 'api download',
            
            # Spanish
            'descargar datos', 'exportar datos', 'descarga de datos',
            'descargar registros', 'exportar registros',
            
            # French
            'télécharger données', 'exporter données', 'téléchargement données',
            
            # German
            'daten herunterladen', 'daten exportieren', 'datendownload',
            
            # Dutch
            'data downloaden', 'gegevens downloaden', 'data exporteren'
        ]
        
        # Search for download terms in text
        found_terms = []
        for term in download_terms:
            if term in page_text:
                found_terms.append(term)
        
        # Search for download links
        download_links = []
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            href_lower = href.lower()
            link_text = link.get_text(strip=True).lower()
            
            # Check for download-related URLs
            download_patterns = [
                'download', 'export', 'csv', 'excel', 'json', 'xml', 
                'data.csv', 'data.json', 'data.xml', '.zip', 'bulk'
            ]
            
            if any(pattern in href_lower for pattern in download_patterns):
                # Convert to absolute URL if relative
                if href.startswith('/'):
                    absolute_url = urljoin(url, href)
                elif href.startswith('http'):
                    absolute_url = href
                else:
                    absolute_url = urljoin(url, '/' + href)
                download_links.append(f"{absolute_url}")
            elif any(pattern in link_text for pattern in download_patterns):
                # Convert to absolute URL if relative
                if href.startswith('/'):
                    absolute_url = urljoin(url, href)
                elif href.startswith('http'):
                    absolute_url = href
                else:
                    absolute_url = urljoin(url, '/' + href)
                download_links.append(f"{absolute_url}")
        
        # Search for file format mentions
        file_formats = []
        format_patterns = [
            r'csv\s+format', r'excel\s+format', r'json\s+format', 
            r'xml\s+format', r'\.csv', r'\.xlsx?', r'\.json', r'\.xml'
        ]
        
        for pattern in format_patterns:
            matches = re.findall(pattern, page_text)
            if matches:
                file_formats.extend(matches)
        
        # Search for API endpoints that might provide data
        api_endpoints = []
        if '/api' in page_text or 'rest api' in page_text or 'graphql' in page_text:
            api_endpoints.append("API available")
        
        # Compile results
        if found_terms or download_links or file_formats or api_endpoints:
            result_parts = []
            
            if found_terms:
                result_parts.append(f"{', '.join(set(found_terms[:3]))}")  # Limit to 3
            
            if download_links:
                result_parts.append(f"{', '.join(download_links[:2])}")  # Limit to 2
            
            if file_formats:
                result_parts.append(f"{', '.join(set(file_formats[:3]))}")  # Limit to 3
            
            if api_endpoints:
                result_parts.append("API: Available")
            
            return " | ".join(result_parts)
        
        return None
        
    except Exception as e:
        print(f"Error checking download options for {url}: {e}")
        return None

In [None]:
df_platforms['data_download_options'] = df_platforms['platform_url'].apply(extract_data_download_options)

Error checking download options for https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c49120d0>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [None]:
df_platforms[['platform_url', 'data_download_options']]

Unnamed: 0,platform_url,data_download_options
0,observation.org,Protected by anti-bot
1,seasearch.org.uk,
2,natuurpunt.be,https://natuurpunt.be/natuurgebieden
3,slu.se/artdatabanken,
4,theroadlab.co.uk,https://www.theroadlab.co.uk/download
5,exploreyourshore.ie,
6,eyeonwater.org,
7,iseahorse.org,
8,redpromar.org,
9,coastwards.org,


In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# data_standards

In [None]:
def detect_data_standards(url):
    """
    Detects data standards mentioned on a platform.
    Searches for common data standards like Darwin Core, Dublin Core, etc.
    
    Args:
        url (str): Platform URL
        
    Returns:
        str: List of detected data standards, None if none found
    """
    try:
        # Ensure URL has protocol
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        page_text = soup.get_text().lower()
        
        # Check for anti-bot protection
        if 'botstopper' in page_text or 'checking if you are not a bot' in page_text:
            return "Protected by anti-bot"
        
        # Common data standards to search for
        data_standards = {
            # Biodiversity standards
            'Darwin Core': ['darwin core', 'darwincore', 'dwc'],
            'ABCD': ['abcd', 'access to biological collection data'],
            'EML': ['eml', 'ecological metadata language'],
            'GBIF': ['gbif', 'global biodiversity information facility'],
            
            # General metadata standards
            'Dublin Core': ['dublin core', 'dublincore', 'dc metadata'],
            'ISO 19115': ['iso 19115', 'iso19115', 'geographic information metadata'],
            'DCAT': ['dcat', 'data catalog vocabulary'],
            'Schema.org': ['schema.org', 'schema org', 'structured data'],
            
            # Scientific data standards
            'FAIR': ['fair data', 'fair principles', 'findable accessible interoperable reusable'],
            'DataCite': ['datacite', 'data cite', 'doi metadata'],
            'ORCID': ['orcid', 'open researcher and contributor id'],
            'DOI': ['digital object identifier', 'doi'],
            
            # Environmental/Earth sciences
            'CF Conventions': ['cf conventions', 'climate and forecast', 'netcdf'],
            'ISO 19139': ['iso 19139', 'iso19139'],
            'OGC': ['ogc', 'open geospatial consortium'],
            'WMS': ['web map service', 'wms'],
            'WFS': ['web feature service', 'wfs'],
            
            # Data exchange formats
            'JSON-LD': ['json-ld', 'json linked data'],
            'RDF': ['rdf', 'resource description framework'],
            'OWL': ['owl', 'web ontology language'],
            'SKOS': ['skos', 'simple knowledge organization system'],
            
            # Marine/Ocean standards
            'OBIS': ['obis', 'ocean biogeographic information system'],
            'ICES': ['ices', 'international council for the exploration of the sea'],
            'SeaDataNet': ['seadatanet', 'sea data net'],
            
            # Research data standards
            'CEDARS': ['cedars', 'comprehensive extensible data archival and retrieval system'],
            'DataVerse': ['dataverse', 'data verse'],
            'Zenodo': ['zenodo'],
            'Figshare': ['figshare'],
        }
        
        found_standards = []
        
        # Search for each standard
        for standard_name, terms in data_standards.items():
            for term in terms:
                if term in page_text:
                    found_standards.append(standard_name)
                    break  # Found this standard, move to next
        
        # Also search in meta tags and structured data
        meta_standards = []
        
        # Check meta tags
        for meta in soup.find_all('meta'):
            content = meta.get('content', '').lower()
            name = meta.get('name', '').lower()
            
            for standard_name, terms in data_standards.items():
                for term in terms:
                    if term in content or term in name:
                        meta_standards.append(standard_name)
                        break
        
        # Check for structured data (JSON-LD, microdata)
        json_ld = soup.find_all('script', type='application/ld+json')
        if json_ld:
            found_standards.append('JSON-LD')
        
        # Check for microdata
        if soup.find_all(attrs={'itemtype': True}):
            found_standards.append('Microdata')
        
        # Combine and deduplicate
        all_standards = list(set(found_standards + meta_standards))
        
        if all_standards:
            return ', '.join(sorted(all_standards))
        
        return None
        
    except Exception as e:
        print(f"Error detecting data standards for {url}: {e}")
        return None

In [None]:
df_platforms['data_standards'] = df_platforms['platform_url'].apply(detect_data_standards)

Error detecting data standards for https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c4db8590>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [None]:
df_platforms[['platform_url', 'data_standards']]

Unnamed: 0,platform_url,data_standards
0,observation.org,Protected by anti-bot
1,seasearch.org.uk,DOI
2,natuurpunt.be,
3,slu.se/artdatabanken,
4,theroadlab.co.uk,JSON-LD
5,exploreyourshore.ie,"JSON-LD, OWL"
6,eyeonwater.org,
7,iseahorse.org,"ICES, JSON-LD"
8,redpromar.org,
9,coastwards.org,


- Biodiversity Standards: Darwin Core, ABCD, EML, GBIF
- General Metadata Standards: Dublin Core, ISO 19115, DCAT, Schema.org
- Scientific Standards: FAIR, DataCite, ORCID, DOI
- Environmental/Geospatial Standards: CF Conventions, OGC, WMS, WFS
- Data Exchange Formats: JSON-LD, RDF, OWL, SKOS
- Marine Standards: OBIS, ICES, SeaDataNet
- Data Repositories: DataVerse, Zenodo, Figshare

In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)

# data_license

In [None]:
def detect_data_license(url):
    """
    Detects licensing information for data on platforms.
    Uses a broad approach to capture any licensing mentions.
    
    Args:
        url (str): Platform URL
        
    Returns:
        str: License info if found, guidance if not found
    """
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code != 200:
            return "Page not accessible"
            
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text().lower()
        
        # Check for anti-bot
        if 'botstopper' in text or 'checking if you are not a bot' in text:
            return "Protected by anti-bot"
        
        found_licenses = []
        
        # 1. Creative Commons (broad search)
        if 'creative commons' in text:
            if 'cc by-sa' in text or 'attribution-sharealike' in text:
                found_licenses.append('CC BY-SA')
            elif 'cc by-nc-sa' in text:
                found_licenses.append('CC BY-NC-SA')
            elif 'cc by-nc' in text or 'noncommercial' in text:
                found_licenses.append('CC BY-NC')
            elif 'cc by' in text or 'attribution' in text:
                found_licenses.append('CC BY')
            elif 'cc0' in text or 'public domain' in text:
                found_licenses.append('CC0')
            else:
                found_licenses.append('Creative Commons')
        
        # 2. Other open licenses
        if 'open data commons' in text or 'odc' in text:
            found_licenses.append('Open Data Commons')
        if 'open database license' in text or 'odbl' in text:
            found_licenses.append('ODbL')
        if 'open government' in text and 'licen' in text:
            found_licenses.append('Open Government Licence')
        
        # 3. General usage terms (very broad)
        if 'open access' in text:
            found_licenses.append('Open Access')
        if 'freely available' in text or 'free to use' in text:
            found_licenses.append('Free Use')
        if 'public domain' in text and 'creative commons' not in text:
            found_licenses.append('Public Domain')
        if 'non-commercial' in text or 'noncommercial' in text:
            found_licenses.append('Non-Commercial Use')
        if 'academic use' in text or 'educational use' in text:
            found_licenses.append('Academic Use')
        if 'research use' in text or 'research purposes' in text:
            found_licenses.append('Research Use')
        if 'attribution required' in text:
            found_licenses.append('Attribution Required')
        if 'all rights reserved' in text:
            found_licenses.append('All Rights Reserved')
        
        # 4. Look for any mention of "license" or "licence"
        if not found_licenses:
            if 'license' in text or 'licence' in text:
                found_licenses.append('Licensed (unspecified)')
        
        # 5. Check for CC images
        if not found_licenses:
            for img in soup.find_all('img'):
                src = img.get('src', '').lower()
                alt = img.get('alt', '').lower()
                if any(cc in src or cc in alt for cc in ['creativecommons', 'cc-by', 'cc0']):
                    found_licenses.append('Creative Commons (Badge)')
                    break
        
        # 6. Check for CC links
        if not found_licenses:
            for link in soup.find_all('a', href=True):
                href = link.get('href').lower()
                if 'creativecommons.org' in href:
                    found_licenses.append('Creative Commons (Link)')
                    break
        
        # Return results
        if found_licenses:
            # Remove duplicates while preserving order
            unique_licenses = []
            seen = set()
            for license_name in found_licenses:
                if license_name not in seen:
                    seen.add(license_name)
                    unique_licenses.append(license_name)
            
            return ', '.join(unique_licenses[:3])
        
        # If no licenses found, check if we can access terms pages
        try:
            terms_response = requests.head(url + '/terms', headers=headers, timeout=5)
            if terms_response.status_code == 200:
                return "Check terms/legal pages"
        except:
            pass
        
        return "No license info found"
        
    except Exception as e:
        print(f"Error detecting license for {url}: {e}")
        return None

In [None]:
df_platforms['data_license'] = df_platforms['platform_url'].apply(detect_data_license)

Error detecting license for https://coastwards.org: HTTPSConnectionPool(host='coastwards.org', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff0c4f1e890>: Failed to establish a new connection: [Errno 111] Conexión rehusada'))


In [None]:
df_platforms[['platform_url', 'data_license']]

Unnamed: 0,platform_url,data_license
0,observation.org,Protected by anti-bot
1,seasearch.org.uk,No license info found
2,natuurpunt.be,No license info found
3,slu.se/artdatabanken,No license info found
4,theroadlab.co.uk,No license info found
5,exploreyourshore.ie,No license info found
6,eyeonwater.org,No license info found
7,iseahorse.org,All Rights Reserved
8,redpromar.org,Creative Commons (Badge)
9,coastwards.org,


In [None]:
df_platforms.to_csv("../data/df_platforms.csv", index=False)