# Process for extracting data from citizen science apps located in the iOS store

iOS apps are manually selected from `results_ios.csv` and stored in `selected_ios.csv`.

## Creating final dataset with selected iOS apps

In [103]:
import pandas as pd
import requests

In [104]:
# Read selected iOS apps dataset
df = pd.read_csv("../data/selected_ios.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             48 non-null     object
 1   developer        48 non-null     object
 2   description      48 non-null     object
 3   include_exclude  49 non-null     object
 4   comments         1 non-null      object
 5   url              48 non-null     object
 6   language         48 non-null     object
dtypes: object(7)
memory usage: 2.8+ KB


In [105]:
df.drop(columns=["include_exclude", "comments"], inplace=True)

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         48 non-null     object
 1   developer    48 non-null     object
 2   description  48 non-null     object
 3   url          48 non-null     object
 4   language     48 non-null     object
dtypes: object(5)
memory usage: 2.0+ KB


In [107]:
df.head()

Unnamed: 0,name,developer,description,url,language
0,Anecdata.org,MDI Biological Laboratory,Anecdata.org is a free online platform anyone ...,https://apps.apple.com/us/app/anecdata-org/id1...,['EN']
1,ZOE Health Study,Zoe Limited,Help critical research into major health condi...,https://apps.apple.com/us/app/zoe-health-study...,['EN']
2,Leave No Trace Citizen Science,Greg Newman,The Leave No Trace Center for Outdoor Ethics p...,https://apps.apple.com/us/app/leave-no-trace-c...,['EN']
3,BirdNET,Stefan Kahl,How can computers learn to recognize birds fro...,https://apps.apple.com/us/app/birdnet/id154184...,"['CS', 'NL', 'EN', 'FR', 'DE', 'IT', 'LT', 'PL..."
4,GLOBE Observer,NASA,The GLOBE Observer app enables you to make obs...,https://apps.apple.com/us/app/globe-observer/i...,['EN']


In [108]:
df_ios_total = pd.read_csv("../data/results_ios.csv")
df_ios_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          188 non-null    object 
 1   url           188 non-null    object 
 2   developer     188 non-null    object 
 3   rating        188 non-null    float64
 4   currency      188 non-null    object 
 5   description   188 non-null    object 
 6   language      188 non-null    object 
 7   platform_url  129 non-null    object 
dtypes: float64(1), object(7)
memory usage: 11.9+ KB


In [109]:
# Combine to add platform_url (sellerUrl)
df2 = pd.merge(df, df_ios_total[["url", "platform_url"]], how="left")
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          48 non-null     object
 1   developer     48 non-null     object
 2   description   48 non-null     object
 3   url           48 non-null     object
 4   language      48 non-null     object
 5   platform_url  36 non-null     object
dtypes: object(6)
memory usage: 2.4+ KB


In [110]:
# Complementary search of platform_url using google

from googlesearch import search

def get_platform_url(co_name):
    query = co_name
    try:
        # Busca en sitios específicos de ciencia ciudadana
        resultados = list(search(query, num_results=1, lang="en"))
        return resultados[0] if resultados else None
    except Exception as e:
        print(f"Error buscando {co_name}: {e}")
        return None
    
df2['platform_url_google'] = df2['name'].apply(get_platform_url)


In [111]:
df2 = df2[-df2['name'].isnull()]

In [112]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48 entries, 0 to 47
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 48 non-null     object
 1   developer            48 non-null     object
 2   description          48 non-null     object
 3   url                  48 non-null     object
 4   language             48 non-null     object
 5   platform_url         36 non-null     object
 6   platform_url_google  48 non-null     object
dtypes: object(7)
memory usage: 3.0+ KB


In [113]:
df2[['name', "platform_url", "platform_url_google"]]

Unnamed: 0,name,platform_url,platform_url_google
0,Anecdata.org,,https://www.anecdata.org/
1,ZOE Health Study,https://health-study.joinzoe.com/,https://health-study.zoe.com/
2,Leave No Trace Citizen Science,https://lnt.org/our-work/citizen-science/,https://lnt.org/our-work/citizen-science/
3,BirdNET,https://birdnet.cornell.edu,https://birdnet.cornell.edu/
4,GLOBE Observer,,https://observer.globe.gov/
5,Quantified Citizen Pro,https://www.quantifiedcitizen.com/,https://play.google.com/store/apps/details?id=...
6,spotFIRE | Citizen Science,https://www.spotteron.app,https://eu-citizen.science/
7,Brain Explorer,https://www.brainexplorer.net,/search?num=3
8,WaterLinx | Citizen Science,https://www.spotteron.app/,https://eu-citizen.science/
9,The Happiness Project,,https://gretchenrubin.com/books/the-happiness-...


In [114]:
import numpy as np
df2.loc[df2.platform_url_google == "/search?num=3", "platform_url_google"] = np.nan

In [115]:
df2[['name', "url", "platform_url", "platform_url_google"]]

Unnamed: 0,name,url,platform_url,platform_url_google
0,Anecdata.org,https://apps.apple.com/us/app/anecdata-org/id1...,,https://www.anecdata.org/
1,ZOE Health Study,https://apps.apple.com/us/app/zoe-health-study...,https://health-study.joinzoe.com/,https://health-study.zoe.com/
2,Leave No Trace Citizen Science,https://apps.apple.com/us/app/leave-no-trace-c...,https://lnt.org/our-work/citizen-science/,https://lnt.org/our-work/citizen-science/
3,BirdNET,https://apps.apple.com/us/app/birdnet/id154184...,https://birdnet.cornell.edu,https://birdnet.cornell.edu/
4,GLOBE Observer,https://apps.apple.com/us/app/globe-observer/i...,,https://observer.globe.gov/
5,Quantified Citizen Pro,https://apps.apple.com/us/app/quantified-citiz...,https://www.quantifiedcitizen.com/,https://play.google.com/store/apps/details?id=...
6,spotFIRE | Citizen Science,https://apps.apple.com/us/app/spotfire-citizen...,https://www.spotteron.app,https://eu-citizen.science/
7,Brain Explorer,https://apps.apple.com/us/app/brain-explorer/i...,https://www.brainexplorer.net,
8,WaterLinx | Citizen Science,https://apps.apple.com/us/app/waterlinx-citize...,https://www.spotteron.app/,https://eu-citizen.science/
9,The Happiness Project,https://apps.apple.com/us/app/the-happiness-pr...,,https://gretchenrubin.com/books/the-happiness-...


In [116]:
# Fix manually

platforms_mapping = {
    "Anecdata.org": "https://www.anecdata.org/",
    "ZOE Health Study": "https://health-study.zoe.com/",
    "GLOBE Observer": "https://observer.globe.gov/",
    "spotFIRE | Citizen Science": "https://www.spotfire.com/",
    "WaterLinx | Citizen Science": "https://www.spotteron.net/apps/regional-community-science-projects/waterlinx-citizen-science",
    "The Happiness Project": "https://thehappinessproject.app",
    "Zooniverse": "https://www.zooniverse.org/",
    "AmphiApp | Citizen Science": "https://www.spotteron.net/apps/regional-community-science-projects/amphiapp-citizen-science-app-en",
    "YOUCOUNT Youth Citizen Science": "https://www.youcountproject.eu/",
    "HerpMapper": "https://www.herpmapper.org/",
    "WhaleReport": "https://ocean.org/whales/wras/",
    "CoronaReport": "https://www.coronareport.global/",
    "Dark Sky Meter": "https://www.darkskymeter.com/",
    "BirdWeather": "https://www.birdweather.com/",
    "myHAZ-VCT": "https://uwiseismic.com/myhaz-vct/",
    "EyeOnWater - Colour": "https://www.eyeonwater.org/",
    "Tea Bag Index | SPOTTERON": "http://www.teatime4science.org/",
    # "Happiness Quest": "https://www.happinessquest.app/",
    "OpenRadiation": "https://www.openradiation.org/en",
    "Build4People: Citizen Science": "https://build4people.org/",
    "IPM Popillia Pest Management": "https://www.popillia.eu/",
    "BiciZen": "https://www.bicizen.org/",
}

# Keep original value if not in platforms_mapping dictionary
df2['platform_url'] = df2.apply(
    lambda row: platforms_mapping.get(row['name'], row['platform_url']),
    axis=1
)


In [117]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48 entries, 0 to 47
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 48 non-null     object
 1   developer            48 non-null     object
 2   description          48 non-null     object
 3   url                  48 non-null     object
 4   language             48 non-null     object
 5   platform_url         45 non-null     object
 6   platform_url_google  43 non-null     object
dtypes: object(7)
memory usage: 3.0+ KB


In [118]:
# Drop auxiliary column with google search 
df2.drop(columns=["platform_url_google"], inplace=True)

In [119]:
# No platform_url for this apps

df2[df2['platform_url'].isnull()]

Unnamed: 0,name,developer,description,url,language,platform_url
27,Mass Science,King's Mobile,Mass Science is a research platform and app wh...,https://apps.apple.com/us/app/mass-science/id1...,['EN'],
31,Happiness Quest,Robb Rutledge,Happiness is really complicated and scientists...,https://apps.apple.com/us/app/happiness-quest/...,['EN'],
45,Discovery Tool Our Voice,Stanford University,"In the Our Voice approach, Citizen Scientists ...",https://apps.apple.com/us/app/discovery-tool-o...,['EN'],


In [120]:
df_selected = df2[df2['platform_url'].notnull()].reset_index(drop=True)

In [121]:
df_selected.to_csv("../data/selected_ios.csv", index=False)

# Process with selected iOS apps

In [2]:
import pandas as pd
import requests

In [3]:
df = pd.read_csv("../data/selected_ios.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          45 non-null     object
 1   developer     45 non-null     object
 2   description   45 non-null     object
 3   url           45 non-null     object
 4   language      45 non-null     object
 5   platform_url  45 non-null     object
dtypes: object(6)
memory usage: 2.2+ KB


In [5]:
df.head()

Unnamed: 0,name,developer,description,url,language,platform_url
0,Anecdata.org,MDI Biological Laboratory,Anecdata.org is a free online platform anyone ...,https://apps.apple.com/us/app/anecdata-org/id1...,['EN'],https://www.anecdata.org/
1,ZOE Health Study,Zoe Limited,Help critical research into major health condi...,https://apps.apple.com/us/app/zoe-health-study...,['EN'],https://health-study.zoe.com/
2,Leave No Trace Citizen Science,Greg Newman,The Leave No Trace Center for Outdoor Ethics p...,https://apps.apple.com/us/app/leave-no-trace-c...,['EN'],https://lnt.org/our-work/citizen-science/
3,BirdNET,Stefan Kahl,How can computers learn to recognize birds fro...,https://apps.apple.com/us/app/birdnet/id154184...,"['CS', 'NL', 'EN', 'FR', 'DE', 'IT', 'LT', 'PL...",https://birdnet.cornell.edu
4,GLOBE Observer,NASA,The GLOBE Observer app enables you to make obs...,https://apps.apple.com/us/app/globe-observer/i...,['EN'],https://observer.globe.gov/


## active

Check status code.

In [6]:
def check_website(url):
    try:
        session = requests.Session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://www.google.com/"
        })
        response = session.get(url, timeout=5)
        return f"Status code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        return f"Fallo: {str(e)}"
    
df["active"] = df['platform_url'].apply(check_website)

In [7]:
df.active.value_counts()

active
Status code: 200    42
Status code: 403     2
Status code: 404     1
Name: count, dtype: int64

In [8]:
# Forbidden: we fix urls
df[df.active == "Status code: 403"]

Unnamed: 0,name,developer,description,url,language,platform_url,active
33,Big Butterfly Count,Butterfly Conservation Trading Ltd,Take part in the UK’s annual Big Butterfly Cou...,https://apps.apple.com/us/app/big-butterfly-co...,['EN'],http://bigbutterflycount.org,Status code: 403
42,ZSL Instant Wild,Zoological Society of London (ZSL),ZSL’s Instant Wild app empowers you to take pa...,https://apps.apple.com/us/app/zsl-instant-wild...,['EN'],https://www.zsl.org/instant-wild-about,Status code: 403


In [9]:
# Fix urls manually
df.loc[df.name == "Big Butterfly Count", "platform_url"] = "https://bigbutterflycount.butterfly-conservation.org/"
df.loc[df.name == "ZSL Instant Wild", "platform_url"] = "https://instantwild.zsl.org/"

In [10]:
df[df.active == "Status code: 404"]

Unnamed: 0,name,developer,description,url,language,platform_url,active
21,Sealife Tracker,Natural Apptitude,The Sealife Tracker project aims to collect mu...,https://apps.apple.com/us/app/sealife-tracker/...,['EN'],http://www.natural-apptitude.co.uk/project/sea...,Status code: 404


In [13]:
df["active"] = df['platform_url'].apply(check_website)
df.active.value_counts()

active
Status code: 200    43
Status code: 404     1
Status code: 403     1
Name: count, dtype: int64

In [14]:
df[df["active"] == "Status code: 403"]

Unnamed: 0,name,developer,description,url,language,platform_url,active
33,Big Butterfly Count,Butterfly Conservation Trading Ltd,Take part in the UK’s annual Big Butterfly Cou...,https://apps.apple.com/us/app/big-butterfly-co...,['EN'],https://bigbutterflycount.butterfly-conservati...,Status code: 403


Page is working. Code returned is 403, but content is displayed correctly. 

## year_creation

In [15]:
import whois

def get_domain_creation_year(url):
    try:
        domain = whois.whois(url)
        creation_date = domain.creation_date
        if isinstance(creation_date, list):  # Some WHOIS return multiple dates
            creation_date = creation_date[0]
        return creation_date.year
    except Exception as e:
        return None

year = get_domain_creation_year("minka-sdg.org")
print(f"Year of creation of the domain: {year}")

Year of creation of the domain: 2021


In [16]:
df['year_creation'] = df['platform_url'].apply(get_domain_creation_year)

2025-07-16 11:50:05,985 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Nombre o servicio desconocido


In [None]:
df[["name", "platform_url", 'year_creation']].head(10)

Unnamed: 0,name,platform_url,year_creation
0,Anecdata.org,https://www.anecdata.org/,2013.0
1,ZOE Health Study,https://health-study.zoe.com/,1995.0
2,Leave No Trace Citizen Science,https://lnt.org/our-work/citizen-science/,1995.0
3,BirdNET,https://birdnet.cornell.edu,1985.0
4,GLOBE Observer,https://observer.globe.gov/,1997.0
5,Quantified Citizen Pro,https://www.quantifiedcitizen.com/,2017.0
6,spotFIRE | Citizen Science,https://www.spotfire.com/,1996.0
7,Brain Explorer,https://www.brainexplorer.net,2019.0
8,WaterLinx | Citizen Science,https://www.spotteron.net/apps/regional-commun...,2016.0
9,The Happiness Project,https://thehappinessproject.app,


Waterlinx is inside spotteron website. `year_creation` is for the main url:
https://www.spotteron.net/apps/regional-community-science-projects/waterlinx-citizen-science

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           45 non-null     object 
 1   developer      45 non-null     object 
 2   description    45 non-null     object 
 3   url            45 non-null     object 
 4   language       45 non-null     object 
 5   platform_url   45 non-null     object 
 6   active         45 non-null     object 
 7   year_creation  41 non-null     float64
dtypes: float64(1), object(7)
memory usage: 2.9+ KB


## about

In [20]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Diccionario de términos "about" en diferentes idiomas
ABOUT_TERMS = {
    'en': ['about', 'about-us', 'aboutus', 'who-we-are', 'mission'],
    'es': ['sobre', 'sobre-nosotros', 'quienes-somos', 'nosotros'],
    'fr': ['a-propos', 'qui-sommes-nous', 'apropos'],
    'de': ['uber-uns', 'ueber-uns', 'uberuns'],
    'it': ['chi-siamo', 'su-di-noi'],
    'pt': ['sobre', 'sobre-nos', 'quem-somos']
}

def find_about_url(base_url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Buscar en todos los enlaces
        all_links = [a.get('href') for a in soup.find_all('a', href=True)]
        
        # Convertir a URLs absolutas y filtrar None
        absolute_links = []
        for link in all_links:
            if link:
                try:
                    absolute_links.append(urljoin(base_url, link))
                except:
                    continue
        
        # Buscar coincidencias con términos "about" en cualquier idioma
        for term_list in ABOUT_TERMS.values():
            for term in term_list:
                for link in absolute_links:
                    if term in link.lower():
                        return link
        
        # Si no se encuentra por URL, buscar en el texto del enlace
        for a in soup.find_all('a'):
            text = a.get_text().lower()
            if any(term in text for terms in ABOUT_TERMS.values() for term in terms):
                href = a.get('href')
                if href:
                    return urljoin(base_url, href)
        
        return None
    
    except Exception as e:
        print(f"Error procesando {base_url}: {str(e)}")
        return None

# Aplicar a tu dataframe
df['platform_about_url'] = df['platform_url'].apply(find_about_url)

Error procesando http://www.natural-apptitude.co.uk/project/sealife-tracker/: 404 Client Error: Not Found for url: https://www.natural-apptitude.co.uk/project/sealife-tracker
Error procesando https://uwiseismic.com/myhaz-vct/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error procesando https://bigbutterflycount.butterfly-conservation.org/: 403 Client Error: Forbidden for url: https://bigbutterflycount.butterfly-conservation.org/


In [21]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def find_terms_url(base_url, list_terms):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Buscar en todos los enlaces
        all_links = [a.get('href') for a in soup.find_all('a', href=True)]
        
        # Convertir a URLs absolutas y filtrar None
        absolute_links = []
        for link in all_links:
            if link:
                try:
                    absolute_links.append(urljoin(base_url, link))
                except:
                    continue
        
        # Buscar coincidencias con términos "about" en los enlaces
        for term in list_terms:
            for link in absolute_links:
                if term in link.lower():
                    return link
        
        # Si no se encuentra por URL, buscar en el texto del enlace
        for a in soup.find_all('a'):
            text = a.get_text().lower()
            if any(term in text for term in list_terms):
                href = a.get('href')
                if href:
                    return urljoin(base_url, href)
        
        return None

    except Exception as e:
        print(f"Error procesando {base_url}: {str(e)}")
        return None

In [22]:
about_terms = [
    'about', 'about-us', 'aboutus', 'who-we-are', 'mission', 
    'sobre', 'sobre-nosotros', 'quienes-somos', 'nosotros', 
    'a-propos', 'qui-sommes-nous', 'apropos',
    'uber-uns', 'ueber-uns', 'uberuns',
    'chi-siamo', 'su-di-noi',
    'sobre', 'sobre-nos', 'quem-somos'
    ]

df['platform_about_url'] = df['platform_url'].apply(lambda x: find_terms_url(x, about_terms))

Error procesando http://www.natural-apptitude.co.uk/project/sealife-tracker/: 404 Client Error: Not Found for url: https://www.natural-apptitude.co.uk/project/sealife-tracker
Error procesando https://uwiseismic.com/myhaz-vct/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error procesando https://bigbutterflycount.butterfly-conservation.org/: 403 Client Error: Forbidden for url: https://bigbutterflycount.butterfly-conservation.org/


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  34 non-null     object 
dtypes: float64(1), object(8)
memory usage: 3.3+ KB


In [None]:
df[df.platform_about_url.notnull()]

Unnamed: 0,name,developer,description,url,language,platform_url,active,year_creation,platform_about_url
0,Anecdata.org,MDI Biological Laboratory,Anecdata.org is a free online platform anyone ...,https://apps.apple.com/us/app/anecdata-org/id1...,['EN'],https://www.anecdata.org/,Status code: 200,2013.0,
9,The Happiness Project,UCL,Play fun mini-games to help brain scientists s...,https://apps.apple.com/us/app/the-happiness-pr...,['EN'],https://thehappinessproject.app,Status code: 200,,
16,ISeeChange Tracker,ISeeChange,Our climate is changing--and so are we. With I...,https://apps.apple.com/us/app/iseechange-track...,['EN'],https://www.iseechange.org/,Status code: 200,2013.0,
19,CSMON-LIFE segnalazioni,Divulgando Srl,CSMON-LIFE (Citizen Science MONitoring) is one...,https://apps.apple.com/us/app/csmon-life-segna...,"['EN', 'IT']",http://www.csmon-life.eu,Status code: 200,,
21,Sealife Tracker,Natural Apptitude,The Sealife Tracker project aims to collect mu...,https://apps.apple.com/us/app/sealife-tracker/...,['EN'],http://www.natural-apptitude.co.uk/project/sea...,Status code: 404,2013.0,
23,Dark Sky Meter,DDQ,The Dark Sky Meter app measures the sky bright...,https://apps.apple.com/us/app/dark-sky-meter/i...,"['NL', 'EN']",https://www.darkskymeter.com/,Status code: 200,2013.0,
25,myHAZ-VCT,British Geological Survey,myHAZ-VCT is a citizen science app for sharing...,https://apps.apple.com/us/app/myhaz-vct/id1507...,['EN'],https://uwiseismic.com/myhaz-vct/,Status code: 200,2001.0,
28,Dawn Chorus,Natural Apptitude,Dawn Chorus is a project that combines conserv...,https://apps.apple.com/us/app/dawn-chorus/id15...,"['EN', 'DE']",https://dawn-chorus.org/,Status code: 200,2020.0,
33,Big Butterfly Count,Butterfly Conservation Trading Ltd,Take part in the UK’s annual Big Butterfly Cou...,https://apps.apple.com/us/app/big-butterfly-co...,['EN'],https://bigbutterflycount.butterfly-conservati...,Status code: 403,1998.0,
38,Citizen Scientist,NOMAD.ORG.UK LTD,Citizen Scientist is an app that measures your...,https://apps.apple.com/us/app/citizen-scientis...,['EN'],https://www.natureboostproject.com,Status code: 200,2024.0,


In [25]:
# Fix manually about_links missing

missing_platform_about = {
    "https://www.anecdata.org/": "https://www.anecdata.org/about",
    "http://www.csmon-life.eu": "https://www.csmon-life.eu/pagina/progetto/18/About%20us",
    "https://dawn-chorus.org/": "https://dawn-chorus.org/about-dawn-chorus/",
    "https://bigbutterflycount.butterfly-conservation.org/": "https://bigbutterflycount.butterfly-conservation.org/about",
    "https://instantwild.zsl.org/": "https://instantwild.zsl.org/about",
}

# Keep original value if not in platforms_mapping dictionary
df['platform_about_url'] = df.apply(
    lambda row: missing_platform_about.get(row['platform_url'], row['platform_about_url']),
    axis=1
)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
dtypes: float64(1), object(8)
memory usage: 3.3+ KB


In [27]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

In [28]:
df = pd.read_csv("../data/ios_apps_platforms_test.csv")

## extract text from url
Not include this section, just links for first template.

In [None]:
# Extract text from about section
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_text_from_url(url):
    try:
        if not url or pd.isna(url):  # Si la URL es None o NaN
            return None
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Lanza error si la solicitud falla
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Eliminar scripts, estilos y otros elementos no deseados
        for script in soup(["script", "style", "nav", "footer", "head", "meta", "link"]):
            script.decompose()
        
        # Obtener texto y limpiar espacios en blanco
        text = soup.get_text(separator=' ', strip=True)
        
        # Eliminar líneas vacías y múltiples espacios
        text = ' '.join(text.split())
        
        return text if text else None
    
    except Exception as e:
        print(f"Error extrayendo texto de {url}: {str(e)}")
        return None

In [None]:
extract_text_from_url("https://joinzoe.com/about-zoe")

"Science for every body We run the world’s largest in-depth nutrition study, and we’ve turned our research into a personalized program that gives you insights into how your body responds to food. Discover our pioneering studies PREDICT encompasses a collection of rigorously designed clinical trials that have helped us to understand and predict personalized metabolic responses to foods so we can all move beyond a “one-size-fits-all” approach to nutrition. Through this research, we’ve developed an at-home test kit that can give everyone a better understanding of their unique metabolism and gut microbiome. View our studies Latest published research from the ZOE teams May 2024 Effects of a personalized nutrition program on cardiometabolic health: a randomized controlled trial METHOD (Measuring Efficacy THrough Outcomes of Diet) is a randomised controlled trial assessing the effectiveness of following ZOE’s personalised nutrition program, compared to standard care dietary advice, on cardiom

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def extract_main_content(url):
    """
    Extracts the main content of an About page

    Args:
        url: URL of the About page

    Returns:
        str: Extracted main text or error message
    """
    # Configure session with retries
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    
    # Headers para simular un navegador real
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })
    
    try:
        # Check if the URL is valid
        if pd.isna(url) or url == '' or not str(url).startswith(('http://', 'https://')):
            return None
        
        # Make request with timeout
        response = session.get(url, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Delete elements we don't want
        remove_unwanted_elements(soup)
        
        # Search main content
        main_content = find_main_content(soup)
        
        if main_content:
            # Clean and format text
            text = clean_text(main_content.get_text(separator=' ', strip=True))
            return text
        else:
            return "The main content could not be extracted"
            
    except requests.exceptions.RequestException as e:
        return f"Error accessing the URL: {str(e)}"
    except Exception as e:
        return f"Error processing page: {str(e)}"

def remove_unwanted_elements(soup):
    """
    Remove unwanted elements such as footers, navigation, etc.
    """
    from bs4 import Comment, NavigableString
    
    # Selectors of elements to delete
    unwanted_selectors = [
        'footer', 'nav', 'header', '.footer', '.navigation', 
        '.nav', '.menu', '.sidebar', '.ads', '.advertisement',
        '.cookie', '.popup', '.modal', '.social-media',
        '.share', '.comments', '.related', '.recommended',
        'script', 'style', 'noscript', 'iframe'
    ]
    
    for selector in unwanted_selectors:
        for element in soup.select(selector):
            element.decompose()
    
    # Delete HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    
    # Remove common text in footers
    footer_texts = ['copyright', '©', 'all rights reserved', 'privacy policy', 'terms of service']
    for element in soup.find_all(string=True):
        # Solo procesar NavigableString que no sean comentarios
        if (isinstance(element, NavigableString) and 
            not isinstance(element, Comment) and
            hasattr(element, 'parent') and 
            element.parent and 
            hasattr(element.parent, 'name') and
            element.parent.name):
            
            try:
                if any(footer_text in element.lower() for footer_text in footer_texts):
                    # Solo eliminar si es un elemento pequeño (likely footer)
                    if len(element.strip()) < 200:
                        parent = element.parent
                        if parent and parent.name in ['div', 'span', 'p', 'footer']:
                            parent.decompose()
            except (AttributeError, TypeError):
                # Si hay algún error, simplemente continuar
                continue

def find_main_content(soup):
    """
    Find the main content using different strategies
    """
    # Find elements with common main content selectors
    main_selectors = [
        'main', 'article', '.main-content', '.content', '.main',
        '.about-content', '.page-content', '.post-content',
        '[role="main"]', '.container .row', '.wrapper'
    ]
    
    for selector in main_selectors:
        main_element = soup.select_one(selector)
        if main_element and has_substantial_text(main_element):
            return main_element
    
    # Find the div with the most text
    divs = soup.find_all('div')
    best_div = None
    max_text_length = 0
    
    for div in divs:
        text_length = len(div.get_text().strip())
        if text_length > max_text_length and text_length > 200:  # Mínimo 200 caracteres
            max_text_length = text_length
            best_div = div
    
    if best_div:
        return best_div
    
    # Use full body as a last resort
    return soup.find('body')

def has_substantial_text(element):
    """
    Checks if an element has enough text
    """
    text = element.get_text().strip()
    return len(text) > 100  # Mínimo 100 caracteres

def clean_text(text):
    """
    Clean and format the extracted text
    """
    # Normalize whitespace but preserve structure
    text = re.sub(r'\s+', ' ', text)
    
    # Remove multiple empty lines
    text = re.sub(r'\n\s*\n', '\n\n', text)
    
    # Clean special characters but keep spaces
    # Only remove problematic characters
    text = re.sub(r'[^\w\s\.\,\;\:\!\?\-\(\)\[\]\{\}\"\'\/\@\#\$\%\&\*\+\=\<\>\n]', '', text)
    
    # Ensure spaces after semicolons
    text = re.sub(r'([.!?])([A-Z])', r'\1 \2', text)
    
    return text.strip()

extract_main_content("https://joinzoe.com/about-zoe")

"Science for every body We run the worlds largest in-depth nutrition study, and weve turned our research into a personalized program that gives you insights into how your body responds to food. Discover our pioneering studies PREDICT encompasses a collection of rigorously designed clinical trials that have helped us to understand and predict personalized metabolic responses to foods so we can all move beyond a one-size-fits-all approach to nutrition. Through this research, weve developed an at-home test kit that can give everyone a better understanding of their unique metabolism and gut microbiome. View our studies Latest published research from the ZOE teams May 2024 Effects of a personalized nutrition program on cardiometabolic health: a randomized controlled trial METHOD (Measuring Efficacy THrough Outcomes of Diet) is a randomised controlled trial assessing the effectiveness of following ZOEs personalised nutrition program, compared to standard care dietary advice, on cardiometabol

In [None]:
df['platform_about'] = df['platform_about_url'].apply(extract_main_content)

In [None]:
df.platform_about

0                                                      
1     Science for every body We run the worlds large...
2     Lets protect and enjoy our natural world toget...
3     Observations (Total  24h): -  - Species (24h):...
4     GLOBE Breadcrumb About About GLOBE Observer GL...
5     A platform to disrupt and democratize health r...
6     About Spotfire Spotfire pioneers a visual-firs...
7     Why do most mental health problems arise durin...
8     DE EN You are here: SPOTTERON Citizen Science ...
9                                                  None
10    About Faunawatch has been committed to animal ...
11    Wildlife Acoustics creates the worlds leading ...
12    About the Zooniverse Section About the Zoonive...
13    Sections About Us USA National Phenology Netwo...
14    DE EN You are here: SPOTTERON Citizen Science ...
15    About the YouCount project YouCount is a proje...
16                                                 None
17    About HerpMapper HerpMapper is a 501(c)(3)

In [None]:
# Replace empty string by None
import numpy as np
df.loc[df.platform_about == "", "platform_about"] = None

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
 9   platform_about      37 non-null     object 
dtypes: float64(1), object(9)
memory usage: 3.6+ KB


## terms_use_link

In [29]:
# Keywords to identify "Terms of Use"
terms_keyword = [
    # Inglés
    'terms', 'terms-of-use', 'terms-of-service',
    # Español
    'términos', 'condiciones', 'aviso-legal',
    # Francés
    'conditions', 'mentions-légales', 'cgu',
    # Neerlandés
    'voorwaarden', 'gebruiksvoorwaarden',
    # Alemán
    'nutzungsbedingungen', 'agb',
    # Sueco
    'användarvillkor', 'villkor',
    # Danés
    'vilkår', 'betingelser',
    # Italiano
    'termini', 'condizioni',
    # Portugués
    'termos', 'condições',
]

In [30]:
def find_terms_url(base_url, list_terms):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Buscar en todos los enlaces
        all_links = [a.get('href') for a in soup.find_all('a', href=True)]
        
        # Convertir a URLs absolutas y filtrar None
        absolute_links = []
        for link in all_links:
            if link:
                try:
                    absolute_links.append(urljoin(base_url, link))
                except:
                    continue
        
        # Buscar coincidencias con términos "about" en los enlaces
        for term in list_terms:
            for link in absolute_links:
                if term in link.lower():
                    return link
        
        # Si no se encuentra por URL, buscar en el texto del enlace
        for a in soup.find_all('a'):
            text = a.get_text().lower()
            if any(term in text for term in list_terms):
                href = a.get('href')
                if href:
                    return urljoin(base_url, href)
        
        return None

    except Exception as e:
        print(f"Error procesando {base_url}: {str(e)}")
        return None


In [31]:
df['terms_use_link'] = df['platform_url'].apply(lambda x: find_terms_url(x, terms_keyword))

Error procesando http://www.natural-apptitude.co.uk/project/sealife-tracker/: 404 Client Error: Not Found for url: https://www.natural-apptitude.co.uk/project/sealife-tracker
Error procesando https://uwiseismic.com/myhaz-vct/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error procesando https://bigbutterflycount.butterfly-conservation.org/: 403 Client Error: Forbidden for url: https://bigbutterflycount.butterfly-conservation.org/


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
 9   terms_use_link      17 non-null     object 
dtypes: float64(1), object(9)
memory usage: 3.6+ KB


* Run just one time (Google limit):

In [33]:
from googlesearch import search
import time

def search_google_terms(co_url, list_keywords):
    for term in list_keywords:
        query = f"site:{co_url} (inurl:{term} OR intitle:{term})"
        try:
            # Busca en sitios específicos de ciencia ciudadana
            results = list(search(query, num_results=1, lang="en"))
            time.sleep(2)
            return results[0] if results else None
        except Exception as e:
            print(f"Error buscando {co_url}: {e}")
            time.sleep(2)
            return None
        
df["terms_use_google"] = df['platform_url'].apply(lambda x: search_google_terms(x, terms_keyword))

Error buscando https://beemachine.ai: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3Dsite%253Ahttps%253A%252F%252Fbeemachine.ai%2B%2528inurl%253Aterms%2BOR%2Bintitle%253Aterms%2529%26num%3D3%26hl%3Den%26start%3D0%26safe%3Dactive&hl=en&q=EgRTIoHjGIH93cMGIjDWJEFuRLReuFzqxoMExB3kDD6QGWweuoUQ0vo3ffHZzGgqI3r6fDOouE870ESKSucyAnJSWgFD
Error buscando http://www.birdtrack.net: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3Dsite%253Ahttp%253A%252F%252Fwww.birdtrack.net%2B%2528inurl%253Aterms%2BOR%2Bintitle%253Aterms%2529%26num%3D3%26hl%3Den%26start%3D0%26safe%3Dactive&hl=en&q=EgRTIoHjGIT93cMGIjBA2kOSOJsBRoNGf7TCTpSfcrYxp86UWBSQixIXGjXT2NfwXi2VPNpfNom6c9iELj8yAnJSWgFD
Error buscando https://www.openradiation.org/en: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3Dsite

In [None]:
# Replace empty string by None
df.loc[df.terms_use_google == "", "terms_use_google"] = None

In [34]:
df[["name", "terms_use_link"]]

Unnamed: 0,name,terms_use_link
0,Anecdata.org,
1,ZOE Health Study,
2,Leave No Trace Citizen Science,https://lnt.org/terms-conditions
3,BirdNET,
4,GLOBE Observer,
5,Quantified Citizen Pro,https://www.quantifiedcitizen.com/terms-of-use...
6,spotFIRE | Citizen Science,
7,Brain Explorer,
8,WaterLinx | Citizen Science,https://www.spotteron.net/terms-of-use
9,The Happiness Project,


In [35]:
# Fix manually
df.loc[df.name == "CoastSnap | SPOTTERON", "terms_use_link"] = "https://www.spotteron.net/terms-of-use"

In [37]:
df.drop(columns=['terms_use_google'], inplace=True)

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
 9   terms_use_link      17 non-null     object 
dtypes: float64(1), object(9)
memory usage: 3.6+ KB


In [None]:
df[["platform_url", 'terms_use_link']].head(10)

Unnamed: 0,platform_url,terms_use_link
0,https://www.anecdata.org/,
1,https://health-study.zoe.com/,
2,https://lnt.org/our-work/citizen-science/,https://lnt.org/terms-conditions
3,https://birdnet.cornell.edu,
4,https://observer.globe.gov/,
5,https://www.quantifiedcitizen.com/,https://www.quantifiedcitizen.com/terms-of-use...
6,https://www.spotfire.com/,
7,https://www.brainexplorer.net,
8,https://www.spotteron.net/apps/regional-commun...,https://www.spotteron.net/terms-of-use
9,https://thehappinessproject.app,


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
 9   terms_use_link      17 non-null     object 
dtypes: float64(1), object(9)
memory usage: 3.6+ KB


In [41]:
# Fix manually
df.loc[df.name == "The Happiness Project", "terms_use_link"] = "https://thehappinessproject.app/terms-and-conditions"

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
 9   terms_use_link      18 non-null     object 
dtypes: float64(1), object(9)
memory usage: 3.6+ KB


In [43]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## privacy_policy

In [44]:
privacy_keywords = [
    "privacy",
    "privacidad",
    "datenschutz",
    "confidentialité",
    "politica",
    "política",
    "privacidade",
    "personuppgifter",
    "privacyverklaring",
    "integritetspolicy",
]

df['privacy_link'] = df['platform_url'].apply(lambda x: find_terms_url(x, privacy_keywords))

Error procesando http://www.natural-apptitude.co.uk/project/sealife-tracker/: 404 Client Error: Not Found for url: https://www.natural-apptitude.co.uk/project/sealife-tracker
Error procesando https://uwiseismic.com/myhaz-vct/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error procesando https://bigbutterflycount.butterfly-conservation.org/: 403 Client Error: Forbidden for url: https://bigbutterflycount.butterfly-conservation.org/


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
 9   terms_use_link      18 non-null     object 
 10  privacy_link        30 non-null     object 
dtypes: float64(1), object(10)
memory usage: 4.0+ KB


In [46]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## code_repository

In [47]:
repository_keywords = [
    "github",
    "gitlab",
    "bitbucket",
    "sourceforge",
    "codeberg",
    "gitea",
]

df['code_repository'] = df['platform_url'].apply(lambda x: find_terms_url(x, repository_keywords))

Error procesando http://www.natural-apptitude.co.uk/project/sealife-tracker/: 404 Client Error: Not Found for url: https://www.natural-apptitude.co.uk/project/sealife-tracker
Error procesando https://uwiseismic.com/myhaz-vct/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error procesando https://bigbutterflycount.butterfly-conservation.org/: 403 Client Error: Forbidden for url: https://bigbutterflycount.butterfly-conservation.org/


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                45 non-null     object 
 1   developer           45 non-null     object 
 2   description         45 non-null     object 
 3   url                 45 non-null     object 
 4   language            45 non-null     object 
 5   platform_url        45 non-null     object 
 6   active              45 non-null     object 
 7   year_creation       41 non-null     float64
 8   platform_about_url  39 non-null     object 
 9   terms_use_link      18 non-null     object 
 10  privacy_link        30 non-null     object 
 11  code_repository     1 non-null      object 
dtypes: float64(1), object(11)
memory usage: 4.3+ KB


Only one website has a reference to the GitHub repository on its home page.

In [49]:
df[df['code_repository'].notnull()]

Unnamed: 0,name,developer,description,url,language,platform_url,active,year_creation,platform_about_url,terms_use_link,privacy_link,code_repository
3,BirdNET,Stefan Kahl,How can computers learn to recognize birds fro...,https://apps.apple.com/us/app/birdnet/id154184...,"['CS', 'NL', 'EN', 'FR', 'DE', 'IT', 'LT', 'PL...",https://birdnet.cornell.edu,Status code: 200,1985.0,https://birdnet.cornell.edu/map,,,https://github.com/kahst/BirdNET-Analyzer


- Second strategy: using github API.

In [50]:
# Find repository and organization
from dotenv import load_dotenv
import os

# Cargar las variables desde el archivo .env
load_dotenv()

github_token = os.getenv("GITHUB_TOKEN")

In [51]:
def search_github_repo(platform_name, max_resultados=3):
    url = "https://api.github.com/search/repositories"
    params = {
        "q": f"{platform_name} in:name",
        "sort": "stars",
        "order": "desc",
        "per_page": max_resultados
    }

    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"token {github_token}"
    }

    response = requests.get(url, params=params, headers=headers)
    
    if response.status_code == 200:
        results = response.json()["items"]
        repos_info = []
        for repo in results:
            full_name = repo['full_name']  # ej. birdnet-team/BirdNET-Analyzer
            owner = repo['owner']['login']  # ej. birdnet-team
            repos_info.append({
                "first_repo": f"https://github.com/{full_name}",
                "organization": owner,
                "url_organization": f"https://github.com/{owner}"
            })
        time.sleep(5)
        if len(repos_info) > 0:
            return repos_info[0]['first_repo']
        else:
            return None
    else:
        print(f"Error in request: {response.status_code}")
        time.sleep(5)
        return None

In [52]:
# Example of use:

plataform = "BirdNET"
repo = search_github_repo(plataform)
print("Found repositories:")
print(repo)

Found repositories:
https://github.com/birdnet-team/BirdNET-Analyzer


In [53]:
df["code_repository_github_api"] = df['name'].apply(search_github_repo)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  18 non-null     object 
dtypes: float64(1), object(12)
memory usag

In [55]:
plataforma = "GLOBE Observer"
url_org = search_github_repo(plataforma)
if url_org:
    print("URL first repository:", url_org)
else:
    print("Nothing found.")

URL first repository: https://github.com/IGES-Geospatial/globe-observer-utils


In [56]:
# Remove manually wrong repositories

remove_repo_links = [
    "https://github.com/hdsingh/Brain-Explorer",
    "https://github.com/gargargargar/the-happiness-project",
    "https://github.com/yuzhengyang/WhaleReport",
    "https://github.com/SeaLife/csgo-item-price-tracker",
    "https://github.com/zudzug/RainMeter-DarkSky-Framework",
    "https://github.com/simonkuo/beemachine",
    "https://github.com/mak-kirkland/birdtracks",
    "https://github.com/juanjoSanz/hass_wibeee",
    "https://github.com/bmidgley/citizen-scientist",
    "https://github.com/avdata99/medusapp",
]


for remove_link in remove_repo_links:
    df.loc[df.code_repository_github_api == remove_link, "code_repository_github_api"] = None

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
dtypes: float64(1), object(12)
memory usag

In [58]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## language code

In [59]:
# language_code
df = pd.read_csv("../data/ios_apps_platforms_test.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
dtypes: float64(1), object(12)
memory usag

In [60]:
import requests

def analyze_website_tech(url, timeout=10):
    """
    Analyzes a URL to detect web technologies used    
    """
    result = {
        'url': url,
        'technologies': [],
        'language': None,
        'framework': None,
        'server': None,
        'headers': {},
        'status_code': None,
        'error': None
    }
    
    try:
        # Make HTTP request
        response = requests.get(url, timeout=timeout, allow_redirects=True)
        result['status_code'] = response.status_code
        
        # Analyze headers
        headers = response.headers
        result['headers'] = dict(headers)
        
        # Get HTML content
        html_content = response.text.lower()
        
        # Detect technologies by headers
        tech_info = detect_tech_from_headers(headers)
        result.update(tech_info)
        
        # Detect technologies by HTML content
        html_tech = detect_tech_from_html(html_content)
        result['technologies'].extend(html_tech)
        
        # Detect by URL patterns
        url_tech = detect_tech_from_url(url)
        result['technologies'].extend(url_tech)
        
        # Determine primary language
        result['language'] = determine_primary_language(result)
        
        # Drop duplicates
        result['technologies'] = list(set(result['technologies']))
        
    except requests.exceptions.RequestException as e:
        result['error'] = str(e)
    except Exception as e:
        result['error'] = f"Unexpected error: {str(e)}"
    
    return result

def detect_tech_from_headers(headers):
    """Detects technologies based on HTTP headers"""
    tech_info = {
        'technologies': [],
        'language': None,
        'framework': None,
        'server': None
    }
    
    # Convert headers to lowercase for search
    headers_lower = {k.lower(): v for k, v in headers.items()}
    
    # Detect web server
    if 'server' in headers_lower:
        server = headers_lower['server']
        tech_info['server'] = server
        
        if 'apache' in server.lower():
            tech_info['technologies'].append('Apache')
        if 'nginx' in server.lower():
            tech_info['technologies'].append('Nginx')
        if 'iis' in server.lower():
            tech_info['technologies'].append('IIS')
    
    # Detect language by specific headers
    if 'x-powered-by' in headers_lower:
        powered_by = headers_lower['x-powered-by'].lower()
        
        if 'php' in powered_by:
            tech_info['language'] = 'PHP'
            tech_info['technologies'].append('PHP')
        elif 'asp.net' in powered_by:
            tech_info['language'] = 'ASP.NET'
            tech_info['technologies'].append('ASP.NET')
        elif 'express' in powered_by:
            tech_info['language'] = 'Node.js'
            tech_info['technologies'].append('Node.js')
            tech_info['technologies'].append('Express')
    
    # Detect Ruby on Rails
    if 'x-runtime' in headers_lower:
        tech_info['language'] = 'Ruby'
        tech_info['framework'] = 'Ruby on Rails'
        tech_info['technologies'].append('Ruby on Rails')
    
    # Detect by cookies
    if 'set-cookie' in headers_lower:
        cookies = headers_lower['set-cookie'].lower()
        
        if 'phpsessid' in cookies:
            tech_info['language'] = 'PHP'
            tech_info['technologies'].append('PHP')
        elif 'jsessionid' in cookies:
            tech_info['language'] = 'Java'
            tech_info['technologies'].append('Java')
        elif '_session_id' in cookies and 'rails' in cookies:
            tech_info['language'] = 'Ruby'
            tech_info['technologies'].append('Ruby on Rails')
    
    # Detect CloudFlare
    if 'cf-ray' in headers_lower:
        tech_info['technologies'].append('CloudFlare')
    
    return tech_info

def detect_tech_from_html(html_content):
    """Detects technologies by analyzing HTML content"""
    technologies = []
    
    # WordPress
    if any(indicator in html_content for indicator in [
        'wp-content', 'wp-includes', 'wordpress', '/wp-json/'
    ]):
        technologies.append('WordPress')
    
    # Drupal
    if any(indicator in html_content for indicator in [
        'drupal', 'sites/default/files', 'misc/drupal.js'
    ]):
        technologies.append('Drupal')
    
    # Joomla
    if any(indicator in html_content for indicator in [
        'joomla', 'option=com_', 'joomla.org'
    ]):
        technologies.append('Joomla')
    
    # React
    if any(indicator in html_content for indicator in [
        'react', 'data-reactroot', '__react', 'react-dom'
    ]):
        technologies.append('React')
    
    # Vue.js
    if any(indicator in html_content for indicator in [
        'vue.js', 'vue.min.js', 'v-if=', 'v-for='
    ]):
        technologies.append('Vue.js')
    
    # Angular
    if any(indicator in html_content for indicator in [
        'angular', 'ng-app', 'ng-controller', 'angular.min.js'
    ]):
        technologies.append('Angular')
    
    # jQuery
    if any(indicator in html_content for indicator in [
        'jquery', 'jquery.min.js', '$.fn.jquery'
    ]):
        technologies.append('jQuery')
    
    # Bootstrap
    if any(indicator in html_content for indicator in [
        'bootstrap', 'bootstrap.min.css', 'bootstrap.css'
    ]):
        technologies.append('Bootstrap')
    
    # Django (Python)
    if any(indicator in html_content for indicator in [
        'csrfmiddlewaretoken', 'django', '__admin_media_prefix__'
    ]):
        technologies.append('Django')
    
    # Laravel (PHP)
    if any(indicator in html_content for indicator in [
        'laravel_session', 'laravel', 'csrf-token'
    ]):
        technologies.append('Laravel')
    
    # Ruby on Rails
    if any(indicator in html_content for indicator in [
        'csrf-param', 'csrf-token', 'rails', 'data-method='
    ]):
        technologies.append('Ruby on Rails')
    
    # Google Analytics
    if any(indicator in html_content for indicator in [
        'google-analytics', 'gtag(', 'ga('
    ]):
        technologies.append('Google Analytics')
    
    return technologies

def detect_tech_from_url(url):
    """Detects technologies based on URL patterns"""
    technologies = []
    
    # Analyze file extensions
    if '.php' in url:
        technologies.append('PHP')
    elif '.asp' in url or '.aspx' in url:
        technologies.append('ASP.NET')
    elif '.jsp' in url:
        technologies.append('Java')
    elif '.py' in url:
        technologies.append('Python')
    elif '.rb' in url:
        technologies.append('Ruby')
    elif '.cfm' in url:
        technologies.append('ColdFusion')
    
    # WordPress specific patterns
    if any(pattern in url for pattern in ['/wp-content/', '/wp-admin/', '/wp-includes/']):
        technologies.append('WordPress')
    
    # Drupal patterns
    if any(pattern in url for pattern in ['/node/', '/admin/config/']):
        technologies.append('Drupal')
    
    return technologies

def determine_primary_language(result):
    """Determine the primary language based on all the information"""
    technologies = result['technologies']
    
    # Prioridad por frameworks específicos
    if 'Ruby on Rails' in technologies:
        return 'Ruby'
    elif 'Django' in technologies:
        return 'Python'
    elif 'Laravel' in technologies:
        return 'PHP'
    elif 'ASP.NET' in technologies:
        return 'ASP.NET'
    elif 'PHP' in technologies or 'WordPress' in technologies:
        return 'PHP'
    elif 'Java' in technologies:
        return 'Java'
    elif 'Node.js' in technologies:
        return 'JavaScript (Node.js)'
    elif any(js_tech in technologies for js_tech in ['React', 'Vue.js', 'Angular']):
        return 'JavaScript'
    elif result['language'] is not None:
        return result['language']
    else:
        return None



In [64]:
# Función que retorna solo el lenguaje
def get_language(url):
    try:
        result = analyze_website_tech(url)
        return result['language']
    except:
        return None
    
df['code_language'] = df['platform_url'].apply(get_language)

In [65]:
df.code_language.value_counts()

code_language
PHP           20
JavaScript     4
Ruby           2
Java           1
Name: count, dtype: int64

**NOTE:**
* **PHP**: frameworks like Laravel, Symfony, CodeIgniter... and CMS like WordPress, Drupal, Joomla...
* **JavaScript**: frontends like React, Vue.js, Angular, jQuery... and backends like Node.js or Express.js.

In [66]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## governance_explicit

In [67]:
gov_terms = [
    "governance",
    "data governance", 
    "gobierno de datos", 
    "gobernanza de datos", 
    "gestión de datos",
    "data management", 
    "gestión de datos", 
    "data stewardship", 
    "calidad de datos",
    "data quality", 
    "cumplimiento de datos",
    "data compliance", 
    "política de datos", 
    "data policy", 
    "estrategia de datos",
    "data strategy", 
    "protección de datos", 
    "data protection", 
    "gobernanza de la información",
    "information governance", 
    "gouvernance des données", 
    "governance dei dati", 
    "Daten-Governance", 
    "governança de dados"
]

df['governance_url'] = df['platform_url'].apply(lambda x: find_terms_url(x, gov_terms))

Error procesando http://www.natural-apptitude.co.uk/project/sealife-tracker/: 404 Client Error: Not Found for url: https://www.natural-apptitude.co.uk/project/sealife-tracker
Error procesando https://uwiseismic.com/myhaz-vct/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error procesando https://bigbutterflycount.butterfly-conservation.org/: 403 Client Error: Forbidden for url: https://bigbutterflycount.butterfly-conservation.org/


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [69]:
df['governance_url'].value_counts()

governance_url
https://www.spotteron.net/citizen-science-app-features/data-handling    3
Name: count, dtype: int64

In [70]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## API

- Search in common paths to API:

In [71]:
import pandas as pd
import requests

# Try common paths to API

def check_api(url):
    api_endpoints = ['/api', '/api/v1', '/swagger', '/docs']
    for endpoint in api_endpoints:
        try:
            resp = requests.get(url.rstrip('/') + endpoint, timeout=3)
            if resp.status_code == 200:
                return url.rstrip('/') + endpoint
        except requests.RequestException:
            continue
    return None

df['api_link'] = df['platform_url'].apply(check_api)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [73]:
df.api_link.value_counts()

api_link
https://birdnet.cornell.edu/api           1
https://www.spotfire.com/docs             1
https://www.echometertouch.com/api        1
https://www.iseechange.org/swagger        1
https://www.coronareport.global/api/v1    1
https://www.birdweather.com/api           1
http://www.birdtrack.net/api              1
https://caterpillarscount.unc.edu/api     1
https://instantwild.zsl.org/api           1
Name: count, dtype: int64

- Find term API or swagger in home:

In [74]:
# Find API mention in home code

api_terms = ["api", "swagger"]

df['api_link'] = df['platform_url'].apply(lambda x: find_terms_url(x, api_terms))

Error procesando https://thehappinessproject.app: HTTPSConnectionPool(host='thehappinessproject.app', port=443): Read timed out. (read timeout=10)
Error procesando http://www.natural-apptitude.co.uk/project/sealife-tracker/: 404 Client Error: Not Found for url: https://www.natural-apptitude.co.uk/project/sealife-tracker
Error procesando https://uwiseismic.com/myhaz-vct/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Error procesando https://bigbutterflycount.butterfly-conservation.org/: 403 Client Error: Forbidden for url: https://bigbutterflycount.butterfly-conservation.org/


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [76]:
df.api_link.value_counts()

api_link
https://birdnet.cornell.edu/api/                                                                       1
https://app.birdweather.com/api/index.html                                                             1
http://hokcab01.almacen.api.auditmedia.es//api/Recortes/9060d58a-be68-4976-8423-13e92901e243/tv.mp4    1
Name: count, dtype: int64

- Search platform name + API in duckduckgo:

In [77]:
import time
from urllib.parse import parse_qs, unquote, urlparse
import requests
from bs4 import BeautifulSoup

def duckduckgo_search(query):
    search = f"{query} api"
    headers = {"User-Agent": "Mozilla/5.0"}
    search_url = f"https://lite.duckduckgo.com/lite/?q={search.replace(' ', '+')}"

    response = requests.get(search_url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    # Buscar el primer enlace resultante
    first_link = soup.find("a", class_="result-link")
    if first_link and "href" in first_link.attrs:
        raw_href = first_link["href"]
        parsed = urlparse(raw_href)
        query_params = parse_qs(parsed.query)
        if "uddg" in query_params:
            time.sleep(5)
            return unquote(query_params["uddg"][0])  # Enlace decodificado
        else:
            time.sleep(5)
            return raw_href  # Por si acaso ya es directo
    time.sleep(5)

In [79]:
df['api_link_ddg'] = df['name'].apply(duckduckgo_search)

In [80]:
# Remove links to home
df.loc[
    df["platform_url"].str.rstrip("/") == df["api_link_ddg"].str.rstrip("/"), 
    "api_link_ddg"
    ] = None

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [None]:
df[["platform_url","api_link_ddg"]].head(10)

Unnamed: 0,platform_url,api_link_ddg
0,https://www.anecdata.org/,https://www.anecdata.org/pages/api
1,https://health-study.zoe.com/,https://api.covidradar.org/launch/
2,https://lnt.org/our-work/citizen-science/,https://lnt.org/our-work/citizen-science/citiz...
3,https://birdnet.cornell.edu,
4,https://observer.globe.gov/,https://www.globe.gov/globe-data/globe-api
5,https://www.quantifiedcitizen.com/,https://api.quantifiedcitizen.com/
6,https://www.spotfire.com/,
7,https://www.brainexplorer.net,https://mouse.brain-map.org/static/api
8,https://www.spotteron.net/apps/regional-commun...,
9,https://thehappinessproject.app,https://github.com/jfaccioli/happiness-project


In [84]:
# Manual revision
drop_api_link_result = [
    "https://api.covidradar.org/launch/",
    "https://lnt.org/our-work/citizen-science/citizen-science-toolkit/",
    "https://github.com/BirdNET-Team/BirdNET-Analyzer",
    "https://api.quantifiedcitizen.com/", # status code 404
    "https://mouse.brain-map.org/static/api", # Not the same platform
    "https://the-happiness-project.com/", # Same as home
    "http://www.csmon-life.eu/", # Same as home
    "https://www.iseechange.com/", # Same as home
    "https://www.wildlifeacoustics.com/uploads/user-guides/EMT-ANDROID-GUIDE-EN-12112023.pdf", # user guide, not api
    "https://www.usanpn.org/node/223", # Same as home
    "https://www.iseechange.com/",  # Same as home
    "http://www.csmon-life.eu/", # Same as home
    "https://disease.sh/", # False result
    "https://api.myshiptracking.com/", # False result
]

In [85]:
for link in drop_api_link_result:
    df.loc[df["api_link_ddg"] == link, "api_link_ddg"] = None

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [None]:
df[["platform_url","api_link_ddg"]].head(10)

Unnamed: 0,platform_url,api_link_ddg
0,https://www.anecdata.org/,https://www.anecdata.org/pages/api
1,https://health-study.zoe.com/,
2,https://lnt.org/our-work/citizen-science/,
3,https://birdnet.cornell.edu,
4,https://observer.globe.gov/,https://www.globe.gov/globe-data/globe-api
5,https://www.quantifiedcitizen.com/,
6,https://www.spotfire.com/,
7,https://www.brainexplorer.net,
8,https://www.spotteron.net/apps/regional-commun...,
9,https://thehappinessproject.app,https://github.com/jfaccioli/happiness-project


**duckduckgo query limit**: The search operation limit is set at 30 requests per minute, while content fetching is limited to 20 requests per minute. 

In [88]:
from googlesearch import search

def google_search(co_name):
    query = f"{co_name} api"
    try:
        resultados = list(search(query, num_results=1, lang="en"))
        return resultados[0] if resultados else None
    except Exception as e:
        print(f"Error buscando {co_name}: {e}")
        return None

df['api_link_google'] = df['name'].apply(duckduckgo_search)

In [89]:
# Remove links to home
df.loc[df.platform_url.str.rstrip("/") == df.api_link_google.str.rstrip("/"), "api_link_google"] = None

Google query limit: max 100 queries per day.

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [None]:
df.api_link_google.head(10)

0                    https://www.anecdata.org/pages/api
1                    https://api.covidradar.org/launch/
2     https://lnt.org/our-work/citizen-science/citiz...
3      https://github.com/BirdNET-Team/BirdNET-Analyzer
4            https://www.globe.gov/globe-data/globe-api
5                    https://api.quantifiedcitizen.com/
6                                                  None
7                https://mouse.brain-map.org/static/api
8                                                  None
9        https://github.com/jfaccioli/happiness-project
10     https://github.com/stcalica/citizen-science-apis
11    https://www.wildlifeacoustics.com/products/ech...
12    http://developer.zooniverse.org/en/latest/apis...
13                      https://www.usanpn.org/node/223
14                                                 None
15                                                 None
16                          https://www.iseechange.com/
17                                              

**Missing**: remove manually links from `api_link_google`.

In [92]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## organization_of_managers

Find propietary in whois register:

In [93]:
import whois
import pandas as pd

def get_whois_organization(url):
    try:
        domain = url.split("//")[-1].split("/")[0]  # extract domain
        info = whois.whois(domain)
        return info.org or info.name
    except Exception as e:
        return None

df['org_managers_whois'] = df['platform_url'].apply(get_whois_organization)

2025-07-16 13:54:54,382 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Nombre o servicio desconocido


In [94]:
df.org_managers_whois.info()

<class 'pandas.core.series.Series'>
RangeIndex: 45 entries, 0 to 44
Series name: org_managers_whois
Non-Null Count  Dtype 
--------------  ----- 
32 non-null     object
dtypes: object(1)
memory usage: 492.0+ bytes


In [95]:
df.org_managers_whois.value_counts()

org_managers_whois
Domains By Proxy, LLC                                        7
REDACTED FOR PRIVACY                                         4
Not Disclosed                                                3
DDQ B.V.                                                     2
Leave No Trace, Inc.                                         1
Privacy service provided by Withheld for Privacy ehf         1
Faunawatch                                                   1
Cloud Software Group, Inc.                                   1
Identity Protection Service                                  1
Chris Lintott                                                1
Ocean Wise                                                   1
[Data Protected, Not Disclosed]                              1
NameBrightPrivacy.com                                        1
Contact Privacy Inc. Customer 0173889823                     1
MARIS BV                                                     1
Natural Apptitude                   

Many wrong results: data not available due to privacy limits (results like "Not Disclosed", "REDACTED FOR PRIVACY"...), and others not visible ("Domains By Proxy, LLC", "Privacy service provided by Withheld for Privacy ehf"...). 

Whois is not a good source of information in this case.

In [96]:
import requests
from bs4 import BeautifulSoup

def get_organization_from_footer(url):
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        footer = soup.find('footer')
        if footer:
            return footer.get_text(strip=True)
    except:
        return None

df['org_managers_footer'] = df['platform_url'].apply(get_organization_from_footer)

In [97]:
df.org_managers_footer.info()

<class 'pandas.core.series.Series'>
RangeIndex: 45 entries, 0 to 44
Series name: org_managers_footer
Non-Null Count  Dtype 
--------------  ----- 
31 non-null     object
dtypes: object(1)
memory usage: 492.0+ bytes


NOTE: Both strategies need manual cleaning.

In [98]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## platform License

- Extract license from footer using regular expressions:

In [99]:
import re
import requests
from bs4 import BeautifulSoup

def detect_license_footer(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, timeout=8, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        footer = soup.find('footer')
        if not footer:
            return None

        texto = footer.get_text(separator=' ', strip=True).lower()

        patrones = [
            # Copyright
            r'(©\s*\d{4}\s*[\w\s\.\-&,]+)',
            r'(©\s*[\w\s\.\-&,]+)',
            r'(copyright\s+\d{4}\s+[\w\s\.\-&,]+)',
            r'(todos los derechos reservados)',
            r'(all rights reserved)',
            r'(tous droits réservés)',
            r'(alle rechte vorbehalten)',
            r'(tutti i diritti riservati)',

            # Creative Commons
            r'(creative commons[^\n]{0,100})',
            r'(cc\s*(by|by-sa|by-nc|by-nd|0)[\s\-0-9\.]*)',

            # Otras licencias
            r'(licensed under the mit license)',
            r'(mit license)',
            r'(gpl license)',
            r'(licencia.*mit)',
            r'(licencia.*gpl)',
            r'(apache license)',
            r'(bsd license)',
        ]

        for pat in patrones:
            match = re.search(pat, texto, re.IGNORECASE)
            if match:
                return match.group(0).strip().capitalize()

        return None  # Nada encontrado
    except:
        return None


In [100]:
df['software_license_footer'] = df['platform_url'].apply(detect_license_footer)
df.software_license_footer.info()

<class 'pandas.core.series.Series'>
RangeIndex: 45 entries, 0 to 44
Series name: software_license_footer
Non-Null Count  Dtype 
--------------  ----- 
12 non-null     object
dtypes: object(1)
memory usage: 492.0+ bytes


In [102]:
df[["platform_url", "software_license_footer"]].head(10)

Unnamed: 0,platform_url,software_license_footer
0,https://www.anecdata.org/,
1,https://health-study.zoe.com/,© 2025 zoe limited
2,https://lnt.org/our-work/citizen-science/,
3,https://birdnet.cornell.edu,"© 2025 cornell university, chemnitz university..."
4,https://observer.globe.gov/,
5,https://www.quantifiedcitizen.com/,
6,https://www.spotfire.com/,
7,https://www.brainexplorer.net,
8,https://www.spotteron.net/apps/regional-commun...,
9,https://thehappinessproject.app,


- Extract license from the whole homepage using regular expressions:

In [103]:
import re
import requests
from bs4 import BeautifulSoup

def detect_license(text):
    text = text.replace('\n', ' ').replace('\r', ' ').lower()

    patterns = [
        # Common copyright (symbol or world)
        r'©\s*\d{4}\s*[\w\s\.\-&,]+',
        r'copyright\s+[\w\s\.\-&,]*\d{4}?',
        r'todos los derechos reservados',
        r'all rights reserved',
        r'tous droits réservés',
        r'alle rechte vorbehalten',
        r'tutti i diritti riservati',

        # Creative Commons
        r'creative commons[^<\n]{0,100}',
        r'cc\s*(by|by-sa|by-nc|by-nd|0)[\s\-0-9\.]*',

        # Other licenses
        r'mit license',
        r'licensed under the mit license',
        r'gpl license',
        r'licensed under the gpl',
        r'bsd license',
        r'apache license',
        r'european union public licence',
        r'licencia.*mit',
        r'licencia.*gpl',
        r'licencia.*apache',
        r'licencia.*bsd',
        r'licenza.*',
        r'lizenz.*',
        r'licence.*'
    ]

    for pat in patterns:
        match = re.search(pat, text, re.IGNORECASE)
        if match:
            return match.group(0).strip()

    return None

def extract_license(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(url, timeout=10, headers=headers)
        soup = BeautifulSoup(resp.text, 'html.parser')
        texto = soup.get_text(separator=' ', strip=True)
        return detect_license(texto)
    except Exception as e:
        return None

df['software_license_home'] = df['platform_url'].apply(extract_license)

In [104]:
df.software_license_home.info()

<class 'pandas.core.series.Series'>
RangeIndex: 45 entries, 0 to 44
Series name: software_license_home
Non-Null Count  Dtype 
--------------  ----- 
17 non-null     object
dtypes: object(1)
memory usage: 492.0+ bytes


In [105]:
df.software_license_home.head(10)

0                                                 None
1                                   © 2025 zoe limited
2    © 2025 leave no trace leave no trace is a regi...
3    © 2025 cornell university, chemnitz university...
4                                                 None
5                                                 None
6    © 2025 cloud software group, inc. all rights r...
7                                                 None
8                                                 None
9                                                 None
Name: software_license_home, dtype: object

In [106]:
# Test for web with not copyright license
extract_license("https://minka-sdg.org")

'creative commons attribution 4.0 license. minka incorporates consent commons icons, allowing for easy visual compreh'

In [107]:
# Keep last results and remove unnecesary columns
df['software_license'] = df['software_license_home']
df.drop(columns=["software_license_footer", "software_license_home"], inplace=True)

In [108]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## platform_email

Search for email in home:

In [109]:
def get_email(url):
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, timeout=10, headers=headers)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        emails = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", text)
        return emails[0] if emails else None
    except Exception as e:
        return None

In [110]:
df['email_from_home'] = df['platform_url'].apply(get_email)
df.email_from_home.info()

<class 'pandas.core.series.Series'>
RangeIndex: 45 entries, 0 to 44
Series name: email_from_home
Non-Null Count  Dtype 
--------------  ----- 
13 non-null     object
dtypes: object(1)
memory usage: 492.0+ bytes


In [111]:
df.email_from_home.value_counts()

email_from_home
office@spotteron.netphone          3
ccb-birdnet@cornell.edu            1
sales2025@wildlifeacoustics.com    1
martelst@units.it                  1
src@sta.uwi.edu                    1
birdtrack@bto.org                  1
pollinators@wisc.edu               1
caterpillarscount@gmail.com        1
Byinfo@build4people.org            1
a.mechelli@kcl.ac.uka.mechelli     1
info@medusapp.net                  1
Name: count, dtype: int64

In [112]:
df['email_from_about'] = df['platform_about_url'].apply(get_email)

In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [114]:
df[["email_from_home", "email_from_about"]].head(10)

Unnamed: 0,email_from_home,email_from_about
0,,
1,,
2,,
3,ccb-birdnet@cornell.edu,ccb-birdnet@cornell.edu
4,,
5,,
6,,
7,,
8,office@spotteron.netphone,office@spotteron.netphone
9,,


In [115]:
def get_email_from_possible_pages(base_url):
    contact_paths = [
        '', '/contact', '/contact-us', '/contacto', '/a-propos', '/kontakt',
        '/contatti', '/contato', '/assistance', '/chi-siamo', '/uber-uns',
        '/soporte', '/support', '/supporto', '/ajuda', '/acerca-de', '/sobre',
        '/hilfe', '/steun', '/over-ons'
    ]
    for path in contact_paths:
        email = get_email(base_url.rstrip('/') + path)
        if email:
            return email
    return None

df['email_from_contact'] = df['platform_url'].apply(get_email_from_possible_pages)

In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [117]:
df[df.email_from_home.notnull() | df.email_from_about.notnull() | df.email_from_contact.notnull()]

Unnamed: 0,name,developer,description,url,language,platform_url,active,year_creation,platform_about_url,terms_use_link,...,governance_url,api_link,api_link_ddg,api_link_google,org_managers_whois,org_managers_footer,software_license,email_from_home,email_from_about,email_from_contact
3,BirdNET,Stefan Kahl,How can computers learn to recognize birds fro...,https://apps.apple.com/us/app/birdnet/id154184...,"['CS', 'NL', 'EN', 'FR', 'DE', 'IT', 'LT', 'PL...",https://birdnet.cornell.edu,Status code: 200,1985.0,https://birdnet.cornell.edu/map,,...,,https://birdnet.cornell.edu/api/,,https://github.com/BirdNET-Team/BirdNET-Analyzer,,"© 2025 Cornell University, Chemnitz University...","© 2025 cornell university, chemnitz university...",ccb-birdnet@cornell.edu,ccb-birdnet@cornell.edu,ccb-birdnet@cornell.edu
6,spotFIRE | Citizen Science,SPOTTERON Gmbh,SpotFIRE is a Citizen Science project focusing...,https://apps.apple.com/us/app/spotfire-citizen...,"['EN', 'DE']",https://www.spotfire.com/,Status code: 200,1996.0,https://www.spotfire.com/about,,...,,,,,"Cloud Software Group, Inc.",By Verified EngineerRead more,"© 2025 cloud software group, inc. all rights r...",,,support@spotfire.com
8,WaterLinx | Citizen Science,SPOTTERON Gmbh,Water constitutes and connects all life forms ...,https://apps.apple.com/us/app/waterlinx-citize...,"['EN', 'DE']",https://www.spotteron.net/apps/regional-commun...,Status code: 200,2016.0,https://www.spotteron.net/about,https://www.spotteron.net/terms-of-use,...,https://www.spotteron.net/citizen-science-app-...,,,,Not Disclosed,ContactINTERESTED? GET IN CONTACT TODAY!eMail:...,,office@spotteron.netphone,office@spotteron.netphone,office@spotteron.netphone
9,The Happiness Project,UCL,Play fun mini-games to help brain scientists s...,https://apps.apple.com/us/app/the-happiness-pr...,['EN'],https://thehappinessproject.app,Status code: 200,,,https://thehappinessproject.app/terms-and-cond...,...,,,https://github.com/jfaccioli/happiness-project,https://github.com/jfaccioli/happiness-project,,Happiness NewsletterSubscribe to hear about ne...,,,,Kingdomrutledgelab@gmail.com
11,Echo Meter Touch Bat Detector,Wildlife Acoustics,Using the Echo Meter Touch 2 Pro plug-in modul...,https://apps.apple.com/us/app/echo-meter-touch...,['EN'],https://www.echometertouch.com,Status code: 200,2015.0,https://www.wildlifeacoustics.com/about-us,https://www.wildlifeacoustics.com/legal-docume...,...,,,https://www.wildlifeacoustics.com/products/ech...,https://www.wildlifeacoustics.com/products/ech...,Identity Protection Service,,,sales2025@wildlifeacoustics.com,sales2025@wildlifeacoustics.com,sales2025@wildlifeacoustics.com
12,Zooniverse,Chris Lintott,The Zooniverse is the world’s largest and most...,https://apps.apple.com/us/app/zooniverse/id119...,['EN'],https://www.zooniverse.org/,Status code: 200,2008.0,https://www.zooniverse.org/about,,...,,,http://developer.zooniverse.org/en/latest/apis...,http://developer.zooniverse.org/en/latest/apis...,Chris Lintott,ZooniversePeople-Powered ResearchProjectsArtsB...,,,contact@zooniverse.org.ZooniversePeople-Powered,contact@zooniverse.org.ZooniversePeople-Powered
14,AmphiApp | Citizen Science,SPOTTERON Gmbh,Amphibien gehören zu den am stärksten bedrohte...,https://apps.apple.com/us/app/amphiapp-citizen...,"['EN', 'DE']",https://www.spotteron.net/apps/regional-commun...,Status code: 200,2016.0,https://www.spotteron.net/about,https://www.spotteron.net/terms-of-use,...,https://www.spotteron.net/citizen-science-app-...,,,,Not Disclosed,ContactINTERESTED? GET IN CONTACT TODAY!eMail:...,,office@spotteron.netphone,office@spotteron.netphone,office@spotteron.netphone
19,CSMON-LIFE segnalazioni,Divulgando Srl,CSMON-LIFE (Citizen Science MONitoring) is one...,https://apps.apple.com/us/app/csmon-life-segna...,"['EN', 'IT']",http://www.csmon-life.eu,Status code: 200,,https://www.csmon-life.eu/pagina/progetto/18/A...,,...,,,,,,Primo progetto italiano di citizen science sul...,copyright 2014,martelst@units.it,martelst@units.it,martelst@units.it
24,BirdWeather,Scribe Labs Inc,BirdWeather: Your Gateway to Nature's Soundsca...,https://apps.apple.com/us/app/birdweather/id64...,"['AF', 'AR', 'CS', 'DA', 'NL', 'EN', 'FI', 'FR...",https://www.birdweather.com/,Status code: 200,2017.0,https://www.birdweather.com/about,https://www.birdweather.com/terms-of-service,...,,https://app.birdweather.com/api/index.html,https://app.birdweather.com/api/index.html,https://app.birdweather.com/api/index.html,NameBrightPrivacy.com,,,,tim@birdweather.com,
25,myHAZ-VCT,British Geological Survey,myHAZ-VCT is a citizen science app for sharing...,https://apps.apple.com/us/app/myhaz-vct/id1507...,['EN'],https://uwiseismic.com/myhaz-vct/,Status code: 200,2001.0,,,...,,,https://vct.myhaz.app/docs/index.html,https://vct.myhaz.app/docs/index.html,Contact Privacy Inc. Customer 0173889823,,© 2001-2025 the university of the west indies...,src@sta.uwi.edu,,src@sta.uwi.edu


Result: 18 platforms has some email between the three strategies, looking inside the web.

- Merge results of three columns in one:

In [118]:
# Mix three columns in one
df["email"] = df['email_from_home']
df.loc[df.email.isnull(), "email"] = df.loc[df.email.isnull(), "email_from_about"]
df.loc[df.email.isnull(), "email"] = df.loc[df.email.isnull(), "email_from_contact"]

- Remove unnecessary columns:

In [119]:
df.drop(columns=["email_from_home", "email_from_about", "email_from_contact"], inplace=True)

In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

In [121]:
df.to_csv("../data/ios_apps_platforms_test.csv", index=False)

## country of managers

In [122]:
import whois

def get_country_from_whois(domain):
    try:
        w = whois.whois(domain)
        return w.get('country')
    except:
        return None

df['country_from_whois'] = df['platform_url'].apply(get_country_from_whois)

2025-07-16 14:51:44,348 - whois.whois - ERROR - Error trying to connect to socket: closing socket - [Errno -2] Nombre o servicio desconocido


In [123]:
# Remove "REDACTED FOR PRIVACY" values
df.loc[df['country_from_whois'] == "REDACTED FOR PRIVACY", "country_from_whois"] = None 

In [125]:
df.country_from_whois.head(10)

0      IS
1      US
2      US
3    None
4    None
5    None
6      US
7      CH
8      AT
9    None
Name: country_from_whois, dtype: object

In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   name                        45 non-null     object 
 1   developer                   45 non-null     object 
 2   description                 45 non-null     object 
 3   url                         45 non-null     object 
 4   language                    45 non-null     object 
 5   platform_url                45 non-null     object 
 6   active                      45 non-null     object 
 7   year_creation               41 non-null     float64
 8   platform_about_url          39 non-null     object 
 9   terms_use_link              18 non-null     object 
 10  privacy_link                30 non-null     object 
 11  code_repository             1 non-null      object 
 12  code_repository_github_api  8 non-null      object 
 13  code_language               27 non-nu

- Determine the country from which a website operates based on the server's IP address, using a geolocation API. This allows us to know where the servers (the data) are physically hosted. Example: a French organization may have its servers in the US.

In [127]:
# Example with ip-api.com
from urllib.parse import urlparse
import socket
import requests

def country_from_ip(url):
    try:
        # Extract domain (without http/https or routes)
        parsed = urlparse(url)
        domain = parsed.netloc or parsed.path  # in case the URL doesn't have a scheme

        # Remove www if necessary
        if domain.startswith('www.'):
            domain = domain[4:]

        # Resolve IP
        ip = socket.gethostbyname(domain)

        # Query geolocalization API
        r = requests.get(f"http://ip-api.com/json/{ip}", timeout=10)
        if r.status_code == 200:
            time.sleep(1.4) # limit of 45 requests per minute
            return r.json().get("country")
    except Exception as e:
        return None
    
df['country_servers'] = df['platform_url'].apply(country_from_ip)

In [128]:
df.country_servers.info()

<class 'pandas.core.series.Series'>
RangeIndex: 45 entries, 0 to 44
Series name: country_servers
Non-Null Count  Dtype 
--------------  ----- 
45 non-null     object
dtypes: object(1)
memory usage: 492.0+ bytes


In [129]:
# Test
country_from_ip("https://minka-sdg.org")

'Spain'

In [131]:
df[["platform_url", "country_servers"]].head(10)

Unnamed: 0,platform_url,country_servers
0,https://www.anecdata.org/,United States
1,https://health-study.zoe.com/,United States
2,https://lnt.org/our-work/citizen-science/,Canada
3,https://birdnet.cornell.edu,Germany
4,https://observer.globe.gov/,United States
5,https://www.quantifiedcitizen.com/,United States
6,https://www.spotfire.com/,United States
7,https://www.brainexplorer.net,Switzerland
8,https://www.spotteron.net/apps/regional-commun...,France
9,https://thehappinessproject.app,United States
