# Android Play Store

In [5]:
# First attempt: simple extraction from Google Play Store

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Play Store search page for the query "citizen science", restricted to apps.
# NOTE(review): `hl=en-es` is an unusual locale value — confirm it is intended.
url = "https://play.google.com/store/search?q=citizen%20science&c=apps&hl=en-es"
# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Presumably one <div> per result card; the class names are copied from the
# page markup and may stop matching if the site changes — TODO confirm.
apps = soup.find_all("div", class_="VfPpkd-EScbFb-JIbuQc TAQqTe")

results = []

for card in apps:
    # Look every element up once and guard all three the same way, so a card
    # without a link no longer aborts the whole loop with a TypeError.
    name_tag = card.find("span", class_="DdYX5")
    developer_tag = card.find("span", class_="wMUdtb")
    link_tag = card.find("a", class_="Si6A0c Gy4nib")
    href = link_tag.get("href") if link_tag is not None else None

    results.append({
        "name": name_tag.text if name_tag is not None else "No name",
        "developer": developer_tag.text if developer_tag is not None else "No developer",
        # None (rather than a placeholder string) so missing links show up as
        # nulls in DataFrame.info().
        "url": f"https://play.google.com{href}" if href else None,
    })

In [6]:
# Build a DataFrame from the scraped records and print its column summary.
results_android = pd.DataFrame(results)
results_android.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       30 non-null     object
 1   developer  30 non-null     object
 2   url        30 non-null     object
dtypes: object(3)
memory usage: 852.0+ bytes


The Google Play Store search page returns at most 30 results for this query, so we try another approach to get more results.

In [7]:
from google_play_scraper import search

# Query the Play Store through google_play_scraper instead of parsing HTML.
apps = search(
    "citizen science",
    lang="en",  # Language of the results (e.g. "es" for Spanish)
    n_hits=100  # Number of results requested
)

# Built with a comprehension so that no loop variable named `app` is left in
# the notebook namespace, where it would shadow `google_play_scraper.app`.
results = [
    {
        "name": hit['title'],
        "developer": hit['developer'],
        "url": f"https://play.google.com/store/apps/details?id={hit['appId']}",
        "description": hit['description'],
    }
    for hit in apps
]

In [8]:
# Rebuild results_android from the google_play_scraper records and print its
# column summary.
results_android = pd.DataFrame(results)
results_android.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         30 non-null     object
 1   developer    30 non-null     object
 2   url          30 non-null     object
 3   description  30 non-null     object
dtypes: object(4)
memory usage: 1.1+ KB


In [9]:
from google_play_scraper import search, app
import time
import random

def extended_search_strategy(search_terms=None):
    """
    Search the Play Store with several related terms and merge the hits.

    Args:
        search_terms: Optional iterable of query strings. When omitted, the
            built-in list of citizen-science-related terms is used.

    Returns:
        list[dict]: One record per distinct ``appId`` with the keys ``name``,
        ``developer``, ``appId``, ``url``, ``description`` and ``search_term``
        (the first term that returned the app), in order of first appearance.

    Any exception raised while handling a term is printed and that term is
    skipped; hits collected before the failure are kept.
    """
    if search_terms is None:
        search_terms = [
            "citizen science",
            "citizen science projects",
            "science citizen",
            "community science",
            "crowdsourcing science",
            "scientific research",
            "data collection science",
            "environmental monitoring",
            "biodiversity research",
            "climate science",
            "astronomy citizen",
            "bird watching science",
            "nature observation",
            "scientific community",
            "research participation"
        ]

    all_apps = {}  # Keyed by appId so each app is kept only once

    for term in search_terms:
        print(f"Searching: {term}")
        try:
            for hit in search(term, lang="en", n_hits=50):
                if hit['appId'] not in all_apps:
                    all_apps[hit['appId']] = {
                        "name": hit['title'],
                        "developer": hit['developer'],
                        "appId": hit['appId'],
                        "url": f"https://play.google.com/store/apps/details?id={hit['appId']}",
                        "description": hit['description'],
                        "search_term": term  # Records which term found the app
                    }
        except Exception as e:
            print(f"Error with term '{term}': {e}")
        finally:
            # Short pause to avoid rate limiting. It runs after failed
            # requests too, so a run of errors does not turn into a burst of
            # back-to-back requests.
            time.sleep(random.uniform(1, 3))

    return list(all_apps.values())

def category_based_search(categories=None):
    """
    Search the Play Store with category-specific terms and merge the hits.

    Each term is queried as ``"<term> research"``.

    Args:
        categories: Optional mapping of category label -> list of terms. When
            omitted, the built-in astronomy / biology / environment / health /
            geography mapping is used.

    Returns:
        list[dict]: One record per distinct ``appId`` with the keys ``name``,
        ``developer``, ``appId``, ``url``, ``description``, ``category`` and
        ``search_term`` (the category and term that first returned the app).

    Any exception raised while handling a term is printed and that term is
    skipped; hits collected before the failure are kept.
    """
    if categories is None:
        # More specific terms, grouped by category
        categories = {
            "astronomy": ["star map", "constellation", "sky guide", "planet", "telescope"],
            "biology": ["species identification", "plant identification", "bird identification", "wildlife tracking"],
            "environment": ["air quality", "water quality", "pollution monitoring", "climate tracking"],
            "health": ["symptom tracker", "disease surveillance", "health monitoring"],
            "geography": ["mapping", "geographic survey", "land use", "satellite imagery"]
        }

    all_apps = {}  # Keyed by appId so each app is kept only once

    for category, terms in categories.items():
        print(f"\n--- Category: {category} ---")
        for term in terms:
            try:
                for hit in search(f"{term} research", lang="en", n_hits=30):
                    if hit['appId'] not in all_apps:
                        all_apps[hit['appId']] = {
                            "name": hit['title'],
                            "developer": hit['developer'],
                            "appId": hit['appId'],
                            "url": f"https://play.google.com/store/apps/details?id={hit['appId']}",
                            "description": hit['description'],
                            "category": category,
                            "search_term": term
                        }
            except Exception as e:
                print(f"Error con '{term}': {e}")
            finally:
                # Pause between requests, including after a failed one, so
                # repeated errors do not produce back-to-back requests.
                time.sleep(random.uniform(1, 2))

    return list(all_apps.values())

def filter_relevant_apps(apps, min_matches=2):
    """
    Keep the apps whose title or description mention enough keywords.

    Each keyword counts at most once per app, whether it appears in the
    title, the description, or both. Matching is a case-insensitive
    substring test.

    Args:
        apps: Iterable of dicts with ``name`` and ``description`` entries.
            A missing or ``None`` value is treated as empty text.
        min_matches: Minimum number of distinct keywords an app must match
            to be kept (default 2).

    Returns:
        list[dict]: The kept app dicts, sorted by descending
        ``relevance_score``; ties keep their input order. The score is
        written into each kept dict in place under ``relevance_score``.
    """
    citizen_science_keywords = [
        'citizen science', 'community science', 'crowdsourcing', 'research',
        'data collection', 'scientific', 'monitoring', 'observation',
        'survey', 'biodiversity', 'environmental', 'climate', 'conservation',
        'tracking', 'identification', 'measurement', 'recording'
    ]

    relevant_apps = []

    for entry in apps:
        # `or ''` covers both a missing key and an explicit None, which would
        # otherwise raise on .lower() and abort the whole filtering pass.
        description_lower = (entry.get('description') or '').lower()
        title_lower = (entry.get('name') or '').lower()

        # Count keywords found in the title or the description
        matches = sum(
            1
            for keyword in citizen_science_keywords
            if keyword in description_lower or keyword in title_lower
        )

        if matches >= min_matches:
            entry['relevance_score'] = matches
            relevant_apps.append(entry)

    # Sort by relevance, highest first
    return sorted(relevant_apps, key=lambda x: x['relevance_score'], reverse=True)

def get_detailed_app_info(app_ids):
    """
    Fetch the detailed Play Store listing for each app id.

    Args:
        app_ids: Iterable of Play Store application ids.

    Returns:
        list[dict]: One record per successfully fetched app with the keys
        ``name``, ``developer``, ``appId``, ``url``, ``description``,
        ``category`` (the listing's ``genre``), ``installs`` and ``updated``.
        The last three fall back to ``'Unknown'`` when absent.

    Any exception raised while fetching or reading a listing is printed and
    that app id is skipped.
    """
    # Imported locally under an alias on purpose: notebook cells use `app` as
    # a top-level loop variable (`for app in apps:`), which rebinds the global
    # `app` to a dict. With the global name, every lookup would then fail
    # inside the except below and this function would silently return [].
    from google_play_scraper import app as fetch_app_details

    detailed_apps = []

    for app_id in app_ids:
        try:
            detailed_info = fetch_app_details(app_id)
            detailed_apps.append({
                "name": detailed_info['title'],
                "developer": detailed_info['developer'],
                "appId": app_id,
                "url": detailed_info['url'],
                "description": detailed_info['description'],
                "category": detailed_info.get('genre', 'Unknown'),
                "installs": detailed_info.get('installs', 'Unknown'),
                "updated": detailed_info.get('updated', 'Unknown'),
            })
        except Exception as e:
            print(f"Error getting details of {app_id}: {e}")
        finally:
            # Pause between requests, including after a failed one.
            time.sleep(random.uniform(0.5, 1.5))

    return detailed_apps

def comprehensive_citizen_science_search():
    """
    Run the whole pipeline and return detailed records for the relevant apps.

    Steps: term-based search, category-based search, merge by ``appId``
    (a category hit replaces a term hit with the same id), relevance
    filtering, then a detail lookup for every app that passed the filter.
    Progress is printed along the way.
    """
    print("=== Comprehensive search for Citizen Science apps ===\n")

    print("Search by related terms...")
    term_hits = extended_search_strategy()
    print(f"Found: {len(term_hits)} unique apps")

    print("\nSearch by specific categories...")
    category_hits = category_based_search()
    print(f"Found: {len(category_hits)} unique apps")

    # Merge both result lists; later entries win on duplicate ids while the
    # position of the first occurrence is kept.
    merged = {entry['appId']: entry for entry in term_hits + category_hits}
    print(f"\nTotal unique apps found: {len(merged)}")

    print("\nFiltering apps by relevance...")
    shortlisted = filter_relevant_apps(list(merged.values()))
    print(f"Relevant apps: {len(shortlisted)}")

    print("\nObtaining detailed information...")
    return get_detailed_app_info([entry['appId'] for entry in shortlisted])

In [10]:
# Run the full pipeline: both search strategies, the relevance filter and the
# per-app detail lookup. The helpers sleep between requests, so this cell
# takes a while to finish.
results = comprehensive_citizen_science_search()

print("\n=== FINAL RESULTS ===")
print("Total apps found:", len(results))

# Tabulate the detailed records.
import pandas as pd

results_df = pd.DataFrame(data=results)

=== Comprehensive search for Citizen Science apps ===

Search by related terms...
Searching: citizen science
Searching: citizen science projects
Searching: science citizen
Searching: community science
Searching: crowdsourcing science
Searching: scientific research
Searching: data collection science
Searching: environmental monitoring
Searching: biodiversity research
Searching: climate science
Searching: astronomy citizen
Searching: bird watching science
Searching: nature observation
Searching: scientific community
Searching: research participation
Found: 311 unique apps

Search by specific categories...

--- Category: astronomy ---

--- Category: biology ---

--- Category: environment ---

--- Category: health ---

--- Category: geography ---
Found: 308 unique apps

Total unique apps found: 576

Filtering apps by relevance...
Relevant apps: 160

Obtaining detailed information...

=== FINAL RESULTS ===
Total apps found: 160


In [11]:
# Print the column summary (non-null counts and dtypes) of the detailed results.
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         160 non-null    object
 1   developer    160 non-null    object
 2   appId        160 non-null    object
 3   url          160 non-null    object
 4   description  160 non-null    object
 5   category     160 non-null    object
 6   installs     160 non-null    object
 7   updated      160 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 10.1+ KB


In [13]:
# Save the detailed Android results as CSV, without the index column.
# NOTE(review): presumably ../data must already exist — confirm.
results_df.to_csv("../data/results_android.csv", index=False)

# iOS store results

In [18]:
# iTunes Search API request for software matching "citizen science".
url = "https://itunes.apple.com/search"
params = {
    "term": "citizen science",
    "entity": "software",
    "limit": 200       # Number of results requested
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of failing later with an
# obscure JSON or KeyError on an error response.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
apps = response.json()["results"]

In [20]:
# Display one raw record (index 1) from the response's "results" list to see
# which fields are available.
apps[1]

{'artworkUrl512': 'https://is1-ssl.mzstatic.com/image/thumb/Purple211/v4/ec/0b/75/ec0b75f5-9525-4338-a545-cdd6cf0ee352/AppIcon-0-0-1x_U007emarketing-0-8-0-0-85-220.png/512x512bb.jpg',
 'screenshotUrls': ['https://is1-ssl.mzstatic.com/image/thumb/Purple123/v4/b3/d1/2c/b3d12cd5-ec58-32bf-d778-5e17a8f3af51/mzl.upzuymst.png/392x696bb.png',
  'https://is1-ssl.mzstatic.com/image/thumb/Purple123/v4/f4/5d/0a/f45d0a2d-c0a9-823d-9992-8b142173b308/mzl.yjitautu.png/392x696bb.png',
  'https://is1-ssl.mzstatic.com/image/thumb/Purple123/v4/db/9a/79/db9a798a-d62e-4892-4066-87327dfb1078/mzl.hxnbptjg.png/392x696bb.png',
  'https://is1-ssl.mzstatic.com/image/thumb/Purple113/v4/38/2a/18/382a18ef-e04e-fb3c-b173-b59dfff477d9/mzl.sfwqlagw.png/392x696bb.png'],
 'isGameCenterEnabled': False,
 'features': ['iosUniversal'],
 'supportedDevices': ['iPhone5s-iPhone5s',
  'iPadAir-iPadAir',
  'iPadAirCellular-iPadAirCellular',
  'iPadMiniRetina-iPadMiniRetina',
  'iPadMiniRetinaCellular-iPadMiniRetinaCellular',
  'i

In [21]:
import requests
import numpy as np

# iTunes Search API request for software matching "citizen science".
url = "https://itunes.apple.com/search"
params = {
    "term": "citizen science",
    "entity": "software",
    "limit": 200       # Number of results requested
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of failing later with an
# obscure JSON or KeyError on an error response.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
apps = response.json()["results"]
print(len(apps))

# Built with a comprehension so that no loop variable named `app` is left in
# the notebook namespace, where it would shadow `google_play_scraper.app`.
results = [
    {
        "name": entry['trackName'],
        "url": entry['trackViewUrl'],
        "developer": entry['artistName'],
        # NaN (not the string 'N/A') for a missing rating, so the column stays
        # numeric and matches how a missing platform_url is marked.
        "rating": entry.get('averageUserRating', np.nan),
        "currency": entry['currency'],
        "description": entry['description'],
        "language": entry['languageCodesISO2A'],
        "platform_url": entry.get('sellerUrl', np.nan),
    }
    for entry in apps
]

188


In [22]:
# Build a DataFrame from the iOS records and print its column summary.
results_ios = pd.DataFrame(results)
results_ios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          188 non-null    object 
 1   url           188 non-null    object 
 2   developer     188 non-null    object 
 3   rating        188 non-null    float64
 4   currency      188 non-null    object 
 5   description   188 non-null    object 
 6   language      188 non-null    object 
 7   platform_url  129 non-null    object 
dtypes: float64(1), object(7)
memory usage: 11.9+ KB


In [23]:
# Save the iOS results as CSV, without the index column.
# NOTE(review): presumably ../data must already exist — confirm.
results_ios.to_csv("../data/results_ios.csv", index=False)