In [None]:
# pip install requests pandas

In [None]:
import requests
import pandas as pd
import time

def fetch_all_glioma_trials():
    """Download every ClinicalTrials.gov study matching "glioma" into a CSV.

    Pages through the v2 ``/studies`` endpoint (1,000 studies per request,
    stopping at 10,000 studies or when the response carries no
    ``nextPageToken``) and writes one row per study with the columns
    ``NCT ID``, ``Title``, ``Status``, ``Phase`` and ``URL`` to
    ``clinical_trials_glioma_full.csv``.

    A failed request or a non-200 response stops paging early; whatever was
    collected up to that point is still written. Returns ``None``.
    """
    print("🔍 Fetching up to 10,000 glioma trials from ClinicalTrials.gov v2 API...")

    base_url = "https://clinicaltrials.gov/api/v2/studies"
    output_path = "clinical_trials_glioma_full.csv"
    page_size = 1000
    max_records = 10000
    next_token = None
    all_data = []
    total_fetched = 0

    while True:
        params = {
            "query.cond": "glioma",
            "pageSize": page_size,
            "format": "json",
        }
        if next_token:
            params["pageToken"] = next_token

        try:
            # A timeout keeps a stalled connection from hanging the run forever.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("❌ Request failed:", exc)
            break
        if response.status_code != 200:
            print("❌ API error:", response.status_code)
            break

        json_data = response.json()
        studies = json_data.get("studies", [])
        next_token = json_data.get("nextPageToken")

        total_fetched += len(studies)
        print(f"📄 Retrieved {len(studies)} studies (Total so far: {total_fetched})")

        for s in studies:
            mod = s.get("protocolSection", {})
            id_mod = mod.get("identificationModule", {})
            status_mod = mod.get("statusModule", {})
            design_mod = mod.get("designModule", {})

            nct = id_mod.get("nctId", "")
            # The v2 API reports phases as a list under "phases" (e.g.
            # ["PHASE1", "PHASE2"]); flatten it so the CSV cell is plain text.
            phases = design_mod.get("phases") or []
            if isinstance(phases, list):
                phase = "; ".join(str(p) for p in phases)
            else:
                phase = str(phases)

            all_data.append({
                "NCT ID": nct,
                "Title": id_mod.get("officialTitle", "") or id_mod.get("briefTitle", ""),
                "Status": status_mod.get("overallStatus", ""),
                "Phase": phase,
                "URL": f"https://clinicaltrials.gov/study/{nct}",
            })

        if not next_token or total_fetched >= max_records:
            break

        time.sleep(0.2)  # be polite to the server

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Done! Saved {len(df)} trials to {output_path}")

# Run the download only when this cell/script is executed directly.
if __name__ == "__main__":
    fetch_all_glioma_trials()


🔍 Fetching up to 10,000 glioma trials from ClinicalTrials.gov v2 API...
📄 Retrieved 1000 studies (Total so far: 1000)
📄 Retrieved 1000 studies (Total so far: 2000)
📄 Retrieved 1000 studies (Total so far: 3000)
📄 Retrieved 325 studies (Total so far: 3325)
✅ Done! Saved 3325 trials to clinical_trials_glioma_full.csv


所有的 column 都 scrape（scrape every column）


In [None]:
import requests
import pandas as pd
import time

# Safely extract a text block (handles dict / str / None)
def get_text_block_safe(section):
    """Return the text carried by *section*, or "" when it carries none.

    A str is returned unchanged, a dict yields its ``"textBlock"`` entry
    (``""`` if the key is absent), and any other value becomes ``""``.
    """
    if isinstance(section, str):
        return section
    if isinstance(section, dict):
        return section.get("textBlock", "")
    return ""

# Keyword-based tag generator
def get_tags(desc_mod, eligibility_mod):
    """Derive a "; "-joined string of topic tags for one study.

    Args:
        desc_mod: The study's ``descriptionModule`` dict; its ``briefSummary``
            and ``detailedDescription`` entries are scanned for keywords.
        eligibility_mod: The study's ``eligibilityModule`` dict; only
            ``minimumAge`` (e.g. ``"6 Months"``, ``"18 Years"``) is read.

    Returns:
        A string such as ``"imaging; chemotherapy"`` (``""`` when nothing
        matches). Tags always appear in the order imaging, liquid biopsy,
        surgery, chemotherapy, immunotherapy, pediatric.
    """
    import re  # local import keeps this notebook cell self-contained

    text = (get_text_block_safe(desc_mod.get("briefSummary", "")) or "") + " "
    text += get_text_block_safe(desc_mod.get("detailedDescription", "")) or ""
    text = text.lower()

    # "pet" and "spect" are matched as whole words only: as bare substrings
    # they also fire on "competitive", "prospective", "respectively", ...
    # ("mri" stays a substring match so "fmri" is still covered.)
    keyword_patterns = [
        ("imaging", r"mri|imaging|\b(?:pet|spect)\b"),
        ("liquid biopsy", r"liquid biopsy|ctdna|cfdna"),
        ("surgery", r"resection|post-surgery|surgical|postoperative"),
        ("chemotherapy", r"chemotherapy|temozolomide|tmz"),
        ("immunotherapy", r"immunotherapy|checkpoint|anti-pd|nivolumab|pembrolizumab"),
    ]
    tags = [tag for tag, pattern in keyword_patterns if re.search(pattern, text)]

    # Parse the minimum age numerically; a substring test for "0" would also
    # flag "20 Years", "30 Years", etc. as pediatric.
    min_age = str(eligibility_mod.get("minimumAge") or "").lower()
    age_match = re.match(r"\s*(\d+)\s*(year|month|week|day|hour|minute)", min_age)
    units_per_year = {
        "year": 1, "month": 12, "week": 52,
        "day": 365, "hour": 8760, "minute": 525600,
    }
    # A study counts as pediatric when it admits participants under 18 years.
    admits_minors = bool(age_match) and (
        int(age_match.group(1)) / units_per_year[age_match.group(2)] < 18
    )
    if re.search(r"pediatric|child", text) or admits_minors:
        tags.append("pediatric")

    return "; ".join(tags)

# Main routine
def fetch_all_glioma_trials_full():
    """Download every ClinicalTrials.gov study matching "glioma" into a CSV.

    Pages through the v2 ``/studies`` endpoint (1,000 studies per request,
    stopping at 10,000 studies or when the response carries no
    ``nextPageToken``), flattens each study's ``protocolSection`` into one
    row, and writes the rows to ``clinical_trials_glioma_all_columns.csv``.

    A failed request or a non-200 response stops paging early; whatever was
    collected up to that point is still written. Returns ``None``.
    """
    print("🔍 Fetching up to 10,000 glioma trials from ClinicalTrials.gov v2 API...")

    base_url = "https://clinicaltrials.gov/api/v2/studies"
    output_path = "clinical_trials_glioma_all_columns.csv"
    page_size = 1000
    max_records = 10000
    next_token = None
    all_data = []
    total_fetched = 0

    # Generic safe list joiner (defined once, not once per study)
    def safe_join(lst, key=None):
        """Join list items with "; ", reading ``key`` from dict items and skipping blanks."""
        if not isinstance(lst, list):
            return ""
        values = (item.get(key) if isinstance(item, dict) else item for item in lst)
        return "; ".join(str(v) for v in values if v not in (None, ""))

    while True:
        params = {
            "query.cond": "glioma",
            "pageSize": page_size,
            "format": "json",
        }
        if next_token:
            params["pageToken"] = next_token

        try:
            # A timeout keeps a stalled connection from hanging the run forever.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("❌ Request failed:", exc)
            break
        if response.status_code != 200:
            print("❌ API error:", response.status_code)
            break

        json_data = response.json()
        studies = json_data.get("studies", [])
        next_token = json_data.get("nextPageToken")

        for s in studies:
            prot = s.get("protocolSection", {})

            id_mod = prot.get("identificationModule", {})
            status_mod = prot.get("statusModule", {})
            design_mod = prot.get("designModule", {})
            desc_mod = prot.get("descriptionModule", {})
            sponsor_mod = prot.get("sponsorCollaboratorsModule", {})
            conditions_mod = prot.get("conditionsModule", {})
            intervention_mod = prot.get("armsInterventionsModule", {})
            eligibility_mod = prot.get("eligibilityModule", {})
            contacts_mod = prot.get("contactsLocationsModule", {})

            # Locations: v2 entries name the site under "facility" and the
            # country under "country"; the legacy-style keys are still
            # honoured if a response happens to carry them.
            locations = contacts_mod.get("locations", [])
            sites = [loc for loc in locations if isinstance(loc, dict)] if isinstance(locations, list) else []
            facilities = [loc.get("facility") or loc.get("locationFacility") for loc in sites]
            # dict.fromkeys de-duplicates while keeping first-seen order.
            countries = contacts_mod.get("locationCountries") or list(
                dict.fromkeys(loc.get("country") for loc in sites)
            )

            nct_id = id_mod.get("nctId", "")
            all_data.append({
                "NCT ID": nct_id,
                "Title": id_mod.get("officialTitle", "") or id_mod.get("briefTitle", ""),
                "Status": status_mod.get("overallStatus", ""),
                # The v2 API reports phases as a list under "phases".
                "Phase": safe_join(design_mod.get("phases", [])),
                "Study Type": design_mod.get("studyType", ""),
                "Start Date": status_mod.get("startDateStruct", {}).get("date", ""),
                "Completion Date": status_mod.get("completionDateStruct", {}).get("date", ""),
                "Primary Completion Date": status_mod.get("primaryCompletionDateStruct", {}).get("date", ""),
                "Sponsor": sponsor_mod.get("leadSponsor", {}).get("name", ""),
                "Collaborators": safe_join(sponsor_mod.get("collaborators", []), "name"),
                "Conditions": safe_join(conditions_mod.get("conditions", [])),
                "Interventions": safe_join(intervention_mod.get("interventions", []), "name"),
                "Minimum Age": eligibility_mod.get("minimumAge", ""),
                "Maximum Age": eligibility_mod.get("maximumAge", ""),
                "Gender": eligibility_mod.get("sex", ""),
                "Brief Summary": get_text_block_safe(desc_mod.get("briefSummary", "")),
                "Detailed Description": get_text_block_safe(desc_mod.get("detailedDescription", "")),
                "Locations": safe_join(facilities),
                "Countries": safe_join(countries),
                "Study URL": f"https://clinicaltrials.gov/study/{nct_id}",
                "Tags": get_tags(desc_mod, eligibility_mod),
            })

        total_fetched += len(studies)
        print(f"📄 Retrieved {len(studies)} (Total: {total_fetched})")

        if not next_token or total_fetched >= max_records:
            break

        time.sleep(0.2)  # brief pause between page requests

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Done! Saved {len(df)} trials to {output_path}")

# Run the download only when this cell/script is executed directly.
if __name__ == "__main__":
    fetch_all_glioma_trials_full()


🔍 Fetching up to 10,000 glioma trials from ClinicalTrials.gov v2 API...
📄 Retrieved 1000 (Total: 1000)
📄 Retrieved 1000 (Total: 2000)
📄 Retrieved 1000 (Total: 3000)
📄 Retrieved 325 (Total: 3325)
✅ Done! Saved 3325 trials to clinical_trials_glioma_all_columns.csv


# Astrocytoma

In [None]:
import requests
import pandas as pd
import time

# Safely extract a text block (handles dict / str / None)
def get_text_block_safe(section):
    """Return the text carried by *section*, or "" when it carries none.

    A dict yields its ``"textBlock"`` entry (``""`` if the key is absent),
    a str is returned unchanged, and any other value becomes ``""``.
    """
    if isinstance(section, dict):
        return section.get("textBlock", "")
    if not isinstance(section, str):
        return ""
    return section

# Keyword-based tag generator
def get_tags(desc_mod, eligibility_mod):
    """Derive a "; "-joined string of topic tags for one study.

    Args:
        desc_mod: The study's ``descriptionModule`` dict; its ``briefSummary``
            and ``detailedDescription`` entries are scanned for keywords.
        eligibility_mod: The study's ``eligibilityModule`` dict; only
            ``minimumAge`` (e.g. ``"6 Months"``, ``"18 Years"``) is read.

    Returns:
        A string such as ``"imaging; chemotherapy"`` (``""`` when nothing
        matches). Tags always appear in the order imaging, liquid biopsy,
        surgery, chemotherapy, immunotherapy, pediatric.
    """
    import re  # local import keeps this notebook cell self-contained

    text = (get_text_block_safe(desc_mod.get("briefSummary", "")) or "") + " "
    text += get_text_block_safe(desc_mod.get("detailedDescription", "")) or ""
    text = text.lower()

    # "pet" and "spect" are matched as whole words only: as bare substrings
    # they also fire on "competitive", "prospective", "respectively", ...
    # ("mri" stays a substring match so "fmri" is still covered.)
    keyword_patterns = [
        ("imaging", r"mri|imaging|\b(?:pet|spect)\b"),
        ("liquid biopsy", r"liquid biopsy|ctdna|cfdna"),
        ("surgery", r"resection|post-surgery|surgical|postoperative"),
        ("chemotherapy", r"chemotherapy|temozolomide|tmz"),
        ("immunotherapy", r"immunotherapy|checkpoint|anti-pd|nivolumab|pembrolizumab"),
    ]
    tags = [tag for tag, pattern in keyword_patterns if re.search(pattern, text)]

    # Parse the minimum age numerically; a substring test for "0" would also
    # flag "20 Years", "30 Years", etc. as pediatric.
    min_age = str(eligibility_mod.get("minimumAge") or "").lower()
    age_match = re.match(r"\s*(\d+)\s*(year|month|week|day|hour|minute)", min_age)
    units_per_year = {
        "year": 1, "month": 12, "week": 52,
        "day": 365, "hour": 8760, "minute": 525600,
    }
    # A study counts as pediatric when it admits participants under 18 years.
    admits_minors = bool(age_match) and (
        int(age_match.group(1)) / units_per_year[age_match.group(2)] < 18
    )
    if re.search(r"pediatric|child", text) or admits_minors:
        tags.append("pediatric")

    return "; ".join(tags)

# Main routine
def fetch_all_glioma_trials_full():
    """Download every ClinicalTrials.gov study matching "astrocytoma" into a CSV.

    Pages through the v2 ``/studies`` endpoint (1,000 studies per request,
    stopping at 10,000 studies or when the response carries no
    ``nextPageToken``), flattens each study's ``protocolSection`` into one
    row, and writes the rows to
    ``clinical_trials_Astrocytoma_all_columns.csv``.

    A failed request or a non-200 response stops paging early; whatever was
    collected up to that point is still written. Returns ``None``.
    """
    print("🔍 Fetching up to 10,000 glioma Astrocytoma trials from ClinicalTrials.gov v2 API...")

    base_url = "https://clinicaltrials.gov/api/v2/studies"
    # One variable feeds both the write and the final message, so the file
    # actually written is always the file that gets reported.
    output_path = "clinical_trials_Astrocytoma_all_columns.csv"
    page_size = 1000
    max_records = 10000
    next_token = None
    all_data = []
    total_fetched = 0

    # Generic safe list joiner (defined once, not once per study)
    def safe_join(lst, key=None):
        """Join list items with "; ", reading ``key`` from dict items and skipping blanks."""
        if not isinstance(lst, list):
            return ""
        values = (item.get(key) if isinstance(item, dict) else item for item in lst)
        return "; ".join(str(v) for v in values if v not in (None, ""))

    while True:
        params = {
            "query.cond": "astrocytoma",
            "pageSize": page_size,
            "format": "json",
        }
        if next_token:
            params["pageToken"] = next_token

        try:
            # A timeout keeps a stalled connection from hanging the run forever.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("❌ Request failed:", exc)
            break
        if response.status_code != 200:
            print("❌ API error:", response.status_code)
            break

        json_data = response.json()
        studies = json_data.get("studies", [])
        next_token = json_data.get("nextPageToken")

        for s in studies:
            prot = s.get("protocolSection", {})

            id_mod = prot.get("identificationModule", {})
            status_mod = prot.get("statusModule", {})
            design_mod = prot.get("designModule", {})
            desc_mod = prot.get("descriptionModule", {})
            sponsor_mod = prot.get("sponsorCollaboratorsModule", {})
            conditions_mod = prot.get("conditionsModule", {})
            intervention_mod = prot.get("armsInterventionsModule", {})
            eligibility_mod = prot.get("eligibilityModule", {})
            contacts_mod = prot.get("contactsLocationsModule", {})

            # Locations: v2 entries name the site under "facility" and the
            # country under "country"; the legacy-style keys are still
            # honoured if a response happens to carry them.
            locations = contacts_mod.get("locations", [])
            sites = [loc for loc in locations if isinstance(loc, dict)] if isinstance(locations, list) else []
            facilities = [loc.get("facility") or loc.get("locationFacility") for loc in sites]
            # dict.fromkeys de-duplicates while keeping first-seen order.
            countries = contacts_mod.get("locationCountries") or list(
                dict.fromkeys(loc.get("country") for loc in sites)
            )

            nct_id = id_mod.get("nctId", "")
            all_data.append({
                "NCT ID": nct_id,
                "Title": id_mod.get("officialTitle", "") or id_mod.get("briefTitle", ""),
                "Status": status_mod.get("overallStatus", ""),
                # The v2 API reports phases as a list under "phases".
                "Phase": safe_join(design_mod.get("phases", [])),
                "Study Type": design_mod.get("studyType", ""),
                "Start Date": status_mod.get("startDateStruct", {}).get("date", ""),
                "Completion Date": status_mod.get("completionDateStruct", {}).get("date", ""),
                "Primary Completion Date": status_mod.get("primaryCompletionDateStruct", {}).get("date", ""),
                "Sponsor": sponsor_mod.get("leadSponsor", {}).get("name", ""),
                "Collaborators": safe_join(sponsor_mod.get("collaborators", []), "name"),
                "Conditions": safe_join(conditions_mod.get("conditions", [])),
                "Interventions": safe_join(intervention_mod.get("interventions", []), "name"),
                "Minimum Age": eligibility_mod.get("minimumAge", ""),
                "Maximum Age": eligibility_mod.get("maximumAge", ""),
                "Gender": eligibility_mod.get("sex", ""),
                "Brief Summary": get_text_block_safe(desc_mod.get("briefSummary", "")),
                "Detailed Description": get_text_block_safe(desc_mod.get("detailedDescription", "")),
                "Locations": safe_join(facilities),
                "Countries": safe_join(countries),
                "Study URL": f"https://clinicaltrials.gov/study/{nct_id}",
                "Tags": get_tags(desc_mod, eligibility_mod),
            })

        total_fetched += len(studies)
        print(f"📄 Retrieved {len(studies)} (Total: {total_fetched})")

        if not next_token or total_fetched >= max_records:
            break

        time.sleep(0.2)  # brief pause between page requests

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Done! Saved {len(df)} trials to {output_path}")

# Run the download only when this cell/script is executed directly.
if __name__ == "__main__":
    fetch_all_glioma_trials_full()


🔍 Fetching up to 10,000 glioma Astrocytoma trials from ClinicalTrials.gov v2 API...
📄 Retrieved 1000 (Total: 1000)
📄 Retrieved 1000 (Total: 2000)
📄 Retrieved 161 (Total: 2161)
✅ Done! Saved 2161 trials to clinical_trials_Astrocytoma_all_columns.csv


# Ependymoma

In [None]:
import requests
import pandas as pd
import time

# Safely extract a text block (handles dict / str / None)
def get_text_block_safe(section):
    """Return the text carried by *section*, or "" when it carries none.

    A str is returned unchanged, a dict yields its ``"textBlock"`` entry
    (``""`` if the key is absent), and any other value becomes ``""``.
    """
    if isinstance(section, str):
        return section
    if isinstance(section, dict):
        return section.get("textBlock", "")
    return ""

# Keyword-based tag generator
def get_tags(desc_mod, eligibility_mod):
    """Derive a "; "-joined string of topic tags for one study.

    Args:
        desc_mod: The study's ``descriptionModule`` dict; its ``briefSummary``
            and ``detailedDescription`` entries are scanned for keywords.
        eligibility_mod: The study's ``eligibilityModule`` dict; only
            ``minimumAge`` (e.g. ``"6 Months"``, ``"18 Years"``) is read.

    Returns:
        A string such as ``"imaging; chemotherapy"`` (``""`` when nothing
        matches). Tags always appear in the order imaging, liquid biopsy,
        surgery, chemotherapy, immunotherapy, pediatric.
    """
    import re  # local import keeps this notebook cell self-contained

    text = (get_text_block_safe(desc_mod.get("briefSummary", "")) or "") + " "
    text += get_text_block_safe(desc_mod.get("detailedDescription", "")) or ""
    text = text.lower()

    # "pet" and "spect" are matched as whole words only: as bare substrings
    # they also fire on "competitive", "prospective", "respectively", ...
    # ("mri" stays a substring match so "fmri" is still covered.)
    keyword_patterns = [
        ("imaging", r"mri|imaging|\b(?:pet|spect)\b"),
        ("liquid biopsy", r"liquid biopsy|ctdna|cfdna"),
        ("surgery", r"resection|post-surgery|surgical|postoperative"),
        ("chemotherapy", r"chemotherapy|temozolomide|tmz"),
        ("immunotherapy", r"immunotherapy|checkpoint|anti-pd|nivolumab|pembrolizumab"),
    ]
    tags = [tag for tag, pattern in keyword_patterns if re.search(pattern, text)]

    # Parse the minimum age numerically; a substring test for "0" would also
    # flag "20 Years", "30 Years", etc. as pediatric.
    min_age = str(eligibility_mod.get("minimumAge") or "").lower()
    age_match = re.match(r"\s*(\d+)\s*(year|month|week|day|hour|minute)", min_age)
    units_per_year = {
        "year": 1, "month": 12, "week": 52,
        "day": 365, "hour": 8760, "minute": 525600,
    }
    # A study counts as pediatric when it admits participants under 18 years.
    admits_minors = bool(age_match) and (
        int(age_match.group(1)) / units_per_year[age_match.group(2)] < 18
    )
    if re.search(r"pediatric|child", text) or admits_minors:
        tags.append("pediatric")

    return "; ".join(tags)

# Main routine
def fetch_all_glioma_trials_full():
    """Download every ClinicalTrials.gov study matching "Ependymoma" into a CSV.

    Pages through the v2 ``/studies`` endpoint (1,000 studies per request,
    stopping at 10,000 studies or when the response carries no
    ``nextPageToken``), flattens each study's ``protocolSection`` into one
    row, and writes the rows to
    ``clinical_trials_Ependymoma_all_columns.csv``.

    A failed request or a non-200 response stops paging early; whatever was
    collected up to that point is still written. Returns ``None``.
    """
    print("🔍 Fetching up to 10,000 glioma Ependymoma trials from ClinicalTrials.gov v2 API...")

    base_url = "https://clinicaltrials.gov/api/v2/studies"
    # One variable feeds both the write and the final message, so the file
    # actually written is always the file that gets reported.
    output_path = "clinical_trials_Ependymoma_all_columns.csv"
    page_size = 1000
    max_records = 10000
    next_token = None
    all_data = []
    total_fetched = 0

    # Generic safe list joiner (defined once, not once per study)
    def safe_join(lst, key=None):
        """Join list items with "; ", reading ``key`` from dict items and skipping blanks."""
        if not isinstance(lst, list):
            return ""
        values = (item.get(key) if isinstance(item, dict) else item for item in lst)
        return "; ".join(str(v) for v in values if v not in (None, ""))

    while True:
        params = {
            "query.cond": "Ependymoma",
            "pageSize": page_size,
            "format": "json",
        }
        if next_token:
            params["pageToken"] = next_token

        try:
            # A timeout keeps a stalled connection from hanging the run forever.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("❌ Request failed:", exc)
            break
        if response.status_code != 200:
            print("❌ API error:", response.status_code)
            break

        json_data = response.json()
        studies = json_data.get("studies", [])
        next_token = json_data.get("nextPageToken")

        for s in studies:
            prot = s.get("protocolSection", {})

            id_mod = prot.get("identificationModule", {})
            status_mod = prot.get("statusModule", {})
            design_mod = prot.get("designModule", {})
            desc_mod = prot.get("descriptionModule", {})
            sponsor_mod = prot.get("sponsorCollaboratorsModule", {})
            conditions_mod = prot.get("conditionsModule", {})
            intervention_mod = prot.get("armsInterventionsModule", {})
            eligibility_mod = prot.get("eligibilityModule", {})
            contacts_mod = prot.get("contactsLocationsModule", {})

            # Locations: v2 entries name the site under "facility" and the
            # country under "country"; the legacy-style keys are still
            # honoured if a response happens to carry them.
            locations = contacts_mod.get("locations", [])
            sites = [loc for loc in locations if isinstance(loc, dict)] if isinstance(locations, list) else []
            facilities = [loc.get("facility") or loc.get("locationFacility") for loc in sites]
            # dict.fromkeys de-duplicates while keeping first-seen order.
            countries = contacts_mod.get("locationCountries") or list(
                dict.fromkeys(loc.get("country") for loc in sites)
            )

            nct_id = id_mod.get("nctId", "")
            all_data.append({
                "NCT ID": nct_id,
                "Title": id_mod.get("officialTitle", "") or id_mod.get("briefTitle", ""),
                "Status": status_mod.get("overallStatus", ""),
                # The v2 API reports phases as a list under "phases".
                "Phase": safe_join(design_mod.get("phases", [])),
                "Study Type": design_mod.get("studyType", ""),
                "Start Date": status_mod.get("startDateStruct", {}).get("date", ""),
                "Completion Date": status_mod.get("completionDateStruct", {}).get("date", ""),
                "Primary Completion Date": status_mod.get("primaryCompletionDateStruct", {}).get("date", ""),
                "Sponsor": sponsor_mod.get("leadSponsor", {}).get("name", ""),
                "Collaborators": safe_join(sponsor_mod.get("collaborators", []), "name"),
                "Conditions": safe_join(conditions_mod.get("conditions", [])),
                "Interventions": safe_join(intervention_mod.get("interventions", []), "name"),
                "Minimum Age": eligibility_mod.get("minimumAge", ""),
                "Maximum Age": eligibility_mod.get("maximumAge", ""),
                "Gender": eligibility_mod.get("sex", ""),
                "Brief Summary": get_text_block_safe(desc_mod.get("briefSummary", "")),
                "Detailed Description": get_text_block_safe(desc_mod.get("detailedDescription", "")),
                "Locations": safe_join(facilities),
                "Countries": safe_join(countries),
                "Study URL": f"https://clinicaltrials.gov/study/{nct_id}",
                "Tags": get_tags(desc_mod, eligibility_mod),
            })

        total_fetched += len(studies)
        print(f"📄 Retrieved {len(studies)} (Total: {total_fetched})")

        if not next_token or total_fetched >= max_records:
            break

        time.sleep(0.2)  # brief pause between page requests

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Done! Saved {len(df)} trials to {output_path}")

# Run the download only when this cell/script is executed directly.
if __name__ == "__main__":
    fetch_all_glioma_trials_full()


🔍 Fetching up to 10,000 glioma Ependymoma trials from ClinicalTrials.gov v2 API...
📄 Retrieved 300 (Total: 300)
✅ Done! Saved 300 trials to clinical_trials_Ependymoma_all_columns.csv


# Glioblastoma

In [None]:
import requests
import pandas as pd
import time

# Safely extract a text block (handles dict / str / None)
def get_text_block_safe(section):
    """Return the text carried by *section*, or "" when it carries none.

    A dict yields its ``"textBlock"`` entry (``""`` if the key is absent),
    a str is returned unchanged, and any other value becomes ``""``.
    """
    if isinstance(section, dict):
        return section.get("textBlock", "")
    if not isinstance(section, str):
        return ""
    return section

# Keyword-based tag generator
def get_tags(desc_mod, eligibility_mod):
    """Derive a "; "-joined string of topic tags for one study.

    Args:
        desc_mod: The study's ``descriptionModule`` dict; its ``briefSummary``
            and ``detailedDescription`` entries are scanned for keywords.
        eligibility_mod: The study's ``eligibilityModule`` dict; only
            ``minimumAge`` (e.g. ``"6 Months"``, ``"18 Years"``) is read.

    Returns:
        A string such as ``"imaging; chemotherapy"`` (``""`` when nothing
        matches). Tags always appear in the order imaging, liquid biopsy,
        surgery, chemotherapy, immunotherapy, pediatric.
    """
    import re  # local import keeps this notebook cell self-contained

    text = (get_text_block_safe(desc_mod.get("briefSummary", "")) or "") + " "
    text += get_text_block_safe(desc_mod.get("detailedDescription", "")) or ""
    text = text.lower()

    # "pet" and "spect" are matched as whole words only: as bare substrings
    # they also fire on "competitive", "prospective", "respectively", ...
    # ("mri" stays a substring match so "fmri" is still covered.)
    keyword_patterns = [
        ("imaging", r"mri|imaging|\b(?:pet|spect)\b"),
        ("liquid biopsy", r"liquid biopsy|ctdna|cfdna"),
        ("surgery", r"resection|post-surgery|surgical|postoperative"),
        ("chemotherapy", r"chemotherapy|temozolomide|tmz"),
        ("immunotherapy", r"immunotherapy|checkpoint|anti-pd|nivolumab|pembrolizumab"),
    ]
    tags = [tag for tag, pattern in keyword_patterns if re.search(pattern, text)]

    # Parse the minimum age numerically; a substring test for "0" would also
    # flag "20 Years", "30 Years", etc. as pediatric.
    min_age = str(eligibility_mod.get("minimumAge") or "").lower()
    age_match = re.match(r"\s*(\d+)\s*(year|month|week|day|hour|minute)", min_age)
    units_per_year = {
        "year": 1, "month": 12, "week": 52,
        "day": 365, "hour": 8760, "minute": 525600,
    }
    # A study counts as pediatric when it admits participants under 18 years.
    admits_minors = bool(age_match) and (
        int(age_match.group(1)) / units_per_year[age_match.group(2)] < 18
    )
    if re.search(r"pediatric|child", text) or admits_minors:
        tags.append("pediatric")

    return "; ".join(tags)

# Main routine
def fetch_all_glioma_trials_full():
    """Download every ClinicalTrials.gov study matching "Glioblastoma" into a CSV.

    Pages through the v2 ``/studies`` endpoint (1,000 studies per request,
    stopping at 10,000 studies or when the response carries no
    ``nextPageToken``), flattens each study's ``protocolSection`` into one
    row, and writes the rows to
    ``clinical_trials_Glioblastoma_all_columns.csv``.

    A failed request or a non-200 response stops paging early; whatever was
    collected up to that point is still written. Returns ``None``.
    """
    print("🔍 Fetching up to 10,000 glioma Glioblastoma trials from ClinicalTrials.gov v2 API...")

    base_url = "https://clinicaltrials.gov/api/v2/studies"
    # One variable feeds both the write and the final message, so the file
    # actually written is always the file that gets reported.
    output_path = "clinical_trials_Glioblastoma_all_columns.csv"
    page_size = 1000
    max_records = 10000
    next_token = None
    all_data = []
    total_fetched = 0

    # Generic safe list joiner (defined once, not once per study)
    def safe_join(lst, key=None):
        """Join list items with "; ", reading ``key`` from dict items and skipping blanks."""
        if not isinstance(lst, list):
            return ""
        values = (item.get(key) if isinstance(item, dict) else item for item in lst)
        return "; ".join(str(v) for v in values if v not in (None, ""))

    while True:
        params = {
            "query.cond": "Glioblastoma",
            "pageSize": page_size,
            "format": "json",
        }
        if next_token:
            params["pageToken"] = next_token

        try:
            # A timeout keeps a stalled connection from hanging the run forever.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("❌ Request failed:", exc)
            break
        if response.status_code != 200:
            print("❌ API error:", response.status_code)
            break

        json_data = response.json()
        studies = json_data.get("studies", [])
        next_token = json_data.get("nextPageToken")

        for s in studies:
            prot = s.get("protocolSection", {})

            id_mod = prot.get("identificationModule", {})
            status_mod = prot.get("statusModule", {})
            design_mod = prot.get("designModule", {})
            desc_mod = prot.get("descriptionModule", {})
            sponsor_mod = prot.get("sponsorCollaboratorsModule", {})
            conditions_mod = prot.get("conditionsModule", {})
            intervention_mod = prot.get("armsInterventionsModule", {})
            eligibility_mod = prot.get("eligibilityModule", {})
            contacts_mod = prot.get("contactsLocationsModule", {})

            # Locations: v2 entries name the site under "facility" and the
            # country under "country"; the legacy-style keys are still
            # honoured if a response happens to carry them.
            locations = contacts_mod.get("locations", [])
            sites = [loc for loc in locations if isinstance(loc, dict)] if isinstance(locations, list) else []
            facilities = [loc.get("facility") or loc.get("locationFacility") for loc in sites]
            # dict.fromkeys de-duplicates while keeping first-seen order.
            countries = contacts_mod.get("locationCountries") or list(
                dict.fromkeys(loc.get("country") for loc in sites)
            )

            nct_id = id_mod.get("nctId", "")
            all_data.append({
                "NCT ID": nct_id,
                "Title": id_mod.get("officialTitle", "") or id_mod.get("briefTitle", ""),
                "Status": status_mod.get("overallStatus", ""),
                # Flatten the phase list like every other multi-valued
                # column, instead of writing a Python list repr to the CSV.
                "Phases": safe_join(design_mod.get("phases", [])),
                "Study Type": design_mod.get("studyType", ""),
                "Start Date": status_mod.get("startDateStruct", {}).get("date", ""),
                "Completion Date": status_mod.get("completionDateStruct", {}).get("date", ""),
                "Primary Completion Date": status_mod.get("primaryCompletionDateStruct", {}).get("date", ""),
                "Sponsor": sponsor_mod.get("leadSponsor", {}).get("name", ""),
                "Collaborators": safe_join(sponsor_mod.get("collaborators", []), "name"),
                "Conditions": safe_join(conditions_mod.get("conditions", [])),
                "Interventions": safe_join(intervention_mod.get("interventions", []), "name"),
                "Minimum Age": eligibility_mod.get("minimumAge", ""),
                "Maximum Age": eligibility_mod.get("maximumAge", ""),
                "Gender": eligibility_mod.get("sex", ""),
                "Brief Summary": get_text_block_safe(desc_mod.get("briefSummary", "")),
                "Detailed Description": get_text_block_safe(desc_mod.get("detailedDescription", "")),
                "Locations": safe_join(facilities),
                "Countries": safe_join(countries),
                "Study URL": f"https://clinicaltrials.gov/study/{nct_id}",
                "Tags": get_tags(desc_mod, eligibility_mod),
            })

        total_fetched += len(studies)
        print(f"📄 Retrieved {len(studies)} (Total: {total_fetched})")

        if not next_token or total_fetched >= max_records:
            break

        time.sleep(0.2)  # brief pause between page requests

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Done! Saved {len(df)} trials to {output_path}")

# Run the download only when this cell/script is executed directly.
if __name__ == "__main__":
    fetch_all_glioma_trials_full()


🔍 Fetching up to 10,000 glioma Glioblastoma trials from ClinicalTrials.gov v2 API...
📄 Retrieved 1000 (Total: 1000)
📄 Retrieved 1000 (Total: 2000)
📄 Retrieved 81 (Total: 2081)
✅ Done! Saved 2081 trials to clinical_trials_Glioblastoma_filters.csv


# Oligodendroglioma

In [None]:
import requests
import pandas as pd
import time

# Safely extract a text block (handles dict / str / None)
def get_text_block_safe(section):
    """Return the text carried by *section*, or "" when it carries none.

    A str is returned unchanged, a dict yields its ``"textBlock"`` entry
    (``""`` if the key is absent), and any other value becomes ``""``.
    """
    if isinstance(section, str):
        return section
    if isinstance(section, dict):
        return section.get("textBlock", "")
    return ""

# Keyword-based tag generator
def get_tags(desc_mod, eligibility_mod):
    """Derive a "; "-joined string of topic tags for one study.

    Args:
        desc_mod: The study's ``descriptionModule`` dict; its ``briefSummary``
            and ``detailedDescription`` entries are scanned for keywords.
        eligibility_mod: The study's ``eligibilityModule`` dict; only
            ``minimumAge`` (e.g. ``"6 Months"``, ``"18 Years"``) is read.

    Returns:
        A string such as ``"imaging; chemotherapy"`` (``""`` when nothing
        matches). Tags always appear in the order imaging, liquid biopsy,
        surgery, chemotherapy, immunotherapy, pediatric.
    """
    import re  # local import keeps this notebook cell self-contained

    text = (get_text_block_safe(desc_mod.get("briefSummary", "")) or "") + " "
    text += get_text_block_safe(desc_mod.get("detailedDescription", "")) or ""
    text = text.lower()

    # "pet" and "spect" are matched as whole words only: as bare substrings
    # they also fire on "competitive", "prospective", "respectively", ...
    # ("mri" stays a substring match so "fmri" is still covered.)
    keyword_patterns = [
        ("imaging", r"mri|imaging|\b(?:pet|spect)\b"),
        ("liquid biopsy", r"liquid biopsy|ctdna|cfdna"),
        ("surgery", r"resection|post-surgery|surgical|postoperative"),
        ("chemotherapy", r"chemotherapy|temozolomide|tmz"),
        ("immunotherapy", r"immunotherapy|checkpoint|anti-pd|nivolumab|pembrolizumab"),
    ]
    tags = [tag for tag, pattern in keyword_patterns if re.search(pattern, text)]

    # Parse the minimum age numerically; a substring test for "0" would also
    # flag "20 Years", "30 Years", etc. as pediatric.
    min_age = str(eligibility_mod.get("minimumAge") or "").lower()
    age_match = re.match(r"\s*(\d+)\s*(year|month|week|day|hour|minute)", min_age)
    units_per_year = {
        "year": 1, "month": 12, "week": 52,
        "day": 365, "hour": 8760, "minute": 525600,
    }
    # A study counts as pediatric when it admits participants under 18 years.
    admits_minors = bool(age_match) and (
        int(age_match.group(1)) / units_per_year[age_match.group(2)] < 18
    )
    if re.search(r"pediatric|child", text) or admits_minors:
        tags.append("pediatric")

    return "; ".join(tags)

# Main routine
def fetch_all_glioma_trials_full():
    """Download every ClinicalTrials.gov study matching "Oligodendroglioma" into a CSV.

    Pages through the v2 ``/studies`` endpoint (1,000 studies per request,
    stopping at 10,000 studies or when the response carries no
    ``nextPageToken``), flattens each study's ``protocolSection`` into one
    row, and writes the rows to
    ``clinical_trials_Oligodendroglioma_all_columns.csv``.

    A failed request or a non-200 response stops paging early; whatever was
    collected up to that point is still written. Returns ``None``.
    """
    print("🔍 Fetching up to 10,000 glioma Oligodendroglioma trials from ClinicalTrials.gov v2 API...")

    base_url = "https://clinicaltrials.gov/api/v2/studies"
    # One variable feeds both the write and the final message, so the file
    # actually written is always the file that gets reported.
    output_path = "clinical_trials_Oligodendroglioma_all_columns.csv"
    page_size = 1000
    max_records = 10000
    next_token = None
    all_data = []
    total_fetched = 0

    # Generic safe list joiner (defined once, not once per study)
    def safe_join(lst, key=None):
        """Join list items with "; ", reading ``key`` from dict items and skipping blanks."""
        if not isinstance(lst, list):
            return ""
        values = (item.get(key) if isinstance(item, dict) else item for item in lst)
        return "; ".join(str(v) for v in values if v not in (None, ""))

    while True:
        params = {
            "query.cond": "Oligodendroglioma",
            "pageSize": page_size,
            "format": "json",
        }
        if next_token:
            params["pageToken"] = next_token

        try:
            # A timeout keeps a stalled connection from hanging the run forever.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("❌ Request failed:", exc)
            break
        if response.status_code != 200:
            print("❌ API error:", response.status_code)
            break

        json_data = response.json()
        studies = json_data.get("studies", [])
        next_token = json_data.get("nextPageToken")

        for s in studies:
            prot = s.get("protocolSection", {})

            id_mod = prot.get("identificationModule", {})
            status_mod = prot.get("statusModule", {})
            design_mod = prot.get("designModule", {})
            desc_mod = prot.get("descriptionModule", {})
            sponsor_mod = prot.get("sponsorCollaboratorsModule", {})
            conditions_mod = prot.get("conditionsModule", {})
            intervention_mod = prot.get("armsInterventionsModule", {})
            eligibility_mod = prot.get("eligibilityModule", {})
            contacts_mod = prot.get("contactsLocationsModule", {})

            # Locations: v2 entries name the site under "facility" and the
            # country under "country"; the legacy-style keys are still
            # honoured if a response happens to carry them.
            locations = contacts_mod.get("locations", [])
            sites = [loc for loc in locations if isinstance(loc, dict)] if isinstance(locations, list) else []
            facilities = [loc.get("facility") or loc.get("locationFacility") for loc in sites]
            # dict.fromkeys de-duplicates while keeping first-seen order.
            countries = contacts_mod.get("locationCountries") or list(
                dict.fromkeys(loc.get("country") for loc in sites)
            )

            nct_id = id_mod.get("nctId", "")
            all_data.append({
                "NCT ID": nct_id,
                "Title": id_mod.get("officialTitle", "") or id_mod.get("briefTitle", ""),
                "Status": status_mod.get("overallStatus", ""),
                # The v2 API reports phases as a list under "phases".
                "Phase": safe_join(design_mod.get("phases", [])),
                "Study Type": design_mod.get("studyType", ""),
                "Start Date": status_mod.get("startDateStruct", {}).get("date", ""),
                "Completion Date": status_mod.get("completionDateStruct", {}).get("date", ""),
                "Primary Completion Date": status_mod.get("primaryCompletionDateStruct", {}).get("date", ""),
                "Sponsor": sponsor_mod.get("leadSponsor", {}).get("name", ""),
                "Collaborators": safe_join(sponsor_mod.get("collaborators", []), "name"),
                "Conditions": safe_join(conditions_mod.get("conditions", [])),
                "Interventions": safe_join(intervention_mod.get("interventions", []), "name"),
                "Minimum Age": eligibility_mod.get("minimumAge", ""),
                "Maximum Age": eligibility_mod.get("maximumAge", ""),
                "Gender": eligibility_mod.get("sex", ""),
                "Brief Summary": get_text_block_safe(desc_mod.get("briefSummary", "")),
                "Detailed Description": get_text_block_safe(desc_mod.get("detailedDescription", "")),
                "Locations": safe_join(facilities),
                "Countries": safe_join(countries),
                "Study URL": f"https://clinicaltrials.gov/study/{nct_id}",
                "Tags": get_tags(desc_mod, eligibility_mod),
            })

        total_fetched += len(studies)
        print(f"📄 Retrieved {len(studies)} (Total: {total_fetched})")

        if not next_token or total_fetched >= max_records:
            break

        time.sleep(0.2)  # brief pause between page requests

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Done! Saved {len(df)} trials to {output_path}")

# Run the download only when this cell/script is executed directly.
if __name__ == "__main__":
    fetch_all_glioma_trials_full()


🔍 Fetching up to 10,000 glioma Oligodendroglioma trials from ClinicalTrials.gov v2 API...
📄 Retrieved 408 (Total: 408)
✅ Done! Saved 408 trials to clinical_trials_Oligodendroglioma_all_columns.csv


# scraper with filter


In [None]:
import requests
import pandas as pd
import time

# Safely extract a text block
def get_text_block_safe(section):
    """Return the text carried by *section*, or "" when it carries none.

    A dict yields its ``"textBlock"`` entry (``""`` if the key is absent),
    a str is returned unchanged, and any other value becomes ``""``.
    """
    if isinstance(section, dict):
        return section.get("textBlock", "")
    if not isinstance(section, str):
        return ""
    return section

# Tag generator
def get_tags(desc_mod, eligibility_mod):
    """Derive a "; "-joined string of topic tags for one study.

    Args:
        desc_mod: The study's ``descriptionModule`` dict; its ``briefSummary``
            and ``detailedDescription`` entries are scanned for keywords.
        eligibility_mod: The study's ``eligibilityModule`` dict; only
            ``minimumAge`` (e.g. ``"6 Months"``, ``"18 Years"``) is read.

    Returns:
        A string such as ``"imaging; chemotherapy"`` (``""`` when nothing
        matches). Tags always appear in the order imaging, liquid biopsy,
        surgery, chemotherapy, immunotherapy, pediatric.
    """
    import re  # local import keeps this notebook cell self-contained

    text = (get_text_block_safe(desc_mod.get("briefSummary", "")) or "") + " "
    text += get_text_block_safe(desc_mod.get("detailedDescription", "")) or ""
    text = text.lower()

    # "pet" and "spect" are matched as whole words only: as bare substrings
    # they also fire on "competitive", "prospective", "respectively", ...
    # ("mri" stays a substring match so "fmri" is still covered.)
    keyword_patterns = [
        ("imaging", r"mri|imaging|\b(?:pet|spect)\b"),
        ("liquid biopsy", r"liquid biopsy|ctdna|cfdna"),
        ("surgery", r"resection|post-surgery|surgical|postoperative"),
        ("chemotherapy", r"chemotherapy|temozolomide|tmz"),
        ("immunotherapy", r"immunotherapy|checkpoint|anti-pd|nivolumab|pembrolizumab"),
    ]
    tags = [tag for tag, pattern in keyword_patterns if re.search(pattern, text)]

    # Parse the minimum age numerically; a substring test for "0" would also
    # flag "20 Years", "30 Years", etc. as pediatric.
    min_age = str(eligibility_mod.get("minimumAge") or "").lower()
    age_match = re.match(r"\s*(\d+)\s*(year|month|week|day|hour|minute)", min_age)
    units_per_year = {
        "year": 1, "month": 12, "week": 52,
        "day": 365, "hour": 8760, "minute": 525600,
    }
    # A study counts as pediatric when it admits participants under 18 years.
    admits_minors = bool(age_match) and (
        int(age_match.group(1)) / units_per_year[age_match.group(2)] < 18
    )
    if re.search(r"pediatric|child", text) or admits_minors:
        tags.append("pediatric")

    return "; ".join(tags)

# Main entry point: fetch studies and keep only the ones that pass the filter.
# Overall statuses (lower-cased) that exclude a study from the output.
_EXCLUDED_STATUSES = frozenset({
    "terminated", "withdrawn", "unknown",
    "temporarily_not_available", "suspended", "no_longer_available",
})


def _join_values(items, key=None):
    """Join list items into a "; "-separated string, skipping empty values.

    For dict items the value stored under ``key`` is used; other items are
    used as-is. Anything that is not a list yields ``""``.
    """
    if not isinstance(items, list):
        return ""
    values = []
    for item in items:
        value = item.get(key) if isinstance(item, dict) else item
        if value not in (None, ""):
            values.append(str(value))
    return "; ".join(values)


def _study_to_row(study):
    """Convert one study record to a CSV row dict, or None if filtered out.

    A study is kept only when its study type is "interventional" and its
    overall status is not one of ``_EXCLUDED_STATUSES`` (case-insensitive).
    """
    prot = study.get("protocolSection", {})

    id_mod = prot.get("identificationModule", {})
    status_mod = prot.get("statusModule", {})
    design_mod = prot.get("designModule", {})
    desc_mod = prot.get("descriptionModule", {})
    sponsor_mod = prot.get("sponsorCollaboratorsModule", {})
    conditions_mod = prot.get("conditionsModule", {})
    intervention_mod = prot.get("armsInterventionsModule", {})
    eligibility_mod = prot.get("eligibilityModule", {})
    contacts_mod = prot.get("contactsLocationsModule", {})

    # Filter criteria.
    study_type = str(design_mod.get("studyType", "")).lower()
    status = str(status_mod.get("overallStatus", "")).lower()
    if study_type != "interventional" or status in _EXCLUDED_STATUSES:
        return None

    locations = contacts_mod.get("locations", [])
    if not isinstance(locations, list):
        locations = []
    # Each v2 location entry carries its own "facility" and "country";
    # dict.fromkeys de-duplicates countries while keeping first-seen order.
    countries = list(dict.fromkeys(
        loc["country"]
        for loc in locations
        if isinstance(loc, dict) and loc.get("country")
    ))

    nct_id = id_mod.get("nctId", "")
    return {
        "NCT ID": nct_id,
        "Title": id_mod.get("officialTitle", "") or id_mod.get("briefTitle", ""),
        "Status": status_mod.get("overallStatus", ""),
        # "phases" is a list; join it like the other list columns instead of
        # letting a Python list repr end up in the CSV.
        "Phases": _join_values(design_mod.get("phases", [])),
        "Study Type": design_mod.get("studyType", ""),
        "Start Date": status_mod.get("startDateStruct", {}).get("date", ""),
        "Completion Date": status_mod.get("completionDateStruct", {}).get("date", ""),
        "Primary Completion Date": status_mod.get("primaryCompletionDateStruct", {}).get("date", ""),
        "Sponsor": sponsor_mod.get("leadSponsor", {}).get("name", ""),
        "Collaborators": _join_values(sponsor_mod.get("collaborators", []), "name"),
        "Conditions": _join_values(conditions_mod.get("conditions", [])),
        "Interventions": _join_values(intervention_mod.get("interventions", []), "name"),
        "Minimum Age": eligibility_mod.get("minimumAge", ""),
        "Maximum Age": eligibility_mod.get("maximumAge", ""),
        "Gender": eligibility_mod.get("sex", ""),
        "Brief Summary": get_text_block_safe(desc_mod.get("briefSummary", "")),
        "Detailed Description": get_text_block_safe(desc_mod.get("detailedDescription", "")),
        "Locations": _join_values(locations, "facility"),
        "Countries": "; ".join(countries),
        "Study URL": f"https://clinicaltrials.gov/study/{nct_id}",
        "Tags": get_tags(desc_mod, eligibility_mod),
    }


def fetch_filtered_trials(condition="Glioblastoma", output_path=None, max_records=10000):
    """Download studies for a condition, filter them, and save them as CSV.

    Pages through the ClinicalTrials.gov v2 ``/studies`` endpoint, keeps
    interventional studies whose overall status is not in
    ``_EXCLUDED_STATUSES``, and writes one row per kept study.

    Args:
        condition: Value sent as the ``query.cond`` search parameter.
        output_path: Destination CSV file. Defaults to
            ``clinical_trials_<condition>_filtered.csv``.
        max_records: Stop paging once at least this many studies have been
            retrieved (counted before filtering).

    A failed request, a non-200 response or an undecodable body stops the
    paging; whatever was collected up to that point is still written out.
    """
    if output_path is None:
        output_path = f"clinical_trials_{condition}_filtered.csv"

    print(f"🔍 Fetching {condition} interventional trials with valid status...")

    base_url = "https://clinicaltrials.gov/api/v2/studies"
    page_size = 1000
    next_token = None
    all_data = []
    total_fetched = 0

    while True:
        params = {
            "query.cond": condition,
            "pageSize": page_size,
            "format": "json",
        }
        if next_token:
            params["pageToken"] = next_token

        try:
            # Without a timeout a stalled connection would hang forever.
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print("❌ Request failed:", exc)
            break
        if response.status_code != 200:
            print("❌ API error:", response.status_code)
            break

        try:
            json_data = response.json()
        except ValueError as exc:  # body was not valid JSON
            print("❌ Invalid JSON in response:", exc)
            break

        studies = json_data.get("studies", [])
        next_token = json_data.get("nextPageToken", None)

        for study in studies:
            row = _study_to_row(study)
            if row is not None:
                all_data.append(row)

        total_fetched += len(studies)
        print(f"📄 Retrieved {len(studies)} (Total so far: {total_fetched})")

        if not next_token or total_fetched >= max_records:
            break

        time.sleep(0.2)  # brief pause between page requests

    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"✅ Done! Saved {len(df)} trials to {output_path}")

# Entry point: run the filtered scraper when this code is executed directly
# (the guard is skipped when the module is merely imported).
if __name__ == "__main__":
    fetch_filtered_trials()


🔍 Fetching glioma interventional trials with valid status...
📄 Retrieved 1000 (Total so far: 1000)
📄 Retrieved 1000 (Total so far: 2000)
📄 Retrieved 81 (Total so far: 2081)
✅ Done! Saved 1366 trials to clinical_trials_Glioblastoma_filtered.csv
