In [100]:
import requests
import pandas as pd
import re
import time
import math

""" === TrialFetcher Class ==="""
# Handles querying the ClinicalTrials.gov API and fetching raw study data
class TrialFetcher:
    """
    Downloads study records from the ClinicalTrials.gov v2 ``/studies`` endpoint.

    Parameters
    ----------
    expr : str
        Search expression, sent as the ``query.term`` parameter.
    status : str
        Sent as the ``filter.overallStatus`` parameter.
    timeout : float
        Seconds to wait for each HTTP response. Without a timeout a stalled
        connection would block ``fetch_all`` indefinitely.
    """

    def __init__(self, expr="Neurofibromatosis", status="ACTIVE_NOT_RECRUITING", timeout=30):
        self.base_url = "https://clinicaltrials.gov/api/v2/studies"
        self.expr = expr
        self.status = status
        self.timeout = timeout

    def fetch_all(self):
        """
        Fetches all studies that match the query and status,
        handling pagination with pageToken.

        Returns
        -------
        list
            The items of every page's ``"studies"`` list, in page order.
            If a request fails or its body is not valid JSON, the error is
            printed and the studies collected so far are returned.
        """
        results = []
        page_token = None
        while True:
            # Build query parameters
            params = {
                "query.term": self.expr,
                "filter.overallStatus": self.status,
                "pageSize": 100
            }
            if page_token:
                params["pageToken"] = page_token

            # Send request to API
            try:
                r = requests.get(self.base_url, params=params, timeout=self.timeout)
                r.raise_for_status()
                data = r.json()
            except (requests.RequestException, ValueError) as e:
                # ValueError covers JSON decoding errors on older requests versions.
                print("Request failed:", e)
                break

            # Extract study records (tolerate a missing or null "studies" field)
            results.extend(data.get("studies") or [])

            # Check if more pages exist
            page_token = data.get("nextPageToken")
            if not page_token:
                break

            # Sleep to avoid hitting rate limits
            time.sleep(0.3)

        return results


"""=== TrialParser Class ==="""
# Parses a single study into a structured dictionary of fields
class TrialParser:
    def __init__(self, study):
        self.study = study
        self.parsed = {}

    def parse(self):
       # Master method to run all parsing steps on a study. Returns a dictionary of extracted fields.
        
        self._basic_info()
        self._principal_investigator()
        self._age()
        self._gender()
        self._eligibility()
        self._pregnancy()
        self._race()
        self._conditions()
        self._family()
        self._medications()
        self._drugs()
        self._surgery()
        self._comorbidities()
        return self.parsed

    def _basic_info(self):
        # Extracts identifiers, title, status, sponsor institution, and builds a ClinicalTrials.gov study URL.
        prot = self.study.get("protocolSection", {})
        ids = prot.get("identificationModule", {})
        status = prot.get("statusModule", {})
        sponsor = prot.get("sponsorCollaboratorsModule", {})
        nct_id = ids.get("nctId")

        self.parsed.update({
            "nct_id": ids.get("nctId"),
            "title": ids.get("briefTitle"),
            "status": status.get("overallStatus"),
            "institution": sponsor.get("leadSponsor", {}).get("name"),
            "url": f"https://clinicaltrials.gov/study/{nct_id}" if nct_id else None
        })

    def _principal_investigator(self):
        # Extracts the first listed Principal Investigator (PI), or returns NA if none available.
        contacts = self.study.get("protocolSection", {}).get("contactsLocationsModule", {})
        officials = contacts.get("overallOfficials", [])
        
        if officials:
            pi = officials[0].get("name", "NA")
        else:
            pi = "NA"
        
        self.parsed["Principal_investigator"] = pi

    def _age(self):
        # Extracts min/max age and creates a simplified, numeric age range string.
        # Converts all units to years and formats like: >18, 1–30, ≤21, or N/A
        # Rounds up to the nearest whole year.
        
        elig = self.study.get("protocolSection", {}).get("eligibilityModule", {})
        min_age = elig.get("minimumAge")
        max_age = elig.get("maximumAge")
    
        def to_years(age_str):
            if not age_str:
                return None
            age_str = age_str.lower().strip()
            match = re.search(r"(\d+)", age_str)
            if not match:
                return None
            num = int(match.group(1))
            if "year" in age_str:
                return num
            if "month" in age_str:
                return math.ceil(num / 12)
            if "week" in age_str:
                return math.ceil(num / 52)
            if "day" in age_str:
                return math.ceil(num / 365)
            return None
    
        min_val = to_years(min_age)
        max_val = to_years(max_age)
    
        # Build simplified label
        if min_val is None and max_val is None:
            age_range = "NA"
        elif min_val is not None and max_val is not None:
            age_range = f"{min_val}–{max_val}"
        elif min_val is not None:
            age_range = f">{min_val}"
        else:
            age_range = f"≤{max_val}"
    
        self.parsed.update({
            "Min_age": min_val,
            "Max_age": max_val,
            "Age_range": age_range,
        })

    def _gender(self):
        # Extracts sex eligibility and maps it into binary columns (Male/Female), while preserving the raw value returned by API.
        elig = self.study.get("protocolSection", {}).get("eligibilityModule", {})
        gender_raw = elig.get("sex")
        self.parsed["Gender_raw"] = gender_raw
    
        gender = (gender_raw or "").lower()
    
        if gender == "male":
            self.parsed.update({"Male": True, "Female": False, "Prefer_not_to_say": False})
        elif gender == "female":
            self.parsed.update({"Male": False, "Female": True, "Prefer_not_to_say": False})
        elif gender == "all":
            self.parsed.update({"Male": True, "Female": True, "Prefer_not_to_say": False})
        else:
            self.parsed.update({"Male": None, "Female": None, "Prefer_not_to_say": None})

    def _eligibility(self):
        # Extracts raw eligibility criteria text.
        elig = self.study.get("protocolSection", {}).get("eligibilityModule", {})
        text = (elig.get("eligibilityCriteria") or "").lower()
        self.parsed["eligibility_raw"] = text

    def _pregnancy(self):
        """ 
        Determines whether pregnancy is mentioned in the eligibility criteria and whether pregnant participants are included or excluded.
            Logic:
            - If "pregnan" (e.g., 'pregnant', 'pregnancy') is not found → no mention.
            - If any exclusion phrase appears (e.g., "not pregnant", "pregnant women are excluded") → excluded.
            - Otherwise → included.
            Outputs:
            - Pregnancy_yes: True if included, False if excluded, None if not mentioned.
            - Pregnancy_no:  True if excluded, False if included, None if not mentioned.
            - Pregnancy_reason: "included", "excluded", or "no mention".
        """

        txt = (self.parsed.get("eligibility_raw") or "").lower()

        if "pregnan" not in txt:
            self.parsed.update({
                "Pregnancy_yes": None,
                "Pregnancy_no": None,
                "Pregnancy_reason": "no mention"
            })
            return

        # Exclusion patterns
        exclude_patterns = [
            r"not.*pregnan",
            r"exclude.*pregnan",
            r"prohibit.*pregnan",
            r"must not.*pregnan",
            r"non[- ]?pregnant",
            r"pregnant women (are )?excluded",
            r"pregnancy.*(excluded|not allowed|contraindicated)"
        ]

        if any(re.search(pat, txt) for pat in exclude_patterns):
            self.parsed.update({
                "Pregnancy_yes": False,
                "Pregnancy_no": True,
                "Pregnancy_reason": "excluded"
            })
        else:
            self.parsed.update({
                "Pregnancy_yes": True,
                "Pregnancy_no": False,
                "Pregnancy_reason": "included"
            })

    def _race(self):
        """
        Context-aware race extractor:
        - finds short, human-related phrases (e.g. "chinese subjects", "asian patients", "participants of African American descent")
        - avoids biochemical/substring false-positives (e.g. 'cytarabine', 'hamster', 'cell')
        - dedupes and normalizes Race_reason, joined by commas
        - sets race columns to "YES"/"NO" and sets Other correctly
        """ 
        txt = (self.parsed.get("eligibility_raw") or "").lower()

        # Strict race tokens (words only)
        race_tokens = {
            "Asian": r"(asian|chinese|japanese|korean|filipino|vietnamese|thai|indian|pakistani|bangladeshi|indonesian|malaysian)",
            "American_Indian": r"(american indian|native american|alaska native)",
            "Black": r"(black|african american|afro[- ]?caribbean|nigerian|ghanaian|kenyan|ethiopian)",
            "Hispanic": r"(hispanic|latino|latina|latinx|mexican|puerto rican|cuban)",
            "Middle_Eastern": r"(middle eastern|arab(?!ine\b)|north african|mena|egyptian|lebanese|iranian|iraqi|syrian|moroccan|palestinian)",
            # 'White' restricted to explicit human description
            "White": r"(caucasian|\bwhite\b|european)",
            # 'Other' is processed separately, require explicit race sense (not generic 'other')
            "Other": r"(other (race|ethnicity|ethnic|group|minority)|mixed race|multiracial|non[- ]?white|ethnic minority|pacific islander|native hawaiian|maori|aboriginal)"
        }

        # human nouns that indicate a participant/subject mention
        human_nouns = r"(participant|participants|subject|subjects|patient|patients|individual|individuals|person|people|volunteer|volunteers|cohort|population|group)"

        # exclusion context for clear non-human / lab terms / biochemical words
        exclusion_context = re.compile(
            r"(hamster|mouse|rat|cell|cells|tissue|ovary|ovarian|line|model|xenograft|blood|lesion|patch|matter|membrane|plaque|protein|enzyme|gene|sample|cytarabine|arabinoside)",
            flags=re.I
        )

        captured = []          # raw matched short phrases (as found)
        matched_races = {k: False for k in race_tokens.keys() if k != "Other"}
        other_mentioned = False

        # Patterns we try (short phrase forms)
        # 1) "<race_token> <human_noun>"  -> "chinese subjects"
        # 2) "<human_noun> (of|from) <race_token>" -> "participants of asian descent"
        # 3) fallback: "<race_token>" near a human noun within same short chunk (up to 3 words in-between)
        for race, token in race_tokens.items():
            if race == "Other":
                # check other separately
                other_pat = re.compile(rf"\b{token}\b", flags=re.I)
                if other_pat.search(txt):
                    # ensure it's not clearly a false positive (e.g., "other criteria")
                    if not re.search(r"\bother (criteria|disease|treatment|drug|agent|therapy|symptom|sign)\b", txt):
                        other_mentioned = True
                continue

            # exact phrase: race + human noun (preferred)
            p1 = re.compile(rf"\b{token}\s+(?:{human_nouns})\b", flags=re.I)
            # human noun + of/from + race (e.g., "participants of Asian descent")
            p2 = re.compile(rf"\b(?:{human_nouns})\s+(?:of|from)\s+{token}\b", flags=re.I)
            # short proximity fallback: up to 3 words between race and human noun
            p3 = re.compile(rf"\b{token}\b(?:\W+\w+){{0,3}}?\W+(?:{human_nouns})\b|\b(?:{human_nouns})\b(?:\W+\w+){{0,3}}?\W+\b{token}\b",
                            flags=re.I)

            matches = []
            for p in (p1, p2, p3):
                for m in p.findall(txt):
                    # .findall returns different shapes depending on groups; build a safe string
                    if isinstance(m, tuple):
                        # join non-empty parts
                        snippet = " ".join([part for part in m if part]).strip()
                    else:
                        snippet = m.strip()
                    if snippet:
                        matches.append(snippet)

            # Filter matches that include lab/biochem words
            valid = [m for m in matches if not exclusion_context.search(m)]
            if valid:
                # mark this race as matched and record short phrases
                matched_races[race] = True
                # normalize spacing and lowercase the captured phrase
                for v in valid:
                    captured.append(re.sub(r"\s+", " ", v.lower()).strip())

        # Deduplicate preserving order, normalized by lowercase
        seen = set()
        unique = []
        for phrase in captured:
            norm = phrase.strip().lower()
            if norm and norm not in seen:
                seen.add(norm)
                unique.append(phrase.strip())

        # Set Race_reason (comma-separated) or "no mention"
        if unique:
            self.parsed["Race_reason"] = ", ".join(unique)
        else:
            self.parsed["Race_reason"] = "no mention"

        # Now set each race column to "YES"/"NO"
        for race in matched_races:
            self.parsed[race] = "YES" if matched_races[race] else "NO"

        # Decide Other:
        # - If no mention at all -> Other = NO
        # - If mention exists but no matched known race -> Other YES only if other_mentioned or there's raw "race" phrases
        if self.parsed["Race_reason"] == "no mention":
            self.parsed["Other"] = "NO"
        else:
            # if any known race matched, Other = NO
            if any(matched_races.values()):
                self.parsed["Other"] = "NO"
            else:
                # no known races matched but something was mentioned; set Other YES only if explicit 'other' pattern was found
                self.parsed["Other"] = "YES" if other_mentioned else "NO"



    def _conditions(self):
        # Identifies which neurofibromatosis subtype or related conditions are being studied.
        conds = self.study.get("protocolSection", {}).get("conditionsModule", {}).get("conditions", [])
        cond_str = " ".join([c.lower() for c in conds])

        nf1_match = re.search(r"\bnf[\s-]?1\b|\bneurofibromatosis type[\s-]?1\b", cond_str)
        nf2_match = re.search(r"\bnf[\s-]?2\b|\bneurofibromatosis type[\s-]?2\b", cond_str)
        schw_match = re.search(r"schwann?omatosis", cond_str)
        under_inv_match = re.search(r"under investigation|investigational|being studied", cond_str)

        # Build the parsed dictionary
        self.parsed.update({
            "Neurofibromatosis Type 1": bool(nf1_match),
            "Neurofibromatosis Type 2": bool(nf2_match),
            "Schwannomatosis": bool(schw_match),
            "Under_Investigation": bool(under_inv_match),
            "conditions_source_text": ", ".join(
                filter(None, [
                    nf1_match.group(0) if nf1_match else None,
                    nf2_match.group(0) if nf2_match else None,
                    schw_match.group(0) if schw_match else None,
                    under_inv_match.group(0) if under_inv_match else None,
                ])
            ) or "" # if no match return nothing
        })

    def _family(self):
        """
        Detects mentions of family members in eligibility text,
        only if mentioned in relation to a medical condition.
        Extracts matching phrases into Family_source_text.
        """
        txt = (self.parsed.get("eligibility_raw") or "").lower()

        # Family and condition patterns
        family_terms = r"(parent[s]?|sibling[s]?|child(ren)?|relative|family member|cousin|aunt|uncle|grandparent)"
        condition_terms = r"(nf[\s-]?[12]|neurofibromatosis|schwannomatosis|tumou?r|disease|condition|diagnos|genetic)"

        # Pattern allowing up to 15 words between family and condition terms
        pattern = rf"(\b{family_terms}\b(?:\W+\w+){{0,15}}?\b{condition_terms}\b|\b{condition_terms}\b(?:\W+\w+){{0,15}}?\b{family_terms}\b)"

        # Find matches of family-condition contexts
        matches = re.findall(pattern, txt)

        # If matches are found, determine which family members are mentioned
        if matches:
            context_text = " ".join([m[0] if isinstance(m, tuple) else m for m in matches])

            parents = bool(re.search(r"\bparent(s)?\b", context_text))
            siblings = bool(re.search(r"\bsibling(s)?\b", context_text))
            children = bool(re.search(r"\bchild(ren)?\b", context_text))
            other = bool(re.search(r"\b(relative|family member|cousin|aunt|uncle|grandparent)\b", context_text))
            no_one = not (parents or siblings or children or other)

            # Deduplicate phrases
            seen = set()
            unique_phrases = [x.strip() for x in matches if x[0] not in seen and not seen.add(x[0])]
            self.parsed["Family_source_text"] = " | ".join(unique_phrases)

        else:
            # No family-condition context found
            parents = siblings = children = other = False
            no_one = True
            self.parsed["Family_source_text"] = ""

        # Update parsed fields
        self.parsed.update({
            "Parents": parents,
            "Siblings": siblings,
            "Children": children,
            "Other": other,
            "No_one_in_family": no_one
        })


    def _medications(self):
        """
        Detects if the eligibility text mentions participants being on medication,
        receiving treatment, or undergoing therapy. Adds 'Medication_source_text'
        to show the exact context phrase(s).
        """
        txt = (self.parsed.get("eligibility_raw") or "").lower()

        # Contextual medication-related patterns
        med_patterns = [
            r"receiving (?:any )?(?:drug|treatment|therapy|medication)",
            r"patients? (?:on|undergoing|receiving) (?:any )?(?:drug|therapy|treatment|medication)",
            r"use of (?:an|any) investigational (?:drug|therapy)",
            r"active (?:pharmaceutical|medical) therapy",
            r"currently (?:taking|receiving|under) (?:any )?(?:drug|treatment|therapy|medication)",
            r"any medication for treatment of",
            r"treated with",
            r"under (?:treatment|therapy)",
            r"receiving investigational (?:treatment|therapy)"
        ]

        found_phrases = []
        for pat in med_patterns:
            # Match full sentence fragment containing the keyword
            matches = re.findall(rf"([^.]*{pat}[^.]*)", txt)
            # Ensure all are strings, not tuples
            found_phrases.extend([m.strip() for m in matches if isinstance(m, str)])

        # Deduplicate
        seen = set()
        found_phrases = [f for f in found_phrases if not (f in seen or seen.add(f))]

        if found_phrases:
            self.parsed.update({
                "Medication_yes": True,
                "Medication_no": False,
                "Medication_source_text": ", ".join(found_phrases)
            })
        else:
            self.parsed.update({
                "Medication_yes": False,
                "Medication_no": True,
                "Medication_source_text": ""
            })


    def _drugs(self):
        """
        Detects known and unlisted drugs in eligibility criteria text.
        - Flags specific known drugs.
        - Captures all drug names (known and other) into 'Drug_source_text'.
        - 'Other_drug' = True if any drug outside the known list is found.
        """
    
        txt = (self.parsed.get("eligibility_raw") or "").lower()

        # --- Known drugs ---
        known_drugs = {
            "Selumetinib(kosulego)": r"\bselumetinib\b|\bkosulego\b",
            "Bevacizumab(avastin)": r"\bbevacizumab\b|\bavastin\b",
            "Everolimus(afinitor)": r"\beverolimus\b|\bafinitor\b",
            "Trametinib(mekinist)": r"\btrametinib\b|\bmekinist\b",
        }

        # --- Extended known list ---
        other_known_drugs = [
            "ipilimumab", "nivolumab", "sorafenib", "tipifarnib", "dabrafenib",
            "fluconazole", "binimetinib", "cobimetinib", "pembrolizumab", "mirdametinib",
            "cetuximab", "sirolimus", "temsirolimus", "brigatinib", "neratinib",
            "vemurafenib", "dolutegravir", "emtricitabine", "raltegravir", "tenofovir",
            "mitomycin", "cytarabine", "fludarabine", "tacrolimus", "carboplatin", "imatinib"
        ]

        # --- Generic drug name pattern ---
        generic_drug_pattern = r"\b[a-z]{3,}(mab|nib|limus|zole|platin|mycin|cillin|vir|trexate|xan|prost|gliflozin|gliptin|lukast|ciclovir|oxacin|olol|dipine)\b"

        found_known = set()
        found_other = set()

        # --- Match known drugs ---
        for drug, pat in known_drugs.items():
            if re.search(pat, txt):
                self.parsed[drug] = True
                found_known.add(drug.split("(")[0].lower())  # store the clean drug name
            else:
                self.parsed[drug] = False

        # --- Match extended known drugs ---
        for drug in other_known_drugs:
            if re.search(rf"\b{re.escape(drug)}\b", txt):
                found_other.add(drug.lower())

        # --- Dynamic generic drug detection ---
        dynamic_matches = re.findall(generic_drug_pattern, txt)
        if dynamic_matches:
            # Re-run with context capture to get full words, not just suffix
            word_matches = re.findall(r"\b[a-z]{3,}(?:mab|nib|limus|zole|platin|mycin|cillin|vir|trexate|xan|prost|gliflozin|gliptin|lukast|ciclovir|oxacin|olol|dipine)\b", txt)
            found_other.update(map(str.lower, word_matches))

        # --- Combine and deduplicate ---
        all_drugs = sorted(set(found_known) | found_other)

        # --- Flag other_drug if any not in known list ---
        other_drug_flag = any(d not in {k.split("(")[0].lower() for k in known_drugs} for d in found_other)

        # --- Update parsed dict ---
        self.parsed.update({
            "Other_drug": other_drug_flag,
            "Drug_source_text": ", ".join(all_drugs)
        })


    def _surgery(self):
        """
        Detects whether surgery or related procedures are mentioned in eligibility text.
        - Flags Surgery_yes / Surgery_no.
        - Returns only the matched surgical terms (not full sentences) in Surgery_source_text.
        """
        txt = (self.parsed.get("eligibility_raw") or "").lower()

        # Define all surgery-related terms
        surgery_terms = [
            "surgery", "surgical", "operation", "operative",
            "resection", "biopsy", "excision", "tumor removal",
            "debulking", "neurosurgery", "orthopedic surgery"
        ]

        # Match any of the keywords
        pattern = r"\b(" + "|".join(map(re.escape, surgery_terms)) + r")\b"
        matches = re.findall(pattern, txt)

        if matches:
            unique_terms = sorted(set(matches))
            self.parsed.update({
                "Surgery_yes": True,
                "Surgery_no": False,
                "Surgery_source_text": ", ".join(unique_terms)
            })
        else:
            self.parsed.update({
                "Surgery_yes": False,
                "Surgery_no": True,
                "Surgery_source_text": ""
            })


    def _comorbidities(self):
        """
        Detects comorbidities mentioned in the eligibility text.
        - Flags Diabetes, Hypertension, and Asthma individually (True/False)
        - Flags No_comorbidity if no condition is mentioned
        - Flags Other_comorbidity if other illnesses are found
        - Extracts all matched comorbidity terms into Comorbidity_source_text
        """
        txt = (self.parsed.get("eligibility_raw") or "").lower()

        # Define regex patterns for specific comorbidities
        comorb_patterns = {
            "Diabetes": r"\bdiabet(es|ic)\b",
            "Hypertension": r"\bhypertension|high blood pressure\b",
            "Asthma": r"\basthma\b"
        }

        # Broader set of possible comorbidities for "Other_comorbidity"
        other_patterns = [
            r"\bcancer|tumou?r",
            r"\bepilepsy|seizure\b",
            r"\bheart disease|cardiac|heart failure\b",
            r"\bliver|hepatic\b",
            r"\bstroke|cerebrovascular\b",
            r"\bpsychiatric|mental|depression|anxiety|schizo\b",
            r"\bthyroid|endocrine\b"
        ]

        found_terms = []

        # Check specific comorbidities
        for cond, pat in comorb_patterns.items():
            match = re.findall(pat, txt)
            self.parsed[cond] = bool(match)
            if match:
                found_terms.append(cond)

        # Detect other comorbidities
        other_found = []
        for pat in other_patterns:
            match = re.findall(pat, txt)
            if match:
                other_found.extend(match)

        # Clean and combine captured terms
        other_found_cleaned = list(set([m.strip() for m in other_found if m]))
        if other_found_cleaned:
            found_terms.extend(other_found_cleaned)

        # Determine boolean flags
        any_comorb = bool(found_terms)
        no_comorb = not any_comorb
        other_comorb = bool(other_found_cleaned)

        self.parsed.update({
            "No_comorbidity": no_comorb,
            "Other_comorbidity": other_comorb,
            "Comorbidity_source_text": ", ".join(sorted(set(found_terms))) if found_terms else ""
        })



# === Helper Function ===
# Converts boolean values to YES/NO/NA strings for cleaner CSV export
def bool_to_yesno(value):
    """
    Map a cell value to its CSV-friendly text.

    Missing values (as judged by ``pd.isna``) become "NA", the booleans True
    and False become "YES" and "NO", and anything else is returned unchanged.
    """
    if pd.isna(value):
        return "NA"
    # Only genuine bool objects are translated; 0/1 and strings pass through.
    if isinstance(value, bool):
        return "YES" if value else "NO"
    return value

# === Runner ===
if __name__ == "__main__":
    # Fetch every matching study, parse each into one flat record, and export.
    fetcher = TrialFetcher(expr="Neurofibromatosis", status="ACTIVE_NOT_RECRUITING")
    print("Fetching trials...")
    raw_trials = fetcher.fetch_all()
    print(f"Fetched {len(raw_trials)} studies.")

    print("Parsing trials...")
    parsed_trials = [TrialParser(study).parse() for study in raw_trials]
    df = pd.DataFrame(parsed_trials)

    # Convert only boolean columns to YES/NO/NA.
    # Columns are detected by value *type*, not with isin([True, False]):
    # in Python 1 == True and 0 == False, so a value-based check would also
    # pick up numeric columns (e.g. an age of 1) and rewrite their missing
    # entries as the string "NA".
    for col in df.columns:
        if df[col].dropna().map(pd.api.types.is_bool).any():
            df[col] = df[col].map(bool_to_yesno)

    print(df.head())
    df.to_csv("clinical_trials_parsed.csv", index=False)
    print("Saved to clinical_trials_parsed.csv")

Fetching trials...
Fetched 41 studies.
Parsing trials...
        nct_id                                              title  \
0  NCT03741101  Treatment of NF1-related Plexiform Neurofibrom...   
1  NCT00924196  Natural History Study of Patients With Neurofi...   
2  NCT05276973  Testing the Addition of Ipatasertib to the Usu...   
3  NCT06287463  Study of DCC-3084 in Participants With Advance...   
4  NCT04931342  A Study Evaluating the Efficacy and Safety of ...   

                  status                      institution  \
0  ACTIVE_NOT_RECRUITING                     Region Skane   
1  ACTIVE_NOT_RECRUITING  National Cancer Institute (NCI)   
2  ACTIVE_NOT_RECRUITING  National Cancer Institute (NCI)   
3  ACTIVE_NOT_RECRUITING   Deciphera Pharmaceuticals, LLC   
4  ACTIVE_NOT_RECRUITING                Hoffmann-La Roche   

                                            url     Principal_investigator  \
0  https://clinicaltrials.gov/study/NCT03741101           Björn Sigurdsson   
1  ht

In [101]:
df.head(10)

Unnamed: 0,nct_id,title,status,institution,url,Principal_investigator,Min_age,Max_age,Age_range,Gender_raw,...,Drug_source_text,Surgery_yes,Surgery_no,Surgery_source_text,Diabetes,Hypertension,Asthma,No_comorbidity,Other_comorbidity,Comorbidity_source_text
0,NCT03741101,Treatment of NF1-related Plexiform Neurofibrom...,ACTIVE_NOT_RECRUITING,Region Skane,https://clinicaltrials.gov/study/NCT03741101,Björn Sigurdsson,1.0,17.0,1–17,ALL,...,,NO,YES,,NO,NO,NO,NO,YES,"heart failure, hepatic, liver, tumor"
1,NCT00924196,Natural History Study of Patients With Neurofi...,ACTIVE_NOT_RECRUITING,National Cancer Institute (NCI),https://clinicaltrials.gov/study/NCT00924196,"Brigitte C Widemann, M.D.",1.0,,>1,ALL,...,,YES,NO,"surgery, surgical",NO,NO,NO,NO,YES,tumor
2,NCT05276973,Testing the Addition of Ipatasertib to the Usu...,ACTIVE_NOT_RECRUITING,National Cancer Institute (NCI),https://clinicaltrials.gov/study/NCT05276973,Katherine C Fuh,18.0,,>18,FEMALE,...,carboplatin,YES,NO,"debulking, surgery",YES,NO,NO,NO,YES,"Diabetes, cancer, cardiac, liver, psychiatric,..."
3,NCT06287463,Study of DCC-3084 in Participants With Advance...,ACTIVE_NOT_RECRUITING,"Deciphera Pharmaceuticals, LLC",https://clinicaltrials.gov/study/NCT06287463,,18.0,,>18,ALL,...,,YES,NO,surgery,NO,NO,NO,NO,YES,"cancer, cardiac, tumor"
4,NCT04931342,A Study Evaluating the Efficacy and Safety of ...,ACTIVE_NOT_RECRUITING,Hoffmann-La Roche,https://clinicaltrials.gov/study/NCT04931342,Clinical Trials,18.0,,>18,FEMALE,...,,YES,NO,surgery,NO,NO,NO,NO,YES,"cancer, tumor"
5,NCT05363267,"NF-1, Nutraceutical Intervention",ACTIVE_NOT_RECRUITING,"Masonic Cancer Center, University of Minnesota",https://clinicaltrials.gov/study/NCT05363267,"Christopher Moertel, MD",18.0,,>18,ALL,...,selumetinib,NO,YES,,NO,NO,NO,NO,YES,psychiatric
6,NCT04924608,Efficacy and Safety of Selumetinib in Adults W...,ACTIVE_NOT_RECRUITING,AstraZeneca,https://clinicaltrials.gov/study/NCT04924608,"Alice P. Chen, MD",18.0,,>18,ALL,...,,NO,YES,,NO,YES,NO,NO,YES,"Hypertension, heart disease, heart failure"
7,NCT03975829,Pediatric Long-Term Follow-up and Rollover Study,ACTIVE_NOT_RECRUITING,Novartis Pharmaceuticals,https://clinicaltrials.gov/study/NCT03975829,Novartis Pharmaceuticals,1.0,99.0,1–99,ALL,...,"dabrafenib, trametinib",NO,YES,,NO,NO,NO,NO,YES,mental
8,NCT03363217,Trametinib for Pediatric Neuro-oncology Patien...,ACTIVE_NOT_RECRUITING,St. Justine's Hospital,https://clinicaltrials.gov/study/NCT03363217,"Sébastien Perreault, MD",1.0,25.0,1–25,ALL,...,trametinib,YES,NO,"biopsy, surgery",YES,NO,NO,NO,YES,"Diabetes, cancer, cardiac, heart failure, live..."
9,NCT01552434,Bevacizumab and Temsirolimus Alone or in Combi...,ACTIVE_NOT_RECRUITING,M.D. Anderson Cancer Center,https://clinicaltrials.gov/study/NCT01552434,Sarina A Piha-Paul,,,,ALL,...,"bevacizumab, cetuximab, sirolimus, temsirolimus",YES,NO,surgery,NO,YES,NO,NO,YES,"Hypertension, cancer, cerebrovascular"


In [102]:
df.columns

Index(['nct_id', 'title', 'status', 'institution', 'url',
       'Principal_investigator', 'Min_age', 'Max_age', 'Age_range',
       'Gender_raw', 'Male', 'Female', 'Prefer_not_to_say', 'eligibility_raw',
       'Pregnancy_yes', 'Pregnancy_no', 'Pregnancy_reason', 'Race_reason',
       'Asian', 'American_Indian', 'Black', 'Hispanic', 'Middle_Eastern',
       'White', 'Other', 'Neurofibromatosis Type 1',
       'Neurofibromatosis Type 2', 'Schwannomatosis', 'Under_Investigation',
       'conditions_source_text', 'Family_source_text', 'Parents', 'Siblings',
       'Children', 'No_one_in_family', 'Medication_yes', 'Medication_no',
       'Medication_source_text', 'Selumetinib(kosulego)',
       'Bevacizumab(avastin)', 'Everolimus(afinitor)', 'Trametinib(mekinist)',
       'Other_drug', 'Drug_source_text', 'Surgery_yes', 'Surgery_no',
       'Surgery_source_text', 'Diabetes', 'Hypertension', 'Asthma',
       'No_comorbidity', 'Other_comorbidity', 'Comorbidity_source_text'],
      dtype='ob

In [103]:
df[['Diabetes', 'Hypertension', 'Asthma', 'No_comorbidity', 'Other_comorbidity', 'Comorbidity_source_text']]

Unnamed: 0,Diabetes,Hypertension,Asthma,No_comorbidity,Other_comorbidity,Comorbidity_source_text
0,NO,NO,NO,NO,YES,"heart failure, hepatic, liver, tumor"
1,NO,NO,NO,NO,YES,tumor
2,YES,NO,NO,NO,YES,"Diabetes, cancer, cardiac, liver, psychiatric,..."
3,NO,NO,NO,NO,YES,"cancer, cardiac, tumor"
4,NO,NO,NO,NO,YES,"cancer, tumor"
5,NO,NO,NO,NO,YES,psychiatric
6,NO,YES,NO,NO,YES,"Hypertension, heart disease, heart failure"
7,NO,NO,NO,NO,YES,mental
8,YES,NO,NO,NO,YES,"Diabetes, cancer, cardiac, heart failure, live..."
9,NO,YES,NO,NO,YES,"Hypertension, cancer, cerebrovascular"


In [91]:
df['Surgery_source_text'][10]

'surgery'