In [92]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# import selenium exceptions
from selenium.common.exceptions import *
from dataclasses import dataclass
import os

from bs4 import BeautifulSoup
ctg_studies = pd.read_csv('ctg-studies.csv')
# Zip together 'NCT Number' and "Study URL"
studies = list(zip(ctg_studies['NCT Number'], ctg_studies['Study URL']))

options = Options()
# options.add_argument('--headless')
options.add_argument('--window-size=1920x1080')
driver = webdriver.Chrome(options=options)

unscraped_links = []
df = pd.DataFrame(columns=['NCT Number', 'Study URL', 'Table'])

@dataclass
class Study:
    nct: str
    link: str
    table: str

studies_dc = []
# Create an empty file 'studies_scraped_tables.csv' to store the scraped tables if it doesn't exist
if not os.path.exists('studies_scraped_tables.csv'):
    with open('studies_scraped_tables.csv', 'w') as f:
        f.write('NCT Number,Study URL,Table\n')

for nct, link in studies[:1]:
    # 1. Use Selenium to open the link
    # 2. Click on the "Results Posted" tab
    # 3. Click on the "Expand all" button with the attribute data-ga-category="Baseline Characteristics"
    # 4. Extract the first instance of a <table> tag that is a child of a <ctg-sticky-container> tag
    try:
        driver.get(link)
        # Wait until the "Results Posted" tab is clickable
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").is_displayed()
        )
        driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").click()
        print('Found Results Posted tab')
        # Wait until the "Expand all" button is clickable. Use XPATH to find the button by its data-ga-category attribute
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']")
        )
        print('Found Results Posted tab and Expand all button')
        driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.select_one('ctg-baseline-characteristics').select_one('table').prettify()
        if table:
            # Store in a dataframe with columns "NCT Number", "Study URL", and "Table"
            # First store in dataclass
            study = Study(nct, link, table)
            studies_dc.append(study)
            # write to csv
            with open('studies_scraped_tables.csv', 'a') as f:
                f.write(f'{study.nct},{study.link},{study.table}\n')
        else:
            print('No table found for', link)
            unscraped_links.append(link)
    except Exception as e:
        print(e)
        driver.close()

In [60]:
soup_table

'<table _ngcontent-ng-c4179008994="" style="width: max(417px, 100%);">\n <colgroup _ngcontent-ng-c4179008994="">\n  <col _ngcontent-ng-c4179008994="" style="width: 162px;"/>\n  <col _ngcontent-ng-c4179008994="" style="width: 1fr;"/>\n  <!-- -->\n  <!-- -->\n  <!-- -->\n  <!-- -->\n </colgroup>\n <!-- -->\n <thead _ngcontent-ng-c4179008994="">\n  <tr _ngcontent-ng-c4179008994="" class="sticky-head">\n   <th _ngcontent-ng-c4179008994="" class="sticky-col">\n    Arm/Group Title\n   </th>\n   <th _ngcontent-ng-c4179008994="" colspan="1">\n    Afatinib 50mg\n   </th>\n   <!-- -->\n  </tr>\n  <tr _ngcontent-ng-c4179008994="">\n   <td _ngcontent-ng-c4179008994="" class="sticky-col">\n    Arm/Group Description\n   </td>\n   <td _ngcontent-ng-c4179008994="" colspan="1">\n    <!-- -->\n    <div _ngcontent-ng-c4179008994="" class="rel">\n     <div _ngcontent-ng-c4179008994="" class="markup-collapsed" id="baseline-0-0">\n      <span _ngcontent-ng-c4179008994="">\n       Afatinib 50mg film coated t

In [88]:
soup_table = BeautifulSoup(studies_dc[0].table, "html.parser").prettify()
# Remove all of the text in between a < and > for each tag, except for the tag type
def remove_text_between_tags(table_str: str):
    """
    Removes all of the text in between a < and > for each tag, except for the tag type
    """

    # Remove all of the text in between a < and > for each tag, except for the tag type
    new_table = ''
    index = 0
    while index < len(table_str):
        if table_str[index] == '<':
            new_table += table_str[index]
            index += 1
            # add characters until the next space is found
            while (table_str[index] != ' ') and (table_str[index] != '>'):
                new_table += table_str[index]
                index += 1
            while table_str[index] != '>':
                index += 1
        else:
            new_table += table_str[index]
        index += 1
    
    # Remove all \n and spaces from string
    new_table = new_table.replace('\n', '')
    new_table = new_table.replace(' ', '')
    new_table = new_table.replace('<!--', '')
    return new_table

table_str = remove_text_between_tags(soup_table)

In [90]:
len(table_str)

1745

In [9]:
ctg_studies = pd.read_csv('ctg-studies.csv')
# Zip together 'NCT Number' and "Study URL"
studies = list(zip(ctg_studies['NCT Number'], ctg_studies['Study URL']))

In [12]:
studies[1503:]

[('NCT00520676', 'https://clinicaltrials.gov/study/NCT00520676'),
 ('NCT00979576', 'https://clinicaltrials.gov/study/NCT00979576'),
 ('NCT02856581', 'https://clinicaltrials.gov/study/NCT02856581'),
 ('NCT00406276', 'https://clinicaltrials.gov/study/NCT00406276'),
 ('NCT03041181', 'https://clinicaltrials.gov/study/NCT03041181'),
 ('NCT02151981', 'https://clinicaltrials.gov/study/NCT02151981'),
 ('NCT00993499', 'https://clinicaltrials.gov/study/NCT00993499'),
 ('NCT00538681', 'https://clinicaltrials.gov/study/NCT00538681'),
 ('NCT03043599', 'https://clinicaltrials.gov/study/NCT03043599'),
 ('NCT00391274', 'https://clinicaltrials.gov/study/NCT00391274'),
 ('NCT01049776', 'https://clinicaltrials.gov/study/NCT01049776'),
 ('NCT02367781', 'https://clinicaltrials.gov/study/NCT02367781'),
 ('NCT00632281', 'https://clinicaltrials.gov/study/NCT00632281'),
 ('NCT01750281', 'https://clinicaltrials.gov/study/NCT01750281'),
 ('NCT03382899', 'https://clinicaltrials.gov/study/NCT03382899'),
 ('NCT0126

In [13]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# import selenium exceptions
from selenium.common.exceptions import *
from dataclasses import dataclass
import os

from bs4 import BeautifulSoup
ctg_studies = pd.read_csv('lung_cancer_studies_zipcodes_v2.csv')
# Zip together 'NCT Number' and "Study URL"
studies = list(zip(ctg_studies['NCT Number'], ctg_studies['Study URL']))

options = Options()
# options.add_argument('--headless')
options.add_argument('--window-size=1920x1080')
driver = webdriver.Chrome(options=options)

unscraped_links = []
df = pd.DataFrame(columns=['NCT Number', 'Study URL', 'Table'])

@dataclass
class Study:
    nct: str
    link: str
    table: str

studies_dc = []
def remove_text_between_tags(table_str: str):
    """
    Removes all of the text in between a < and > for each tag, except for the tag type
    """

    # Remove all of the text in between a < and > for each tag, except for the tag type
    new_table = ''
    index = 0
    while index < len(table_str):
        if table_str[index] == '<':
            new_table += table_str[index]
            index += 1
            # add characters until the next space is found
            while (table_str[index] != ' ') and (table_str[index] != '>'):
                new_table += table_str[index]
                index += 1
            while table_str[index] != '>':
                index += 1
        else:
            new_table += table_str[index]
        index += 1
    
    # Remove all \n and spaces from string
    new_table = new_table.replace('\n', '')
    new_table = new_table.replace(' ', '')
    new_table = new_table.replace('<!--', '')
    return new_table

for nct, link in studies:
    # 1. Use Selenium to open the link
    # 2. Click on the "Results Posted" tab
    # 3. Click on the "Expand all" button with the attribute data-ga-category="Baseline Characteristics"
    # 4. Extract the first instance of a <table> tag that is a child of a <ctg-sticky-container> tag
    try:
        driver.get(link)
        # Wait until the "Results Posted" tab is clickable
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").is_displayed()
        )
        driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").click()
        # Wait until the "Expand all" button is clickable. Use XPATH to find the button by its data-ga-category attribute
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']")
        )
        driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.select_one('ctg-baseline-characteristics').select_one('table').prettify()
        if table:
            # Store in a dataframe with columns "NCT Number", "Study URL", and "Table"
            # First store in dataclass
            # table = remove_text_between_tags(table)
            study = Study(nct, link, table)
            studies_dc.append(study)
            # write to tsv
            # with open('studies_scraped_tables.tsv', 'a') as f:
            #     f.write(f'{study.nct}\t{study.link}\t{study.table}\n')
        else:
            print('No table found for', link)
            unscraped_links.append(link)
            with open('unscraped_ncts.txt', 'a') as f:
                f.write(nct + '\n')
    except Exception as e:
        print(e)
        df = pd.DataFrame(studies_dc)
        studies_dc.to_stata('studies_scraped_tables_error.dta', write_index=False, version=118)
        driver.close()

# Save to stata .dta file
df = pd.DataFrame(studies_dc)
studies_dc.to_stata('studies_scraped_tables.dta', write_index=False, version=118)

AttributeError: 'list' object has no attribute 'to_stata'

In [17]:
df.to_stata('studies_scraped_tables_part2.dta', write_index=False, version=118)

In [3]:
len(studies_dc)

1503

In [8]:
pd.DataFrame(studies_dc).to_stata('studies_scraped_tables_part1.dta', write_index=False, version=118)

In [18]:
# Load and combine the two .dta files
part1 = pd.read_stata('studies_scraped_tables_part1.dta')
part2 = pd.read_stata('studies_scraped_tables_part2.dta')

combined = pd.concat([part1, part2], ignore_index=True)

#save
combined.to_stata('studies_scraped_tables.dta', write_index=False, version=118)

In [57]:
import pandas as pd
from jinja2 import Template

# Sample DataFrame
data = pd.read_stata('studies_scraped_tables.dta')
ctg_studies = pd.read_excel('lung_cancer_studies_zipcodes_v2.xlsx')

ctg_studies['NCT Number'] = ctg_studies['NCT Number'].astype(str)
# make original_studies['nct'] strings
data['nct'] = data['nct'].astype(str)

# list of ctg-studies NCT Number
ncts = ctg_studies['NCT Number'].tolist()

# filter data to only include ncts in ncts list
data = data[data['nct'].isin(ncts)]

# sort by nct, ascending
data = data.sort_values(by=['nct'])

data = data[data['nct'] == 'NCT00300495']

# Load the HTML template
with open('template.html', 'r') as file:
    template_content = file.read()
template = Template(template_content)

# Create a list of tables as (identifier, HTML_table) pairs
tables = [(row['nct'], row['table']) for _, row in data.iterrows()]

# Render the HTML
html_output = template.render(tables=tables)

# Save the HTML to a file
with open('output_NCT00300495.html', 'w') as output_file:
    output_file.write(html_output)

In [20]:
pd.read_stata('studies_scraped_tables.dta')

Unnamed: 0,nct,link,table
0,NCT00730925,https://clinicaltrials.gov/study/NCT00730925,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1,NCT02448225,https://clinicaltrials.gov/study/NCT02448225,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
2,NCT00480025,https://clinicaltrials.gov/study/NCT00480025,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
3,NCT00216125,https://clinicaltrials.gov/study/NCT00216125,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
4,NCT00356525,https://clinicaltrials.gov/study/NCT00356525,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
...,...,...,...
1626,NCT00750269,https://clinicaltrials.gov/study/NCT00750269,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1627,NCT00323869,https://clinicaltrials.gov/study/NCT00323869,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1628,NCT00280735,https://clinicaltrials.gov/study/NCT00280735,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1629,NCT02766335,https://clinicaltrials.gov/study/NCT02766335,"<table _ngcontent-ng-c4179008994="""" style=""wid..."


In [33]:
original_studies = pd.read_stata('studies_scraped_tables_original.dta')
ctg_studies = pd.read_excel('lung_cancer_studies_zipcodes_v2.xlsx')
# make NCT Number strings
ctg_studies['NCT Number'] = ctg_studies['NCT Number'].astype(str)
# make original_studies['nct'] strings
original_studies['nct'] = original_studies['nct'].astype(str)
ctg_studies = ctg_studies[~ctg_studies['NCT Number'].isin(original_studies['nct'])]
print(f"NCTs to scrape: {len(ctg_studies)}")

NCTs to scrape: 0


In [32]:
ctg_studies['NCT Number'].as

0       NCT00003492
1       NCT00003508
2       NCT00003726
3       NCT00003869
4       NCT00003901
           ...     
1254    NCT04940221
1255    NCT04971187
1256    NCT05030454
1257    NCT05091528
1258    NCT05553808
Name: NCT Number, Length: 1259, dtype: object

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from dataclasses import dataclass
import re

@dataclass
class StudyDetails:
    nct: str
    total_participants: int
    num_female: int
    num_male: int

studies = pd.read_stata('studies_scraped_tables.dta')
ctg_studies = pd.read_excel('lung_cancer_studies_zipcodes_v2.xlsx')
# studies that are alos in ctg_studies
studies = studies[studies['nct'].isin(ctg_studies['NCT Number'])]
studies = studies.sort_values(by=['nct'])

def modify_table_strings(table: str) -> str:
    table = table.replace('<!-- -->', ' ')
    table = table.replace('\n   ', ' ')
    table = table.replace('\n', ' ')
    return table

# apply modify_table_strings to each row in studies['table']
studies['table'] = studies['table'].apply(modify_table_strings)
# Drop "Total Number", "Female", and "Male" from ctg_studies
ctg_studies = ctg_studies[['NCT Number', 'Study Title', 'Study URL', 'Study Status',
       'Study Results', 'Conditions', 'Interventions', 'Surgery?', 'Drug?',
       'Behavioural?', 'Observational?', 'Phases', 'Funder Type', 'Study Type',
       'Results First Posted', 'Locations', 'Zipcodes']]

In [5]:
def get_sex_breakdown(row):
    # Iterate through each tag in the first element of soup
    # first tag in soup.children
    nct = row[0]
    table = row[1]
    num_female, num_male, total_participants = -1, -1, -1
    try:
        soup = BeautifulSoup(table, 'html.parser')
        soup_table = soup.select_one('table')
        
        # get the columns in the soup_table
        cols = soup_table.find_all('th')
        # check the .text of the cols for "Total", case insensitive
        total_col = None
        for idx, col in enumerate(cols):
            if "Total" in col.text or "total" in col.text:
                total_col = idx
                break

        # Find the element that has the text "Age" in it.
        # Then find the root parent
        tds = soup_table.find_all('td')
        relevant_tbody = ""
        for td in tds:
            if ('Sex' in td.text) or ("Gender" in td.text):
                relevant_tbody = td.parent.find_next("tbody")      
        if relevant_tbody == "": return (nct, -1, -1, -1)      

        sex_soup = BeautifulSoup(str(relevant_tbody), 'html.parser')

        trs = sex_soup.find_all('tr')
        if len(cols) == 2:
            # Continue as normal
            for tr in trs:
                tds = tr.find_all('td')
                if "Female" in tds[0].text:
                    if "%" in tds[-1].text:
                        # extract the first digit using regex
                        num_female = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    else: num_female = ''.join(filter(str.isdigit, tds[-1].text))
                    if num_female == "": num_female = -1
                    else: num_female = int(num_female)
                elif "Male" in tds[0].text:
                    if "%" in tds[-1].text:
                        # extract the first digit using regex
                        num_male = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    else: num_male = ''.join(filter(str.isdigit, tds[-1].text))
                    if num_male == "": num_male = -1
                    else: num_male = int(num_male)
                elif "Number Analyzed" in tds[0].text:
                    total_participants = ''.join(filter(str.isdigit, tds[-1].text))
                    if total_participants == "": total_participants = -1
                    else: total_participants = int(total_participants)
        elif len(cols) == 3 and total_col is not None:
            # Use total col
            print("col == 3 and total_col is not None: ", nct)
            pass
        elif len(cols) == 3 and total_col is None:
            # combine the two columns
            print("col == 3 and total_col is None: ", nct)
            pass
        elif len(cols) > 3 and total_col is not None:
            # use the total column
            # For each tr in trs, get the td at the index of total_col
            for tr in trs:
                tds = tr.find_all('td')
                if "Female" in tds[0].text:
                    if "%" in tds[total_col].text:
                        # extract the first digit using regex
                        num_female = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    else: num_female = ''.join(filter(str.isdigit, tds[total_col].text))
                    if num_female == "": num_female = -1
                    else: num_female = int(num_female)
                elif "Male" in tds[0].text:
                    if "%" in tds[total_col].text:
                        # extract the first digit using regex
                        num_male = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    else: num_male = ''.join(filter(str.isdigit, tds[total_col].text))
                    if num_male == "": num_male = -1
                    else: num_male = int(num_male)
                elif "Number Analyzed" in tds[0].text:
                    total_participants = ''.join(filter(str.isdigit, tds[total_col].text))
                    if total_participants == "": total_participants = -1
                    else: total_participants = int(total_participants)
        elif len(cols) > 3 and total_col is None:
            # need to manually check
            print("col > 3: Gender Breakdown manually check: ", nct)
            pass
    except Exception as e:
        print("Sex Exception: ", nct)
        return (nct, -1, -1, -1)
    return (nct, num_female, num_male, total_participants)

def get_age_breakdown(row):
    nct = row[0]
    table = row[1]
    mean, median = -1, -1
    mean_flag, median_flag = False, False

    if "Age, Continuous" not in table: return (nct, -1, -1)
    try:
        soup = BeautifulSoup(table, 'html.parser')
        soup_table = soup.select_one('table')
        # get the columns in the soup_table
        cols = soup_table.find_all('th')
        # check the .text of the cols for "Total", case insensitive
        total_col = None
        for idx, col in enumerate(cols):
            if "Total" in col.text or "total" in col.text:
                total_col = idx
                break

        # Find the element that has the text "Age" in it.
        # Then find the root parent
        tds = soup_table.find_all('td')
        relevant_tbody = ""
        for td in tds:
            if ('Age, Continuous' in td.text):
                if ("Mean") in td.text: mean_flag = True
                if ("Median") in td.text: median_flag = True
                relevant_tbody = td.parent.find_next("tbody")  
        if relevant_tbody == "": 
            print(f"Could not find relevant_tbody for {nct}")
            return (nct, -1, -1, -1)      

        age_soup = BeautifulSoup(str(relevant_tbody), 'html.parser')
        trs = age_soup.find_all('tr')
        tr =  trs[-1]
        
        if len(cols) == 2:
            # Continue as normal
            tds = tr.find_all('td')

            if mean_flag: mean = float(re.match(r'\s*([0-9]+(?:\.\d+)?)\s*', tds[-1].text).group(1))
            if median_flag: median = float(re.match(r'\s*([0-9]+(?:\.\d+)?)\s*', tds[-1].text).group(1))
        elif len(cols) == 3 and total_col is not None:
            # Use total col
            print("col == 3 and total_col is not None: ", nct)
            pass
        elif len(cols) == 3 and total_col is None:
            # combine the two columns
            print("col == 3 and total_col is None: ", nct)
            pass
        elif len(cols) > 3 and total_col is not None:
            tds = tr.find_all('td')
            # print(nct)
            if mean_flag: mean = float(re.match(r'\s*([0-9]+(?:\.\d+)?)\s*', tds[total_col].text).group(1))
            # print(tds[total_col].text)
            if median_flag: median = float(re.match(r'\s*([0-9]+(?:\.\d+)?)\s*', tds[total_col].text).group(1))
        elif len(cols) > 3 and total_col is None:
            # need to manually check
            print("col > 3: Gender Breakdown manually check: ", nct)
            pass
    except Exception as e:
        print("Age Exception: ", nct)
        return (nct, -1, -1)
    return (nct, mean, median)

@dataclass
class RaceEthnicity:
    american_indian: int
    asian: int
    native_hawaiian: int
    black: int
    white: int
    mixed: int
    race_unknown: int
    hispanic: int
    not_hispanic: int
    ethnicity_unknown: int
    race_ethnicity_flag: int

def get_race_breakdown(row):
    # Flag if "Race/Ethnicity" is in the table
    race_ethnicity_flag = False
    nct = row[0]
    table = row[1]

    # race categories
    american_indian, asian, native_hawaiian, black, white, mixed, race_unknown = -1, -1, -1, -1, -1, -1, -1

    # nih ethnicity
    hispanic, not_hispanic, ethnicity_unknown = -1, -1, -1

    if "Race" in table or "Ethnicity" in table: race_ethnicity_flag = True
    else: return (nct, RaceEthnicity(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))

    try:
        soup = BeautifulSoup(table, 'html.parser')
        soup_table = soup.select_one('table')
        # get the columns in the soup_table
        cols = soup_table.find_all('th')
        # check the .text of the cols for "Total", case insensitive
        total_col = None
        for idx, col in enumerate(cols):
            if "Total" in col.text or "total" in col.text:
                total_col = idx
                break

        tds = soup_table.find_all('td')
        ethnicity_tbody, race_tbody = "", ""
        for td in tds:
            if ('Ethnicity (NIH/OMB)' in td.text):
                ethnicity_tbody = td.parent.find_next("tbody")  
            if ("Race (NIH/OMB)" in td.text):
                race_tbody = td.parent.find_next("tbody")

        # Ethnicity
        ethnicity_soup = BeautifulSoup(str(ethnicity_tbody), 'html.parser')
        trs = ethnicity_soup.find_all('tr')
        if len(cols) == 2:
            for tr in trs:
                tds = tr.find_all('td')
                if "Not Hispanic or Latino" in tds[0].text:
                    not_hispanic = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Not Hispanic or Latino: ", not_hispanic)
                elif "Hispanic or Latino" in tds[0].text:
                    hispanic = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Hispanic or Latino: ", hispanic)
                elif "Unknown or Not Reported" in tds[0].text:
                    ethnicity_unknown = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Unknown ethnicity:", ethnicity_unknown)
        else:
            for tr in trs:
                tds = tr.find_all('td')
                if "Not Hispanic or Latino" in tds[0].text:
                    not_hispanic = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Not Hispanic or Latino: ", not_hispanic)
                elif "Hispanic or Latino" in tds[0].text:
                    hispanic = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Hispanic or Latino: ", hispanic)
                elif "Unknown or Not Reported" in tds[0].text:
                    ethnicity_unknown = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Unknown ethnicity:", ethnicity_unknown)

        # Race
        race_soup = BeautifulSoup(str(race_tbody), 'html.parser')
        trs = race_soup.find_all('tr')
        if len(cols) == 2:
            for tr in trs:
                tds = tr.find_all('td')
                if "American Indian or Alaska Native" in tds[0].text:
                    american_indian = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    print("American Indian or Alaska Native:", american_indian)
                elif "Asian" in tds[0].text:
                    asian = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Asian:", asian)
                elif "Native Hawaiian or Other Pacific Islander" in tds[0].text:
                    native_hawaiian = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Native Hawaiian or Other Pacific Islander:", native_hawaiian)
                elif "Black or African American" in tds[0].text:
                    black = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Black or African American:", black)
                elif "White" in tds[0].text:
                    white = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("White:", white)
                elif "Unknown or Not Reported" in tds[0].text:
                    race_unknown = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Race unknown:", race_unknown)
                elif "More than one race" in tds[0].text:
                    mixed = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[-1].text).group(2))
                    # print("Mixed:", mixed)
        else:
            for tr in trs:
                tds = tr.find_all('td')
                if "American Indian or Alaska Native" in tds[0].text:
                    american_indian = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    print("American Indian or Alaska Native:", american_indian)
                elif "Asian" in tds[0].text:
                    asian = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Asian:", asian)
                elif "Native Hawaiian or Other Pacific Islander" in tds[0].text:
                    native_hawaiian = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Native Hawaiian or Other Pacific Islander:", native_hawaiian)
                elif "Black or African American" in tds[0].text:
                    black = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Black or African American:", black)
                elif "White" in tds[0].text:
                    white = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("White:", white)
                elif "Unknown or Not Reported" in tds[0].text:
                    race_unknown = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Race unknown:", race_unknown)
                elif "More than one race" in tds[0].text:
                    mixed = int(re.match(r'\s*([^\d]+)\s*(\d+)', tds[total_col].text).group(2))
                    # print("Mixed:", mixed)

    except Exception as e:
        print(f"Race/Ethnicity Exception: {nct}")
        return (nct, RaceEthnicity(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1))

    ret_object = RaceEthnicity(american_indian=american_indian, \
                            asian=asian,\
                            native_hawaiian=native_hawaiian,\
                            black=black,\
                            white=white,\
                            race_unknown=race_unknown,\
                            mixed=mixed,\
                            hispanic=hispanic,\
                            not_hispanic=not_hispanic,\
                            ethnicity_unknown=ethnicity_unknown,\
                            race_ethnicity_flag=race_ethnicity_flag)
    
    return (nct, ret_object)

In [6]:
study_details = []
ncts_tables = list(zip(studies['nct'], studies['table']))

gender_breakdowns = [get_sex_breakdown(row) for row in ncts_tables]
age_breakdowns = [get_age_breakdown(row) for row in ncts_tables]
race_ethnicity_breakdowns = [get_race_breakdown(row) for row in ncts_tables]

Sex Exception:  NCT00300495
Sex Exception:  NCT02712905
Sex Exception:  NCT04314284
Age Exception:  NCT00137839
Age Exception:  NCT00252382
Age Exception:  NCT00373425
Age Exception:  NCT00492206
Age Exception:  NCT00531284
Age Exception:  NCT00550654
Age Exception:  NCT00840749
Age Exception:  NCT01021215
Age Exception:  NCT01177397
Age Exception:  NCT01523587
Age Exception:  NCT01524783
Age Exception:  NCT01587703
Age Exception:  NCT01970865
Age Exception:  NCT02027428
Age Exception:  NCT02034123
Age Exception:  NCT02213133
Age Exception:  NCT02222922
Age Exception:  NCT02289456
Age Exception:  NCT02289690
Age Exception:  NCT02296125
Age Exception:  NCT02336451
Age Exception:  NCT02393248
Age Exception:  NCT02411448
Age Exception:  NCT02423343
Age Exception:  NCT02451930
Age Exception:  NCT02452554
Age Exception:  NCT02546986
Age Exception:  NCT02642042
Age Exception:  NCT02695290
Age Exception:  NCT02701400
Age Exception:  NCT02702921
Age Exception:  NCT02711137
Age Exception:  NCT0

In [4]:
get_age_breakdown(("NCT04940221", studies[studies['nct'] == "NCT04940221"]['table'].values[0]))

('NCT04940221', '64.1', -1)

In [11]:
gender_breakdowns_df = pd.DataFrame(gender_breakdowns, columns=["NCT Number", "Female", "Male", "Total Number"])
age_breakdowns_df = pd.DataFrame(age_breakdowns, columns=["NCT Number", "Mean", "Median"])
race_ethnicity_breakdowns_flattened = [(nct, data.american_indian, data.asian, data.native_hawaiian, 
                              data.black, data.white, data.mixed, data.race_unknown, data.hispanic, data.not_hispanic, 
                              data.ethnicity_unknown, data.race_ethnicity_flag) for nct, data in race_ethnicity_breakdowns]
race_ethnicity_breakdowns_df = pd.DataFrame(race_ethnicity_breakdowns_flattened, columns=["NCT Number", "Native American", "Asian", "Pacific", 
                                                                                "Black", "White", "Mixed", "Unknown Race", "Hispanic",
                                                                                "Non-His", "Unknown Ethnicity", "Race/Ethnicity Flag"])
new_ctg_studies = ctg_studies.merge(gender_breakdowns_df, on="NCT Number", how="left")
new_ctg_studies = new_ctg_studies.merge(age_breakdowns_df, on="NCT Number", how="left")
new_ctg_studies = new_ctg_studies.merge(race_ethnicity_breakdowns_df, on="NCT Number", how="left")

# new_ctg_studies.to_excel('lung_cancer_studies_scraped.xlsx', index=False)

In [12]:
start_dates = pd.read_csv("ctg-studies (6).csv")
# merge with new_ctg_studies on NCT Number
new_ctg_studies = new_ctg_studies.merge(start_dates, on="NCT Number", how="left")
new_ctg_studies.to_excel('lung_cancer_studies_scraped.xlsx', index=False)

In [13]:
new_ctg_studies[(new_ctg_studies["Race/Ethnicity Flag"] == True) & (new_ctg_studies["White"] == -1)]

Unnamed: 0,NCT Number,Study Title,Study URL,Study Status,Study Results,Conditions,Interventions,Surgery?,Drug?,Behavioural?,...,Pacific,Black,White,Mixed,Unknown Race,Hispanic,Non-His,Unknown Ethnicity,Race/Ethnicity Flag,Start Date
4,NCT00003901,Prognostic Study of Metastases in Patients Wit...,https://clinicaltrials.gov/study/NCT00003901,COMPLETED,YES,Lung Cancer,OTHER: immunohistochemistry staining method|PR...,1,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,1999-07
5,NCT00004547,"Treatment of Peritoneal Cancer With Surgery, P...",https://clinicaltrials.gov/study/NCT00004547,COMPLETED,YES,Abdominal Neoplasm|Colonic Neoplasm|Mesothelio...,PROCEDURE: Surgery|PROCEDURE: Continuous hyper...,0,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,2000-01
9,NCT00043108,"Combination Chemotherapy, Surgery, and Radiati...",https://clinicaltrials.gov/study/NCT00043108,COMPLETED,YES,Lung Cancer,DRUG: carboplatin|DRUG: paclitaxel|PROCEDURE: ...,1,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,2002-07
10,NCT00045162,S0124: Cisplatin Combined With Irinotecan or E...,https://clinicaltrials.gov/study/NCT00045162,COMPLETED,YES,Lung Cancer,DRUG: cisplatin|DRUG: etoposide|DRUG: irinotec...,0,1,0,...,-1,-1,-1,-1,-1,14,577,60,True,2002-11
28,NCT00073008,A Study Of Oral GW572016 In Advanced Or Metast...,https://clinicaltrials.gov/study/NCT00073008,TERMINATED,YES,"Lung Cancer, Non-Small Cell",DRUG: GW572016 (lapatinib),0,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,2003-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1237,NCT04422210,"A Study Evaluating The Safety, Tolerability, P...",https://clinicaltrials.gov/study/NCT04422210,TERMINATED,YES,Small Cell Lung Cancer,DRUG: Venetoclax|DRUG: Atezolizumab|DRUG: Carb...,0,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,9/22/20
1242,NCT04491084,"FLT3 Ligand, CD40 Agonist Antibody, and Stereo...",https://clinicaltrials.gov/study/NCT04491084,TERMINATED,YES,Non Small Cell Lung Cancer|Lung Cancer,DRUG: FLT3 Ligand (CDX-301)|BIOLOGICAL: anti-C...,0,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,1/1/21
1245,NCT04581824,Efficacy Comparison of Dostarlimab Plus Chemot...,https://clinicaltrials.gov/study/NCT04581824,ACTIVE_NOT_RECRUITING,YES,"Lung Cancer, Non-Small Cell",DRUG: Dostarlimab|DRUG: Pembrolizumab|DRUG: Ch...,0,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,11/19/20
1251,NCT04644315,A Home-Based Approach Study to Evaluate the Ef...,https://clinicaltrials.gov/study/NCT04644315,TERMINATED,YES,Neoplasms|Colorectal Neoplasms|Melanoma|Pancre...,DRUG: Alectinib,0,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,True,5/24/21
