In [92]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# import selenium exceptions
from selenium.common.exceptions import *
from dataclasses import dataclass
import os

from bs4 import BeautifulSoup
ctg_studies = pd.read_csv('ctg-studies.csv')
# Zip together 'NCT Number' and "Study URL"
studies = list(zip(ctg_studies['NCT Number'], ctg_studies['Study URL']))

options = Options()
# options.add_argument('--headless')
options.add_argument('--window-size=1920x1080')
driver = webdriver.Chrome(options=options)

unscraped_links = []
df = pd.DataFrame(columns=['NCT Number', 'Study URL', 'Table'])

@dataclass
class Study:
    nct: str
    link: str
    table: str

studies_dc = []
# Create an empty file 'studies_scraped_tables.csv' to store the scraped tables if it doesn't exist
if not os.path.exists('studies_scraped_tables.csv'):
    with open('studies_scraped_tables.csv', 'w') as f:
        f.write('NCT Number,Study URL,Table\n')

for nct, link in studies[:1]:
    # 1. Use Selenium to open the link
    # 2. Click on the "Results Posted" tab
    # 3. Click on the "Expand all" button with the attribute data-ga-category="Baseline Characteristics"
    # 4. Extract the first instance of a <table> tag that is a child of a <ctg-sticky-container> tag
    try:
        driver.get(link)
        # Wait until the "Results Posted" tab is clickable
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").is_displayed()
        )
        driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").click()
        print('Found Results Posted tab')
        # Wait until the "Expand all" button is clickable. Use XPATH to find the button by its data-ga-category attribute
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']")
        )
        print('Found Results Posted tab and Expand all button')
        driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.select_one('ctg-baseline-characteristics').select_one('table').prettify()
        if table:
            # Store in a dataframe with columns "NCT Number", "Study URL", and "Table"
            # First store in dataclass
            study = Study(nct, link, table)
            studies_dc.append(study)
            # write to csv
            with open('studies_scraped_tables.csv', 'a') as f:
                f.write(f'{study.nct},{study.link},{study.table}\n')
        else:
            print('No table found for', link)
            unscraped_links.append(link)
    except Exception as e:
        print(e)
        driver.close()

In [60]:
soup_table

'<table _ngcontent-ng-c4179008994="" style="width: max(417px, 100%);">\n <colgroup _ngcontent-ng-c4179008994="">\n  <col _ngcontent-ng-c4179008994="" style="width: 162px;"/>\n  <col _ngcontent-ng-c4179008994="" style="width: 1fr;"/>\n  <!-- -->\n  <!-- -->\n  <!-- -->\n  <!-- -->\n </colgroup>\n <!-- -->\n <thead _ngcontent-ng-c4179008994="">\n  <tr _ngcontent-ng-c4179008994="" class="sticky-head">\n   <th _ngcontent-ng-c4179008994="" class="sticky-col">\n    Arm/Group Title\n   </th>\n   <th _ngcontent-ng-c4179008994="" colspan="1">\n    Afatinib 50mg\n   </th>\n   <!-- -->\n  </tr>\n  <tr _ngcontent-ng-c4179008994="">\n   <td _ngcontent-ng-c4179008994="" class="sticky-col">\n    Arm/Group Description\n   </td>\n   <td _ngcontent-ng-c4179008994="" colspan="1">\n    <!-- -->\n    <div _ngcontent-ng-c4179008994="" class="rel">\n     <div _ngcontent-ng-c4179008994="" class="markup-collapsed" id="baseline-0-0">\n      <span _ngcontent-ng-c4179008994="">\n       Afatinib 50mg film coated t

In [88]:
soup_table = BeautifulSoup(studies_dc[0].table, "html.parser").prettify()
# Remove all of the text in between a < and > for each tag, except for the tag type
def remove_text_between_tags(table_str: str):
    """
    Removes all of the text in between a < and > for each tag, except for the tag type
    """

    # Remove all of the text in between a < and > for each tag, except for the tag type
    new_table = ''
    index = 0
    while index < len(table_str):
        if table_str[index] == '<':
            new_table += table_str[index]
            index += 1
            # add characters until the next space is found
            while (table_str[index] != ' ') and (table_str[index] != '>'):
                new_table += table_str[index]
                index += 1
            while table_str[index] != '>':
                index += 1
        else:
            new_table += table_str[index]
        index += 1
    
    # Remove all \n and spaces from string
    new_table = new_table.replace('\n', '')
    new_table = new_table.replace(' ', '')
    new_table = new_table.replace('<!--', '')
    return new_table

table_str = remove_text_between_tags(soup_table)

In [89]:
table_str

'<table<colgroup<col<col</colgroup<thead<tr<thArm/GroupTitle</th<thAfatinib50mg</th</tr<tr<tdArm/GroupDescription</td<td<div<div<spanAfatinib50mgfilmcoatedtabletswereadministeredoncedailyaslongastheyweretoleratedbypatients,untildiseaseprogression(accordingtotheresponseevaluationcriteriainsolidtumors)</span</div</div<a<svg<use</use</svgShowmore</a</td</tr<tr<td<spanOverallNumberofBaselineParticipants</span</td<td<span41</span</td</tr<tr<td<div<div<span[NotSpecified]</span</div</div<spanBaselineAnalysisPopulationDescription</span</td<td<div<span[NotSpecified]</span</div</td</tr</thead<tr<td<div<div<buttonExpandall</button/<buttonCollapseall</button</div</div</td<td</td</tr<tr<td<div<a<div<div<spanAge,Continuous</span</div<div<spanMean(StandardDeviation)</span|<spanUnitofmeasure:years</span</div</div<div<svg<use</use</svg</div</a<div<div<div<spanAge,Continuous</span</div<div<spanMean(StandardDeviation)</span|<spanUnitofmeasure:years</span</div</div<div<svg<use</use</svg</div</div</div</td

In [90]:
len(table_str)

1745

In [9]:
ctg_studies = pd.read_csv('ctg-studies.csv')
# Zip together 'NCT Number' and "Study URL"
studies = list(zip(ctg_studies['NCT Number'], ctg_studies['Study URL']))

In [12]:
studies[1503:]

[('NCT00520676', 'https://clinicaltrials.gov/study/NCT00520676'),
 ('NCT00979576', 'https://clinicaltrials.gov/study/NCT00979576'),
 ('NCT02856581', 'https://clinicaltrials.gov/study/NCT02856581'),
 ('NCT00406276', 'https://clinicaltrials.gov/study/NCT00406276'),
 ('NCT03041181', 'https://clinicaltrials.gov/study/NCT03041181'),
 ('NCT02151981', 'https://clinicaltrials.gov/study/NCT02151981'),
 ('NCT00993499', 'https://clinicaltrials.gov/study/NCT00993499'),
 ('NCT00538681', 'https://clinicaltrials.gov/study/NCT00538681'),
 ('NCT03043599', 'https://clinicaltrials.gov/study/NCT03043599'),
 ('NCT00391274', 'https://clinicaltrials.gov/study/NCT00391274'),
 ('NCT01049776', 'https://clinicaltrials.gov/study/NCT01049776'),
 ('NCT02367781', 'https://clinicaltrials.gov/study/NCT02367781'),
 ('NCT00632281', 'https://clinicaltrials.gov/study/NCT00632281'),
 ('NCT01750281', 'https://clinicaltrials.gov/study/NCT01750281'),
 ('NCT03382899', 'https://clinicaltrials.gov/study/NCT03382899'),
 ('NCT0126

In [13]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# import selenium exceptions
from selenium.common.exceptions import *
from dataclasses import dataclass
import os

from bs4 import BeautifulSoup
ctg_studies = pd.read_csv('lung_cancer_studies_zipcodes_v2.csv')
# Zip together 'NCT Number' and "Study URL"
studies = list(zip(ctg_studies['NCT Number'], ctg_studies['Study URL']))

options = Options()
# options.add_argument('--headless')
options.add_argument('--window-size=1920x1080')
driver = webdriver.Chrome(options=options)

unscraped_links = []
df = pd.DataFrame(columns=['NCT Number', 'Study URL', 'Table'])

@dataclass
class Study:
    nct: str
    link: str
    table: str

studies_dc = []
def remove_text_between_tags(table_str: str):
    """
    Removes all of the text in between a < and > for each tag, except for the tag type
    """

    # Remove all of the text in between a < and > for each tag, except for the tag type
    new_table = ''
    index = 0
    while index < len(table_str):
        if table_str[index] == '<':
            new_table += table_str[index]
            index += 1
            # add characters until the next space is found
            while (table_str[index] != ' ') and (table_str[index] != '>'):
                new_table += table_str[index]
                index += 1
            while table_str[index] != '>':
                index += 1
        else:
            new_table += table_str[index]
        index += 1
    
    # Remove all \n and spaces from string
    new_table = new_table.replace('\n', '')
    new_table = new_table.replace(' ', '')
    new_table = new_table.replace('<!--', '')
    return new_table

for nct, link in studies:
    # 1. Use Selenium to open the link
    # 2. Click on the "Results Posted" tab
    # 3. Click on the "Expand all" button with the attribute data-ga-category="Baseline Characteristics"
    # 4. Extract the first instance of a <table> tag that is a child of a <ctg-sticky-container> tag
    try:
        driver.get(link)
        # Wait until the "Results Posted" tab is clickable
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").is_displayed()
        )
        driver.find_element(By.XPATH, "//*[contains(text(), 'Results Posted')]").click()
        # Wait until the "Expand all" button is clickable. Use XPATH to find the button by its data-ga-category attribute
        WebDriverWait(driver, 5).until(
            lambda driver: driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']")
        )
        driver.find_element(By.XPATH, "//button[@data-ga-action='Baseline Characteristics']").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.select_one('ctg-baseline-characteristics').select_one('table').prettify()
        if table:
            # Store in a dataframe with columns "NCT Number", "Study URL", and "Table"
            # First store in dataclass
            # table = remove_text_between_tags(table)
            study = Study(nct, link, table)
            studies_dc.append(study)
            # write to tsv
            # with open('studies_scraped_tables.tsv', 'a') as f:
            #     f.write(f'{study.nct}\t{study.link}\t{study.table}\n')
        else:
            print('No table found for', link)
            unscraped_links.append(link)
            with open('unscraped_ncts.txt', 'a') as f:
                f.write(nct + '\n')
    except Exception as e:
        print(e)
        df = pd.DataFrame(studies_dc)
        studies_dc.to_stata('studies_scraped_tables_error.dta', write_index=False, version=118)
        driver.close()

# Save to stata .dta file
df = pd.DataFrame(studies_dc)
studies_dc.to_stata('studies_scraped_tables.dta', write_index=False, version=118)

AttributeError: 'list' object has no attribute 'to_stata'

In [17]:
df.to_stata('studies_scraped_tables_part2.dta', write_index=False, version=118)

In [3]:
len(studies_dc)

1503

In [8]:
pd.DataFrame(studies_dc).to_stata('studies_scraped_tables_part1.dta', write_index=False, version=118)

In [18]:
# Load and combine the two .dta files
part1 = pd.read_stata('studies_scraped_tables_part1.dta')
part2 = pd.read_stata('studies_scraped_tables_part2.dta')

combined = pd.concat([part1, part2], ignore_index=True)

#save
combined.to_stata('studies_scraped_tables.dta', write_index=False, version=118)

In [36]:
import pandas as pd
from jinja2 import Template

# Sample DataFrame
data = pd.read_stata('studies_scraped_tables_original.dta')
ctg_studies = pd.read_excel('lung_cancer_studies_zipcodes_v2.xlsx')

ctg_studies['NCT Number'] = ctg_studies['NCT Number'].astype(str)
# make original_studies['nct'] strings
data['nct'] = data['nct'].astype(str)

# list of ctg-studies NCT Number
ncts = ctg_studies['NCT Number'].tolist()

# filter data to only include ncts in ncts list
data = data[data['nct'].isin(ncts)]

# sort by nct, ascending
data = data.sort_values(by=['nct'])

# Load the HTML template
with open('template.html', 'r') as file:
    template_content = file.read()
template = Template(template_content)

# Create a list of tables as (identifier, HTML_table) pairs
tables = [(row['nct'], row['table']) for _, row in data.iterrows()]

# Render the HTML
html_output = template.render(tables=tables)

# Save the HTML to a file
with open('output.html', 'w') as output_file:
    output_file.write(html_output)

In [20]:
pd.read_stata('studies_scraped_tables.dta')

Unnamed: 0,nct,link,table
0,NCT00730925,https://clinicaltrials.gov/study/NCT00730925,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1,NCT02448225,https://clinicaltrials.gov/study/NCT02448225,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
2,NCT00480025,https://clinicaltrials.gov/study/NCT00480025,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
3,NCT00216125,https://clinicaltrials.gov/study/NCT00216125,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
4,NCT00356525,https://clinicaltrials.gov/study/NCT00356525,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
...,...,...,...
1626,NCT00750269,https://clinicaltrials.gov/study/NCT00750269,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1627,NCT00323869,https://clinicaltrials.gov/study/NCT00323869,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1628,NCT00280735,https://clinicaltrials.gov/study/NCT00280735,"<table _ngcontent-ng-c4179008994="""" style=""wid..."
1629,NCT02766335,https://clinicaltrials.gov/study/NCT02766335,"<table _ngcontent-ng-c4179008994="""" style=""wid..."


In [33]:
original_studies = pd.read_stata('studies_scraped_tables_original.dta')
ctg_studies = pd.read_excel('lung_cancer_studies_zipcodes_v2.xlsx')
# make NCT Number strings
ctg_studies['NCT Number'] = ctg_studies['NCT Number'].astype(str)
# make original_studies['nct'] strings
original_studies['nct'] = original_studies['nct'].astype(str)
ctg_studies = ctg_studies[~ctg_studies['NCT Number'].isin(original_studies['nct'])]
print(f"NCTs to scrape: {len(ctg_studies)}")

NCTs to scrape: 0


In [32]:
ctg_studies['NCT Number'].as

0       NCT00003492
1       NCT00003508
2       NCT00003726
3       NCT00003869
4       NCT00003901
           ...     
1254    NCT04940221
1255    NCT04971187
1256    NCT05030454
1257    NCT05091528
1258    NCT05553808
Name: NCT Number, Length: 1259, dtype: object