Extract data from TheNNT (thennt.com) for input into the Auto LR generator

In [4]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

def get_specialty_links():
    """
    Scrape the TheNNT likelihood-ratio index page for specialties and their links.

    Returns:
        A list of dicts, one per <h3> heading found inside the
        'lr-byspecialty' section, shaped as
        {'specialty': <heading text>,
         'links': [{'display_name': <link text>, 'url': <href>}, ...]}.
        'links' is empty when the heading has no following sibling <ul>.
        An empty list is returned (after printing a message) when the page
        cannot be fetched or the section cannot be located.
    """

    url = 'https://thennt.com/home-lr/'

    try:
        # Without a timeout a stalled connection would block the script forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        # Report network failures the same way as a bad status code instead
        # of letting the exception escape to the caller.
        print(f"Failed to retrieve the webpage: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the section with "Diagnosis (LR) Reviews by Specialty"
    specialty_section = soup.find('div', class_='well subdisplay accordion_caption', id='lr-byspecialty')

    if not specialty_section:
        print("Could not find the 'Diagnosis (LR) Reviews by Specialty' section on the webpage.")
        return []

    results = []

    # Each <h3> names a specialty; the <ul> that follows it lists the reviews.
    for subheading in specialty_section.find_all('h3'):
        next_ul = subheading.find_next_sibling('ul')
        anchors = next_ul.find_all('a', href=True) if next_ul else []

        links = [
            {'display_name': a_tag.get_text(strip=True), 'url': a_tag['href']}
            for a_tag in anchors
        ]
        results.append({'specialty': subheading.get_text(strip=True), 'links': links})

    return results

def _lr_cell_text(cell):
    """Return the likelihood-ratio text of a <td>, or "Not reported" if it is blank."""
    cell_text = cell.get_text(strip=True)
    link = cell.find('a')
    if link:
        # Prefer the text of the first <a>; fall back to the whole cell when
        # the anchor itself carries no text.
        cell_text = link.get_text(strip=True) or cell_text
    return cell_text or "Not reported"


def extract_likelihood_ratios(page_content):
    """
    Parse every likelihood-ratio table inside <article class="lr_cards_details">.

    For each <table class="lrtable">:
      * if at least one <tr> contains <td> cells, every row with exactly two
        <td> cells yields one (finding, likelihood ratio) pair; rows with any
        other number of cells are ignored;
      * otherwise all <td> cells of the table are consumed sequentially in
        pairs (a trailing unpaired cell is dropped).

    The likelihood-ratio text is taken from the first <a> inside the value
    cell when that anchor has text, else from the cell itself; a blank value
    becomes "Not reported". Both parsing paths use the same rule.

    Args:
        page_content: HTML text of a review page.

    Returns:
        A list of (finding, likelihood ratio) tuples of strings; empty when
        the article section is missing.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    results = []

    # Locate the section containing likelihood ratio tables
    lr_section = soup.find('article', class_='lr_cards_details')
    if not lr_section:
        return results  # Return empty if no section found

    for table in lr_section.find_all('table', class_='lrtable'):
        # Collect the <td> cells of each row once; header-only rows (no <td>)
        # produce an empty list and are dropped.
        cells_per_row = [row.find_all('td') for row in table.find_all('tr')]
        data_rows = [cells for cells in cells_per_row if cells]

        if data_rows:
            pairs = [(cells[0], cells[1]) for cells in data_rows if len(cells) == 2]
        else:
            # Fallback: no row holds any <td>, so pair up whatever <td>
            # cells the table contains in document order.
            loose_cells = table.find_all('td')
            pairs = zip(loose_cells[0::2], loose_cells[1::2])

        results.extend(
            (finding_cell.get_text(strip=True), _lr_cell_text(value_cell))
            for finding_cell, value_cell in pairs
        )

    return results

def fetch_webpages(specialty_links, timeout=30, delay=1):
    """
    Fetch every linked review page and extract its likelihood-ratio findings.

    Args:
        specialty_links: Iterable of dicts with a 'specialty' name and a
            'links' list of {'display_name': ..., 'url': ...} dicts.
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to pause after each completed request, to avoid
            overwhelming the server.

    Returns:
        A dict mapping each link's display_name to the list returned by
        extract_likelihood_ratios for that page. Pages that raise a request
        error or answer with a non-200 status are omitted. If the same
        display_name occurs more than once, the later page replaces the
        earlier entry.
    """
    findings_by_display_name = {}

    for item in specialty_links:
        print(f"Fetching pages for Specialty: {item['specialty']}")

        for link in item['links']:
            display_name = link['display_name']
            url = link['url']

            print(f"  - Fetching: {display_name} ({url})")
            try:
                # Only the request itself can raise RequestException; the
                # timeout keeps one stalled page from hanging the whole run.
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as e:
                print(f"    Error fetching {display_name}: {e}")
                continue

            if response.status_code == 200:
                print(f"    Success: {display_name} page fetched.")

                # Extract and store the likelihood ratio findings
                findings_by_display_name[display_name] = extract_likelihood_ratios(response.text)
            else:
                print(f"    Failed to fetch {display_name} - Status Code: {response.status_code}")

            time.sleep(delay)  # Throttle between requests

        print("\n")  # Add space between specialties for readability

    return findings_by_display_name

def save_to_excel(findings_data, filename="nnt_lrs.xlsx", blank_values=False):
    """
    Save likelihood ratios to an Excel workbook, one sheet per display_name.

    Each sheet starts with a row holding the full display_name, followed
    directly by the (finding, likelihood ratio) rows; no column-header row is
    written. Entries with no findings are skipped with a message.

    Sheet names are derived from the **last** 31 characters of display_name
    (Excel's length limit). Characters Excel forbids in sheet titles are
    replaced with "_", and a "~N" suffix is applied when two names would
    otherwise collide, so no sheet silently overwrites another.

    If nothing remains to be written, no file is created: a workbook with
    zero sheets cannot be saved.

    Args:
        findings_data: Dict mapping display_name to a list of
            (finding, likelihood ratio) pairs.
        filename: Path of the workbook to write.
        blank_values: If True, the likelihood-ratio column is left blank.
    """
    max_sheet_len = 31
    # Excel rejects these characters in sheet titles.
    forbidden = str.maketrans({char: "_" for char in "[]:*?/\\"})

    sheets = {}       # final sheet name -> (display_name, findings)
    used_names = set()  # lower-cased, because Excel compares titles case-insensitively

    for display_name, findings in findings_data.items():
        if not findings:
            print(f"Skipping {display_name} (No data found).")
            continue

        base_name = str(display_name).translate(forbidden)[-max_sheet_len:] or "Sheet"
        sheet_name = base_name
        counter = 2
        while sheet_name.lower() in used_names:
            # Trim the front so the suffix still fits within the length limit.
            suffix = f"~{counter}"
            sheet_name = base_name[-(max_sheet_len - len(suffix)):] + suffix
            counter += 1

        used_names.add(sheet_name.lower())
        sheets[sheet_name] = (display_name, findings)

    if not sheets:
        print(f"\nNo likelihood ratios to save; {filename} was not written.")
        return

    with pd.ExcelWriter(filename, engine="openpyxl") as writer:
        for sheet_name, (display_name, findings) in sheets.items():
            df = pd.DataFrame(findings, columns=["Finding", "Likelihood Ratio"])

            if blank_values:
                df["Likelihood Ratio"] = ""  # Clear likelihood ratio values

            # Insert full display_name as the first row
            full_name_row = pd.DataFrame({df.columns[0]: [display_name], df.columns[1]: [""]})
            df = pd.concat([full_name_row, df], ignore_index=True)

            df.to_excel(writer, sheet_name=sheet_name, index=False, header=False)  # No default header

    print(f"\nLikelihood ratios saved to {filename}")

# Scrape the specialty index, then every linked review page
specialty_links = get_specialty_links()
findings_data = fetch_webpages(specialty_links)

# Write two workbooks: the normal one with likelihood ratios filled in,
# and a second one with the likelihood ratio column left blank
for output_name, hide_values in (
    ("nnt_lrs.xlsx", False),
    ("nnt_lrs_sans_number.xlsx", True),
):
    save_to_excel(findings_data, output_name, blank_values=hide_values)

Fetching pages for Specialty: Anesthesiology
  - Fetching: Diagnostic Accuracy of Ultrasound for Confirmation of Endotracheal Tube Placement (https://thennt.com/lr/diagnostic-accuracy-ultrasound-confirmation-endotracheal-tube-placement/)
    Success: Diagnostic Accuracy of Ultrasound for Confirmation of Endotracheal Tube Placement page fetched.
  - Fetching: Factors Predicting Difficult Endotracheal Intubation (https://thennt.com/lr/factors-predicting-difficult-endotracheal-intubation/)
    Success: Factors Predicting Difficult Endotracheal Intubation page fetched.


Fetching pages for Specialty: Cardiology
  - Fetching: Acute Coronary Syndrome (https://thennt.com/lr/acute-coronary-syndrome/)
    Success: Acute Coronary Syndrome page fetched.
  - Fetching: Aortic Dissection (https://thennt.com/lr/aortic-dissection/)
    Success: Aortic Dissection page fetched.
  - Fetching: Deep Venous Thrombosis (DVT) (https://thennt.com/lr/deep-venous-thrombosis-dvt/)
    Success: Deep Venous Thrombo

Functions for each individual webpage

In [2]:
# Define the URL
url = 'https://thennt.com/lr/lung-ultrasound-diagnosis-pneumonia-children/'

# Fetch the webpage content; the timeout keeps a stalled connection from
# hanging the cell indefinitely
response = requests.get(url, timeout=30)
if response.status_code != 200:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
    # exit() is an interactive-shell helper provided by the site module and
    # is not meant for programs; raise SystemExit directly instead.
    raise SystemExit(1)
page_content = response.text

# Parse the HTML content
soup = BeautifulSoup(page_content, 'html.parser')

In [3]:
# Dump the parsed document so its markup can be inspected
print(soup)


<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
<!-- This site is optimized with the Yoast SEO plugin v24.2 - https://yoast.com/wordpress/plugins/seo/ -->
<title>Lung Ultrasound for Diagnosis of Pneumonia in Children – TheNNT</title>
<link href="https://thennt.com/lr/lung-ultrasound-diagnosis-pneumonia-children/" rel="canonical">
<meta content="en_US" property="og:locale">
<meta content="article" property="og:type"/>
<meta content="Lung Ultrasound for Diagnosis of Pneumonia in Children – TheNNT" property="og:title"/>
<meta content="https://thennt.com/lr/lung-ultrasound-diagnosis-pneumonia-children/" property="og:url"/>
<meta content="TheNNT" property="og:site_name"/>
<meta content="https://www.facebook.com/theNNT/" property="article:publi