### Import Necessary Libraries

In [15]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import time
import json


### Scrape the condition names and corresponding URLs

In [22]:
def scrape_conditions_with_selenium():
    """
    Scrape condition names and URLs from the Everyday Health conditions page.

    Opens https://www.everydayhealth.com/conditions/ in Chrome, collects the
    anchors matched by the CSS selector below, and keeps the links that start
    with "https://www.everydayhealth.com/" and either contain "/guide" or
    split into exactly five pieces on "/" (for example
    "https://www.everydayhealth.com/eczema/").

    Returns:
        pandas.DataFrame: one row per kept link with the columns
        "Condition Name" (the anchor's stripped visible text, which may be
        empty) and "URL". Both columns are present even when nothing matched.
        Duplicate URLs are not removed.
    """

    driver = webdriver.Chrome()
    conditions = []

    # try/finally guarantees the browser is closed even if loading the page
    # or reading an element raises.
    try:
        driver.get("https://www.everydayhealth.com/conditions/")

        # Allow time for the page to fully load
        time.sleep(3)

        # Locate all links that contain "/conditions/", "/guide", or any "/".
        # The last alternative already matches the first two; the selector is
        # kept as-is and the real filtering happens below.
        elements = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/conditions/"], a[href*="/guide"], a[href*="/"]')

        for element in elements:
            url = element.get_attribute('href')  # Extract the URL from the element
            if not url:
                # get_attribute returns None when the attribute is missing;
                # skip instead of crashing on url.startswith below.
                continue

            text = element.text.strip()  # Extract the visible text (condition name)

            # Filter URLs:
            # - Must belong to "https://www.everydayhealth.com/"
            # - Should either contain "/guide" (detailed guide pages)
            # - Or should have exactly five segments when split by '/' (likely a condition page)
            if url.startswith("https://www.everydayhealth.com/") and ("/guide" in url or len(url.split('/')) == 5):
                conditions.append({'Condition Name': text, 'URL': url})  # Store condition data
    finally:
        # Close the browser whether or not extraction succeeded
        driver.quit()

    # Explicit columns keep the schema stable when no link was kept.
    return pd.DataFrame(conditions, columns=['Condition Name', 'URL'])

In [23]:
# Run the scraper once and keep the resulting DataFrame in `df`.
df = scrape_conditions_with_selenium()

### Print the data in JSON format for one URL

In [24]:
# Fetch one guide page and pretty-print the first JSON-LD block it contains.

# URL to fetch HTML from
url = "https://www.everydayhealth.com/abdominal-pain/guide/"

# Set headers to mimic a real browser request
headers = {"User-Agent": "Mozilla/5.0"}

# Send request to fetch HTML content.
# requests has no default timeout, so set one to keep a stalled connection
# from hanging the cell indefinitely.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the first <script> tag containing JSON-LD data
    script_tag = soup.find("script", type="application/ld+json")

    # .string is None when the tag has no single text child, so treat that
    # the same as a missing tag instead of passing None to json.loads.
    json_data = script_tag.string if script_tag else None

    if json_data:
        try:
            # Convert JSON string to Python dictionary
            parsed_json = json.loads(json_data)
        except json.JSONDecodeError as exc:
            print(f"❌ JSON-LD data could not be parsed: {exc}")
        else:
            # Pretty-print JSON
            print(json.dumps(parsed_json, indent=4, ensure_ascii=False))

    else:
        print("❌ No JSON-LD data found on the page.")
else:
    print(f"❌ Failed to fetch the webpage. Status Code: {response.status_code}")


{
    "@context": "https://schema.org",
    "@graph": [
        {
            "@type": "MedicalCondition",
            "SignOrSymptom": {
                "Name": "Signs and Symptoms of Abdominal Pain",
                "Description": "Abdominal pain may take several different forms. In addition to how severe it is, abdominal pain can be described in the following ways:Generalized Pain This refers to pain felt in more than half of your abdominal area, and is typical of stomach viruses, indigestion, or gas as the cause of your pain.Localized Pain This refers to pain felt in just one area of your abdomen, and is typical of a problem with an organ like your stomach, appendix, or gallbladder as the cause of your pain.Cramping This type of pain come and goes, or changes in its severity or perceived position in your abdomen. Cramping is rarely serious and is typical of gas, passing a stool, or menstruation as the cause of your pain.Colicky Pain Like cramping, this type of pain comes and goes, 

### Iterate through each URL in the list of URLs and save the results to a CSV file.

Columns included: Condition, Symptoms, Diagnosis, Prognosis, Causes, Treatment, Research and Statistics, Complications, FAQ.

Stores 'Not Available' in any column for which a URL's page provides no content.

In [25]:

# Seconds to wait for each HTTP response; requests has no default timeout.
REQUEST_TIMEOUT = 30

# Placeholder stored in every field the page's JSON-LD does not provide.
NOT_AVAILABLE = "Not Available"

# Output column order; also used as the CSV header when no row was extracted.
OUTPUT_COLUMNS = [
    "Condition",
    "Symptoms",
    "Diagnosis",
    "Prognosis",
    "Causes",
    "Treatment",
    "Research and Statistics",
    "Complications",
    "FAQ",
]


def _load_json_ld(html):
    """Return the parsed first JSON-LD <script> of *html*, or {} if there is none.

    Raises json.JSONDecodeError (a ValueError) when the script text is not
    valid JSON.
    """
    script_tag = BeautifulSoup(html, "html.parser").find("script", type="application/ld+json")
    # .string is None when the tag has no single text child.
    if script_tag is None or not script_tag.string:
        return {}
    return json.loads(script_tag.string)


def _description_of(node):
    """Return node["Description"] when *node* is a dict, else "Not Available"."""
    if isinstance(node, dict):
        return node.get("Description", NOT_AVAILABLE)
    return NOT_AVAILABLE


def _format_faqs(main_entity):
    """Join FAQ entries as "Q: ... A: ..." separated by " | "."""
    if not isinstance(main_entity, list):
        return NOT_AVAILABLE
    faqs = []
    for entry in main_entity:
        if not isinstance(entry, dict):
            continue
        question = entry.get("Name", "Unknown Question")
        accepted = entry.get("AcceptedAnswer", {})
        answer = (
            accepted.get("Text", "No answer available")
            if isinstance(accepted, dict)
            else "No answer available"
        )
        faqs.append(f"Q: {question} A: {answer}")
    return " | ".join(faqs) if faqs else NOT_AVAILABLE


def _extract_condition_fields(condition, json_data):
    """Build one output row for *condition* from the parsed JSON-LD *json_data*.

    Reads the "@graph" list and copies fields from items whose "@type" is
    "MedicalCondition" or "FAQPage". Anything missing, or not shaped as
    expected, is left as "Not Available" instead of raising.
    """
    extracted = dict.fromkeys(OUTPUT_COLUMNS, NOT_AVAILABLE)
    extracted["Condition"] = condition

    graph = json_data.get("@graph", []) if isinstance(json_data, dict) else []
    if not isinstance(graph, list):
        graph = []

    for item in graph:
        if not isinstance(item, dict):
            continue
        item_type = item.get("@type")

        if item_type == "MedicalCondition":
            extracted["Symptoms"] = _description_of(item.get("SignOrSymptom", {}))
            extracted["Diagnosis"] = _description_of(item.get("TypicalTest", {}))
            extracted["Prognosis"] = item.get("ExpectedPrognosis", NOT_AVAILABLE)
            extracted["Causes"] = _description_of(item.get("RiskFactor", {}))
            extracted["Treatment"] = _description_of(item.get("PossibleTreatment", {}))
            extracted["Research and Statistics"] = item.get("Epidemiology", NOT_AVAILABLE)
            extracted["Complications"] = item.get("PossibleComplication", NOT_AVAILABLE)

        elif item_type == "FAQPage":
            extracted["FAQ"] = _format_faqs(item.get("MainEntity", []))

    return extracted


# Visit every scraped URL and collect one row of structured data per page.
data_list = []
for condition, url in zip(df["Condition Name"], df["URL"]):
    print(f"Processing: {condition} - {url}")

    try:
        # Fetch the webpage content
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()  # Raise an error for bad responses

        # Extract the JSON-LD script containing structured medical data
        json_data = _load_json_ld(response.text)
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and invalid JSON skip this URL only;
        # json.JSONDecodeError is a ValueError subclass.
        print(f"Error processing {url}: {e}")
        continue

    # Append extracted data to the list
    data_list.append(_extract_condition_fields(condition, json_data))

# Convert the extracted data into a DataFrame; explicit columns keep the CSV
# header intact even when no page could be processed.
output_df = pd.DataFrame(data_list, columns=OUTPUT_COLUMNS)

# Save the extracted data to CSV
output_csv = "conditions-everyday-health-com.csv"
output_df.to_csv(output_csv, index=False)

print(f"Extraction complete! Data saved to {output_csv}")


Processing: Health Conditions A-Z - https://www.everydayhealth.com/conditions/
Processing: Wellness & Self-Care - https://www.everydayhealth.com/wellness/
Error processing https://www.everydayhealth.com/wellness/: ('Connection aborted.', HTTPException('got more than 100 headers'))
Processing: News - https://www.everydayhealth.com/news/
Processing: Product Reviews - https://www.everydayhealth.com/product-reviews/
Processing: Tools & Resources - https://www.everydayhealth.com/tools-resources/
Processing: About Us - https://www.everydayhealth.com/about-us/
Processing:  - https://www.everydayhealth.com/conditions/
Processing:  - https://www.everydayhealth.com/breast-cancer/
Processing:  - https://www.everydayhealth.com/cold-flu/
Processing:  - https://www.everydayhealth.com/crohns-disease/
Processing:  - https://www.everydayhealth.com/depression/
Processing:  - https://www.everydayhealth.com/eczema/
Processing:  - https://www.everydayhealth.com/hypertension/
Processing:  - https://www.ever