<a href="https://colab.research.google.com/github/sudenurcure/WebScrapping/blob/main/Test_Information_Scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Prefix of the alphabetical catalog index pages; main() appends a single
# letter (A-Z) to build each index URL.
BASE_URL = "https://www.mayocliniclabs.com/test-catalog/alphabetical/"
# Site root used by parse_test_links() to turn the hrefs found on an index
# page into absolute URLs.
TEST_BASE_URL = "https://www.mayocliniclabs.com"
# Request headers sent by fetch_html() on every request. The User-Agent
# imitates a desktop Chrome browser.
# NOTE(review): presumably needed because the site rejects the default
# python-requests User-Agent -- confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

def fetch_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (connection error, timeout, or a 4xx/5xx status). Failures are
        reported on stdout rather than raised so the caller can keep going.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the same
        # error path as network failures.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.text

def parse_test_links(page_html):
    """Extract the test-page URLs listed on one alphabetical index page.

    Args:
        page_html: HTML source of an index page.

    Returns:
        A list of absolute URLs, one per ``<a href=...>`` found inside the
        ``<div class="rochester-section">`` element, in document order.
        Returns an empty list when that element is not present.
    """
    soup = BeautifulSoup(page_html, "html.parser")
    section = soup.find("div", class_="rochester-section")
    if section is None:
        return []

    # urljoin resolves the href against the site root: root-relative paths
    # give the same result as plain concatenation, while hrefs that are
    # already absolute or lack a leading slash no longer produce a
    # malformed URL.
    return [
        urljoin(TEST_BASE_URL, anchor["href"])
        for anchor in section.find_all("a", href=True)
    ]

def parse_test_details(test_html):
    """Extract the test name and per-tab text from a test detail page.

    Args:
        test_html: HTML source of a single test page.

    Returns:
        A dict that may contain the keys ``"Test Name"``, ``"Overview"``,
        ``"Specimen"``, ``"Clinical-and-Interpretive"`` and
        ``"Performance"``. A key is present only when the corresponding
        element was found in the page, so the dict can be partial or empty.
    """
    soup = BeautifulSoup(test_html, "html.parser")
    test_details = {}

    # The test name is the <h2> inside <div class="test-info">.
    test_info = soup.find("div", class_="test-info")
    if test_info and test_info.h2:
        test_details["Test Name"] = test_info.h2.get_text(strip=True)

    # Each tab's content lives in <div id="tabcontent-<tab>">.
    tabs = ["Overview", "Specimen", "Clinical-and-Interpretive", "Performance"]
    for tab in tabs:
        tab_content = soup.find("div", id=f"tabcontent-{tab}")
        if tab_content:
            # Join the text fragments with a space: with strip=True and the
            # default empty separator, text from adjacent elements (e.g. a
            # heading followed by a paragraph) is glued together with no
            # whitespace between them.
            test_details[tab] = tab_content.get_text(" ", strip=True)

    return test_details

def save_to_csv(data, filename="tests_data.csv"):
    """Write a list of row dicts to *filename* as UTF-8 CSV.

    Args:
        data: List of dicts, one per row. Rows may have different key sets.
        filename: Destination path; an existing file is overwritten.

    The header is the union of all keys across all rows, in first-seen
    order. Rows missing a column get an empty cell. Taking the header from
    the first row alone would make ``csv.DictWriter`` raise ``ValueError``
    as soon as a later row carried a key the first row lacked.
    With empty *data* the file is still created, containing only an empty
    header line.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def main(crawl_delay=5):
    """Crawl the A-Z catalog index, scrape every listed test, and save a CSV.

    For each letter the index page is fetched, the test links on it are
    collected, and each test page is fetched and parsed. All parsed records
    are written to ``tests_data.csv`` once the crawl finishes.

    Args:
        crawl_delay: Seconds to pause after every HTTP request.
    """
    test_data = []
    for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        url = f"{BASE_URL}{letter}"
        print(f"Fetching page: {url}")
        page_html = fetch_html(url)
        # Pause after index pages too, so every request is spaced out.
        time.sleep(crawl_delay)
        if not page_html:
            continue

        test_links = parse_test_links(page_html)
        print(f"Found {len(test_links)} tests on {letter} page.")

        for test_url in test_links:
            print(f"Fetching test: {test_url}")
            test_html = fetch_html(test_url)
            # Respect the crawl delay after failed requests as well;
            # otherwise a run of errors turns into back-to-back requests.
            time.sleep(crawl_delay)
            if not test_html:
                continue

            test_details = parse_test_details(test_html)
            # Skip pages where nothing could be extracted so the CSV does
            # not end up with blank rows.
            if test_details:
                test_data.append(test_details)

    # Save data to CSV
    save_to_csv(test_data)
    print("Data scraping completed and saved to tests_data.csv")

# Run the full crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
