In [2]:
import pandas as pd 
import requests 
from bs4 import BeautifulSoup 

In [17]:
# function to fitch the diseases name and links 
def scrape_page(url):
    response=requests.get(url)
    if response.status_code==200:
        soup=BeautifulSoup(response.text,"html.parser")
        
        #find all disease links
        disease_links=soup.find_all("div",class_="col-8 col-md-4")
        
        # Create a list to store disease names and links
        diseases = []
        for disease in disease_links:
            link=disease.find("a") #find the <a> tag
            if link:
                name=link.text.strip() #Extract disease name 
                href=link["href"] #Extract disease link
                diseases.append((name, href))  # Add to the list
        return diseases  # Return the list of diseases        
    else:
        print(f"Failed to retrieve {url}. Status code4:{response.status_code}")            
        return []  # Return an empty list if the request fails  

In [19]:
bse_url="https://africacdc.org/disease/"
page_names=scrape_page(bse_url)

print(len(page_names))
# Print the number of diseases found
print(f"Number of diseases found: {len(page_names)}")

# Print the disease names and links
for name, link in page_names:
    print(f"Disease: {name}")
    print(f"Link: {link}")
    print("-" * 50)

25
Number of diseases found: 25
Disease: Anthrax
Link: https://africacdc.org/disease/anthrax/
--------------------------------------------------
Disease: Avian Influenza
Link: https://africacdc.org/disease/avian-influenza/
--------------------------------------------------
Disease: Chikungunya
Link: https://africacdc.org/disease/chikungunya/
--------------------------------------------------
Disease: Cholera
Link: https://africacdc.org/disease/cholera/
--------------------------------------------------
Disease: COVID-19
Link: https://africacdc.org/disease/covid-19/
--------------------------------------------------
Disease: Crimean-Congo Haemorrhagic Fever
Link: https://africacdc.org/disease/crimean-congo-haemorrhagic-fever/
--------------------------------------------------
Disease: Dengue Fever
Link: https://africacdc.org/disease/dengue-fever/
--------------------------------------------------
Disease: Ebola Virus Disease
Link: https://africacdc.org/disease/ebola-virus-disease/
-----

In [5]:
# Function to scrape multiple pages
def scrape_multiple_pages(base_url,num_pages):
    for page in range(1,num_pages +1):
        page_url=f"{base_url}?page={page}"
        print(f"Scraping page {page} : {page_url}")
        scrape_page(page_url)

In [21]:
# Function to scrape a list of disease links
def get_disease_links(base_url):
    response=requests.get(base_url)
    if response.status_code==200:
        soup=BeautifulSoup(response.text,"html.parser")
        # Find all disease links 
        disease_links=soup.find_all("div",class_="col-8 col-md-4")
         # Create a list to store disease links
        list_links=[]
        for disease in disease_links:
            link=disease.find("a") #find the <a> tag
            if link:
                href=link["href"] #Extract disease link
                list_links.append(href) #add to the list 
                
        return list_links   # Return the list of links
    else:
        print(f"Failed to retrieve {base_url} . Status code:{response.status_code}")
        return [] # Return an empty list if the request fails      
                

In [22]:
base_url="https://africacdc.org/disease/"

get_disease_links(base_url=base_url)

['https://africacdc.org/disease/anthrax/',
 'https://africacdc.org/disease/avian-influenza/',
 'https://africacdc.org/disease/chikungunya/',
 'https://africacdc.org/disease/cholera/',
 'https://africacdc.org/disease/covid-19/',
 'https://africacdc.org/disease/crimean-congo-haemorrhagic-fever/',
 'https://africacdc.org/disease/dengue-fever/',
 'https://africacdc.org/disease/ebola-virus-disease/',
 'https://africacdc.org/disease/hepatitis-b-virus-hbv/',
 'https://africacdc.org/disease/hepatitis-c-virus/',
 'https://africacdc.org/disease/hepatitis-e-virus/',
 'https://africacdc.org/disease/hiv-human-immunodeficiency-virus/',
 'https://africacdc.org/disease/lassa-fever/',
 'https://africacdc.org/disease/malaria/',
 'https://africacdc.org/disease/marburg-virus-disease-mvd/',
 'https://africacdc.org/disease/measles/',
 'https://africacdc.org/disease/meningococcal-meningitis/',
 'https://africacdc.org/disease/middle-east-respiratory-syndrome/',
 'https://africacdc.org/disease/monkeypox/',
 'h

In [67]:
import requests
from bs4 import BeautifulSoup

def scrape_disease_detailed(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise error for bad responses
    except requests.RequestException as e:
        print(f"Failed to retrieve {url}. Error: {e}")
        return None  

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract disease name from URL or fallback to page title
    disease_name = url.rstrip('/').split('/')[-1].replace('-', ' ').title()

    # Default values if data is missing
    description, symptoms, treatment = "Not Available", "Not Available", "Not Available"

    # Locate the content container
    container = soup.find("div", class_="elementor-element elementor-element-424b5a88 elementor-widget elementor-widget-theme-post-content")
    if container:
        paragraphs = container.find_all("p")

        # Extract based on available paragraphs
        if len(paragraphs) > 0:
            description = paragraphs[0].text.strip()
        if len(paragraphs) > 2:
            symptoms = paragraphs[2].text.strip()
        if len(paragraphs) > 3:
            treatment = paragraphs[3].text.strip()

    return {
        "disease": disease_name,
        "description": description,
        "symptoms": symptoms,
        "treatment": treatment
    }


In [35]:
# Function to scrape data from all URLs in list_links
def scrape_all_diseases(list_links):
    
    list_diseases=[]
    
    for link in list_links:
       # print(f"Scraping {link}...")
        disease_data=scrape_disease_detailed(link)
        if disease_data:
            list_diseases.append(disease_data)
    return list_diseases        

In [66]:
# Step 1: Get the list of disease links
list_links = get_disease_links(base_url)
print(f"Found {len(list_links)} disease links.")

Found 25 disease links.


In [68]:
# Step 2: Scrape detailed information from all links
list_diseases = scrape_all_diseases(list_links)

# Print the results
for disease in list_diseases:
    print(f"Disease: {disease['disease']}")
    print(f"Description: {disease['description']}")
    print(f"Symptoms: {disease['symptoms']}")
    print(f"Treatment: {disease['treatment']}")
    print("-" * 80)

Disease: Anthrax
Description: Anthrax is a serious infectious disease caused by gram-positive, rod-shaped bacteria known as Bacillus anthracis. Anthrax can be found naturally in soil and commonly affects domestic and wild animals around the world. The spores of the bacteria can survive in the environment for years or decades, awaiting uptake by the next host. The disease still exists in animals and humans in most countries of sub-Saharan Africa and Asia, in several southern European countries, in the Americas, and certain areas of Australia. There are four types of Anthrax; Cutaneous Anthrax, Inhalation Anthrax, Gastrointestinal Anthrax and Injection Anthrax.
Symptoms: The symptoms of anthrax depend on the type of infection and can take anywhere from one day to more than two months to appear. Cutaneous Anthrax include a group of small blisters or bumps that may itch, swelling can occur around the sore, A painless skin sore (ulcer) with a black center that appears after the small bliste

In [70]:
diseases_df=pd.DataFrame(list_diseases)
diseases_df.head()

Unnamed: 0,disease,description,symptoms,treatment
0,Anthrax,Anthrax is a serious infectious disease caused...,The symptoms of anthrax depend on the type of ...,All Type of Anthrax can be treated by Antibiot...
1,Avian Influenza,Avian influenza refers to the disease caused b...,Common initial symptoms of the A(H5) and A(H7N...,Treatment of avian influenza is recommended fo...
2,Chikungunya,Chikungunya is a viral disease transmitted to ...,Prevention and control are usually done by des...,Not Available
3,Cholera,Cholera is an acute diarrhoeal infection that ...,Prevention and control can be achieved by cons...,Not Available
4,Covid 19,The 2019 novel coronavirus disease (COVID-19) ...,Not Available,Not Available


In [71]:
diseases_df.to_csv('disease_data.csv',index=False)

In [114]:
import requests
from bs4 import BeautifulSoup

# Function to scrape a list of disease links
def get_disease_links2(base_url):
    response = requests.get(base_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Find the container with the disease links
        container = soup.find('div', class_='personlist directory')
        
        # Extract all disease links
        list_links = []
        if container:
            # Find all <a> tags with the class 'azsearchlink'
            links = container.find_all('a', class_='azsearchlink')
            for link in links:
                href = link['href']  # Extract the href attribute
                list_links.append(href)
        
        return list_links  # Return the list of links
    else:
        print(f"Failed to retrieve {base_url}. Status code: {response.status_code}")
        return []  # Return an empty list if the request fails

# Base URL for the search results
base_url = "https://www.mayoclinic.org/search/search-results?q=common%20diseases&page="

# Number of pages to scrape
num_pages = 8

# List to store all links
all_links2 = []

# Loop through each page and scrape links
for page in range(1, num_pages + 1):
    url = f"{base_url}{page}"  # Construct the URL for the current page
    #print(f"Scraping page {page}: {url}")
    links = get_disease_links2(url)
    all_links2.extend(links)  # Add the links from the current page to the list

# Print the total number of links found
print(f"Total links found: {len(all_links2)}")

# Print the first 10 links as a sample
for link in all_links2[:10]:
    print(link)

Failed to retrieve https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=1. Status code: 403
Failed to retrieve https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=3. Status code: 403
Failed to retrieve https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=4. Status code: 403
Failed to retrieve https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=5. Status code: 403
Failed to retrieve https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=6. Status code: 403
Failed to retrieve https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=7. Status code: 403
Failed to retrieve https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=8. Status code: 403
Total links found: 10
https://www.mayoclinic.org/diseases-conditions
https://www.mayoclinic.org/diseases-conditions/flu/symptoms-causes/syc-20351719
https://www.mayoclinic.org/diseases-conditions/tuberculosis/

In [115]:
all_links2

['https://www.mayoclinic.org/diseases-conditions',
 'https://www.mayoclinic.org/diseases-conditions/flu/symptoms-causes/syc-20351719',
 'https://www.mayoclinic.org/diseases-conditions/tuberculosis/symptoms-causes/syc-20351250',
 'https://www.mayoclinic.org/diseases-conditions/multiple-sclerosis/symptoms-causes/syc-20350269',
 'https://www.mayoclinic.org/diseases-conditions/heart-failure/symptoms-causes/syc-20373142',
 'https://www.mayoclinic.org/diseases-conditions/meningitis/symptoms-causes/syc-20350508',
 'https://www.mayoclinic.org/diseases-conditions/sexually-transmitted-diseases-stds/symptoms-causes/syc-20351240',
 'https://www.mayoclinic.org/diseases-conditions/copd/symptoms-causes/syc-20353679',
 'https://www.mayoclinic.org/diseases-conditions/sexually-transmitted-diseases-stds/in-depth/std-symptoms/art-20047081',
 'https://www.mayoclinic.org/diseases-conditions/leukemia/symptoms-causes/syc-20374373']

In [101]:
import requests
from bs4 import BeautifulSoup

# Function to scrape detailed information from a disease page
def scrape_disease_detailed2(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract disease name (from the URL or page title)
            disease_name = url.split('/')[-2].replace('-', ' ').title()
            
            # Extract description, symptoms, and treatment
            description = ""
            symptoms = ""
            treatment = ""
            
            # Find the container with the disease details
            container = soup.find('div', class_='elementor-widget-container')
            if container:
                paragraphs = container.find_all('p')
                text = " ".join([p.text.strip() for p in paragraphs])
                
                # Extract description (first paragraph)
                description = paragraphs[0].text.strip() if len(paragraphs) > 0 else ""
                
                # Extract symptoms (look for keywords like "symptoms" or "symptom")
                if "symptoms" in text.lower():
                    symptoms = text.split("symptoms")[1].split(".")[0].strip()
                
                # Extract treatment (look for keywords like "treatment" or "treat")
                if "treatment" in text.lower():
                    treatment = text.split("treatment")[1].split(".")[0].strip()
            
            # Return the structured data
            return {
                "disease": disease_name,
                "description": description,
                "symptoms": symptoms,
                "treatment": treatment
            }
        else:
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

# Function to scrape data from all URLs in a list
def scrape_all_diseases2(url_list):
    list_diseases = []
    for url in url_list:
        print(f"Scraping {url}...")
        disease_data = scrape_disease_detailed2(url)
        if disease_data:
            list_diseases.append(disease_data)
    return list_diseases

# Example list of disease URLs
all_links2 = [
    "https://www.mayoclinic.org/diseases-conditions/infectious-diseases/symptoms-causes/syc-20351173",
    "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
    "https://www.mayoclinic.org/diseases-conditions/heart-disease/symptoms-causes/syc-20353118",
    # Add more URLs as needed
]

# Scrape detailed information from all URLs
list_diseases2 = scrape_all_diseases2(all_links2)

# Print the results
if list_diseases2:
    for disease in list_diseases2:
        print(f"Disease: {disease['disease']}")
        print(f"Description: {disease['description']}")
        print(f"Symptoms: {disease['symptoms']}")
        print(f"Treatment: {disease['treatment']}")
        print("-" * 80)
else:
    print("No data found.")

Scraping https://www.mayoclinic.org/diseases-conditions/infectious-diseases/symptoms-causes/syc-20351173...
Scraping https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963...
Scraping https://www.mayoclinic.org/diseases-conditions/heart-disease/symptoms-causes/syc-20353118...
Disease: Symptoms Causes
Description: 
Symptoms: 
Treatment: 
--------------------------------------------------------------------------------
Disease: Symptoms Causes
Description: 
Symptoms: 
Treatment: 
--------------------------------------------------------------------------------
Disease: Symptoms Causes
Description: 
Symptoms: 
Treatment: 
--------------------------------------------------------------------------------


In [103]:
import requests
from bs4 import BeautifulSoup

def scrape_disease_data(urls):
    diseases_data = []

    for url in urls:
        try:
            # Fetch the webpage content
            response = requests.get(url)
            response.raise_for_status()  # Raise an error for bad status codes

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the disease name (assuming it's in the <h1> tag)
            disease_name = soup.find('h1').text.strip()

            # Extract sections (Overview, Symptoms, Causes, Diagnosis, Treatment)
            sections = {}
            for section_name in ['Overview', 'Symptoms', 'Causes', 'Diagnosis', 'Treatment']:
                section = soup.find('h2', text=section_name)
                if section:
                    section_content = []
                    next_element = section.find_next_sibling()
                    while next_element and next_element.name not in ['h2', 'h3']:
                        if next_element.name == 'p':
                            section_content.append(next_element.text.strip())
                        next_element = next_element.find_next_sibling()
                    sections[section_name] = ' '.join(section_content)
                else:
                    sections[section_name] = None

            # Add the disease data to the list
            diseases_data.append({
                'disease': disease_name,
                'Overview': sections.get('Overview'),
                'Symptoms': sections.get('Symptoms'),
                'Causes': sections.get('Causes'),
                'Diagnosis': sections.get('Diagnosis'),
                'Treatment': sections.get('Treatment')
            })

        except Exception as e:
            print(f"Error scraping {url}: {e}")

    return diseases_data




In [116]:
all_links2[:10]

['https://www.mayoclinic.org/diseases-conditions',
 'https://www.mayoclinic.org/diseases-conditions/flu/symptoms-causes/syc-20351719',
 'https://www.mayoclinic.org/diseases-conditions/tuberculosis/symptoms-causes/syc-20351250',
 'https://www.mayoclinic.org/diseases-conditions/multiple-sclerosis/symptoms-causes/syc-20350269',
 'https://www.mayoclinic.org/diseases-conditions/heart-failure/symptoms-causes/syc-20373142',
 'https://www.mayoclinic.org/diseases-conditions/meningitis/symptoms-causes/syc-20350508',
 'https://www.mayoclinic.org/diseases-conditions/sexually-transmitted-diseases-stds/symptoms-causes/syc-20351240',
 'https://www.mayoclinic.org/diseases-conditions/copd/symptoms-causes/syc-20353679',
 'https://www.mayoclinic.org/diseases-conditions/sexually-transmitted-diseases-stds/in-depth/std-symptoms/art-20047081',
 'https://www.mayoclinic.org/diseases-conditions/leukemia/symptoms-causes/syc-20374373']

In [117]:



diseases = scrape_disease_data(all_links2)
for disease in diseases:
    print(disease)

  section = soup.find('h2', text=section_name)


{'disease': 'Diseases & Conditions', 'Overview': None, 'Symptoms': None, 'Causes': None, 'Diagnosis': None, 'Treatment': None}
{'disease': 'Influenza (flu)', 'Overview': 'Flu, also called influenza, is an infection of the nose, throat and lungs, which are part of the respiratory system. The flu is caused by a virus. Influenza viruses are different from the "stomach flu" viruses that cause diarrhea and vomiting. Most people with the flu get better on their own. But sometimes, influenza and its complications can be deadly. To help protect against seasonal flu, you can get an annual flu shot. Although the vaccine isn\'t 100% effective, it lowers the chances of having severe complications from the flu. This is especially true for people who are at high risk of flu complications. Aside from the vaccine, you can take other steps to help prevent infection with the flu. You can clean and disinfect surfaces, wash hands, and keep the air around you moving.', 'Symptoms': 'The viruses that cause f

In [120]:
diseases2_data=pd.DataFrame(diseases)
diseases2_data

Unnamed: 0,disease,Overview,Symptoms,Causes,Diagnosis,Treatment
0,Diseases & Conditions,,,,,
1,Influenza (flu),"Flu, also called influenza, is an infection of...",The viruses that cause flu spread at high leve...,Influenza is caused by viruses. These viruses ...,,
2,Tuberculosis,,,,,
3,Multiple sclerosis,Multiple sclerosis is a disease that causes br...,Multiple sclerosis symptoms vary depending on ...,The cause of multiple sclerosis is not known. ...,,
4,Heart failure,Heart failure occurs when the heart muscle doe...,"If you have heart failure, your heart can't su...","Heart failure can be caused by a weakened, dam...",,
5,Meningitis,"Meningitis is an infection and swelling, calle...",Early meningitis symptoms may be like those of...,Viral infections are the most common cause of ...,,
6,Sexually transmitted diseases (STDs),Sexually transmitted diseases (STDs) are cause...,"STDs can have a range of symptoms, including n...",Sexually transmitted infections can be caused by:,,
7,COPD,Chronic obstructive pulmonary disease (COPD) i...,COPD symptoms often don't appear until a lot o...,The main cause of COPD in developed countries ...,,
8,Sexually transmitted disease (STD) symptoms,,,,,
9,Leukemia,Leukemia is cancer of the body's blood-forming...,"Leukemia symptoms vary, depending on the type ...",Scientists don't understand the exact causes o...,,


In [None]:
# List to store all links
all_links_pag_1 = []

# Loop through each page and scrape links
for page in range(1, num_pages + 1):
    url = f"{base_url}{page}"  # Construct the URL for the current page
    #print(f"Scraping page {page}: {url}")
    links = get_disease_links2(url)
    all_links_pag_1.extend(links)  # Add the links from the current page to the list

In [122]:
dis_page1=scrape_disease_data(list_links_10_1)
dis_page1

  section = soup.find('h2', text=section_name)


Error scraping https://www.mayoclinic.org/diseases-conditions/coronavirus/in-depth/coronavirus-long-term-effects/art-20490351: 403 Client Error: Forbidden for url: https://www.mayoclinic.org/diseases-conditions/coronavirus/in-depth/coronavirus-long-term-effects/art-20490351


[{'disease': 'Infectious diseases',
  'Overview': '',
  'Symptoms': '',
  'Causes': '',
  'Diagnosis': None,
  'Treatment': None},
 {'disease': 'Coronavirus disease 2019 (COVID-19)',
  'Overview': 'COVID-19, also called coronavirus disease 2019, is an illness caused by a virus. The virus is called severe acute respiratory syndrome coronavirus 2, or more commonly, SARS-CoV-2. It started spreading at the end of 2019 and became a pandemic disease in 2020. The virus that causes COVID-19 spreads most commonly through the air in tiny droplets of fluid between people in close contact. Many people with COVID-19 have no symptoms or mild illness. But for older adults and people with certain medical conditions, COVID-19 can lead to the need for care in the hospital or death. Staying up to date on your COVID-19 vaccine helps prevent serious illness, the need for hospital care due to COVID-19 and death from COVID-19. Other ways that may help prevent the spread of this coronavirus includes good indo

In [123]:
diseases3_data=pd.DataFrame(dis_page1)
diseases3_data

Unnamed: 0,disease,Overview,Symptoms,Causes,Diagnosis,Treatment
0,Infectious diseases,,,,,
1,Coronavirus disease 2019 (COVID-19),"COVID-19, also called coronavirus disease 2019...",Typical COVID-19 symptoms often show up 2 to 1...,COVID-19 is caused by infection with the sever...,,
2,Heart disease,Heart disease describes a range of conditions ...,Heart disease symptoms depend on the type of h...,Heart disease causes depend on the specific ty...,,
3,COVID-19: Who's at higher risk of serious symp...,,,,,
4,Obesity,Obesity is a complex disease involving having ...,"Body mass index, known as BMI, is often used t...","Although there are genetic, behavioral, metabo...",,
5,Alzheimer's disease,Alzheimer's disease is the most common cause o...,Memory loss is the key symptom of Alzheimer's ...,The exact causes of Alzheimer's disease aren't...,,
6,Germs: Understand and protect against bacteria...,,,,,
7,Common cold,The common cold is an illness affecting your n...,"Most often, common cold symptoms start 1 to 3 ...",Many viruses can cause a common cold. Rhinovir...,,
8,Crohn's disease,Crohn's disease is a type of inflammatory bowe...,Symptoms of Crohn's disease typically include:...,The exact cause of Crohn's disease remains unk...,,


In [139]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def scrape_disease_links(base_url):
    disease_links = []
    current_url = base_url

    # Set headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Use a session with retries
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.headers.update(headers)

    while current_url:
        try:
            # Fetch the search results page
            print(f"Fetching search results page: {current_url}")
            response = session.get(current_url, timeout=10)  # Add a timeout
            response.raise_for_status()  # Raise an error for bad status codes

            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract links to individual disease pages
            links = soup.select('a[href*="/diseases-conditions/"]')
            if not links:
                print(f"No disease links found on {current_url}. Skipping this page.")
                break  # Exit if no disease links are found

            # Add links to the list
            for link in links:
                disease_url = urljoin(current_url, link['href'])
                if disease_url not in disease_links:  # Avoid duplicates
                    disease_links.append(disease_url)

            # Find the next page link
            next_page_link = soup.find('a', id='pagination-next')
            if next_page_link and 'href' in next_page_link.attrs:
                # Construct the full URL for the next page
                next_page_url = urljoin(current_url, next_page_link['href'])
                if next_page_url != current_url:  # Ensure it's a new page
                    current_url = next_page_url
                    print(f"Next page found: {current_url}")
                    time.sleep(5)  # Add a delay to avoid being blocked
                else:
                    print("No more pages found.")
                    current_url = None  # No more pages
            else:
                print("No more pages found.")
                current_url = None  # No more pages

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {current_url}: {e}")
            time.sleep(10)  # Wait longer before retrying
        except Exception as e:
            print(f"Unexpected error fetching {current_url}: {e}")
            break

    return disease_links



In [142]:
# Function 2: Scrape disease data from individual disease pages
def scrape_disease_data(disease_urls):
    diseases_data = []

    # Set headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Use a session with retries
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('http://', HTTPAdapter(max_retries=retries))
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.headers.update(headers)

    for disease_url in disease_urls:
        try:
            #print(f"Fetching disease page: {disease_url}")
            # Fetch the disease page
            disease_response = session.get(disease_url, timeout=10)  # Add a timeout
            disease_response.raise_for_status()

            # Parse the disease page content
            disease_soup = BeautifulSoup(disease_response.content, 'html.parser')

            # Extract the disease name (assuming it's in the <h1> tag)
            disease_name_tag = disease_soup.find('h1')
            if disease_name_tag:
                disease_name = disease_name_tag.text.strip()
            else:
                print(f"No disease name found on {disease_url}. Skipping this page.")
                continue  # Skip this page if no disease name is found

            # Extract sections (Overview, Symptoms, Causes, Diagnosis, Treatment)
            sections = {}
            for section_name in ['Overview', 'Symptoms', 'Causes', 'Diagnosis', 'Treatment']:
                section = disease_soup.find('h2', string=section_name)  # Use 'string' instead of 'text'
                if section:
                    section_content = []
                    next_element = section.find_next_sibling()
                    while next_element and next_element.name not in ['h2', 'h3']:
                        if next_element.name == 'p':
                            section_content.append(next_element.text.strip())
                        next_element = next_element.find_next_sibling()
                    sections[section_name] = ' '.join(section_content)
                else:
                    sections[section_name] = None

            # Add the disease data to the list
            diseases_data.append({
                'disease': disease_name,
                'Overview': sections.get('Overview'),
                'Symptoms': sections.get('Symptoms'),
                'Causes': sections.get('Causes'),
                
            })

            time.sleep(5)  # Add a delay to avoid being blocked

        except requests.exceptions.RequestException as e:
            print(f"Error scraping {disease_url}: {e}")
            time.sleep(10)  # Wait longer before retrying
        except Exception as e:
            print(f"Unexpected error scraping {disease_url}: {e}")

    return diseases_data

In [140]:

# Example usage
base_url = "https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=1"

# Step 1: Scrape links to disease pages
disease_links = scrape_disease_links(base_url)
print(f"Found {len(disease_links)} disease links.")


Fetching search results page: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=1
Next page found: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=2
Fetching search results page: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=2
Next page found: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=3
Fetching search results page: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=3
Next page found: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=4
Fetching search results page: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=4
Next page found: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=5
Fetching search results page: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=5
Next page found: https://www.mayoclinic.org/search/search-results?q=common%20diseases&page=6
Fetch

In [143]:

# Step 2: Scrape data from disease pages
diseases_all_2 = scrape_disease_data(disease_links)

# Print the scraped data
for disease in diseases_all_2:
    print(disease)

KeyboardInterrupt: 

In [149]:
lik_280=disease_links[:280]

In [150]:
# Step 2: Scrape data from disease pages
diseases_all_280 = scrape_disease_data(lik_280)

# Print the scraped data
for disease in diseases_all_280:
    print(disease)

{'disease': 'Infectious diseases', 'Overview': '', 'Symptoms': '', 'Causes': ''}
{'disease': 'Coronavirus disease 2019 (COVID-19)', 'Overview': 'COVID-19, also called coronavirus disease 2019, is an illness caused by a virus. The virus is called severe acute respiratory syndrome coronavirus 2, or more commonly, SARS-CoV-2. It started spreading at the end of 2019 and became a pandemic disease in 2020. The virus that causes COVID-19 spreads most commonly through the air in tiny droplets of fluid between people in close contact. Many people with COVID-19 have no symptoms or mild illness. But for older adults and people with certain medical conditions, COVID-19 can lead to the need for care in the hospital or death. Staying up to date on your COVID-19 vaccine helps prevent serious illness, the need for hospital care due to COVID-19 and death from COVID-19. Other ways that may help prevent the spread of this coronavirus includes good indoor air flow, physical distancing, wearing a mask in t

In [151]:
diseases_all_280=pd.DataFrame(diseases_all_280)

diseases_all_280.tail(10)

Unnamed: 0,disease,Overview,Symptoms,Causes
0,Infectious diseases,,,
1,Coronavirus disease 2019 (COVID-19),"COVID-19, also called coronavirus disease 2019...",Typical COVID-19 symptoms often show up 2 to 1...,COVID-19 is caused by infection with the sever...
2,Heart disease,Heart disease describes a range of conditions ...,Heart disease symptoms depend on the type of h...,Heart disease causes depend on the specific ty...
3,COVID-19: Who's at higher risk of serious symp...,,,
4,Obesity,Obesity is a complex disease involving having ...,"Body mass index, known as BMI, is often used t...","Although there are genetic, behavioral, metabo..."
5,Alzheimer's disease,Alzheimer's disease is the most common cause o...,Memory loss is the key symptom of Alzheimer's ...,The exact causes of Alzheimer's disease aren't...
6,Germs: Understand and protect against bacteria...,,,
7,Long COVID: Lasting effects of COVID-19,,,
8,Common cold,The common cold is an illness affecting your n...,"Most often, common cold symptoms start 1 to 3 ...",Many viruses can cause a common cold. Rhinovir...
9,Crohn's disease,Crohn's disease is a type of inflammatory bowe...,Symptoms of Crohn's disease typically include:...,The exact cause of Crohn's disease remains unk...


: 

In [154]:
lik_280_2=disease_links[280:560]
# Step 2: Scrape data from disease pages
diseases_all_280_2 = scrape_disease_data(lik_280_2)

# Print the scraped data
for disease in diseases_all_280_2:
    print(disease)

{'disease': 'Roseola', 'Overview': '', 'Symptoms': '', 'Causes': ''}
{'disease': 'Juvenile idiopathic arthritis', 'Overview': 'Juvenile idiopathic arthritis, formerly known as juvenile rheumatoid arthritis, is the most common type of arthritis in children under the age of 16. Juvenile idiopathic arthritis can cause persistent joint pain, swelling and stiffness. Some children may experience symptoms for only a few months, while others have symptoms for many years. Some types of juvenile idiopathic arthritis can cause serious complications, such as growth problems, joint damage and eye inflammation. Treatment focuses on controlling pain and inflammation, improving function, and preventing damage.', 'Symptoms': 'The most common signs and symptoms of juvenile idiopathic arthritis are: Juvenile idiopathic arthritis can affect one joint or many. There are several different subtypes of juvenile idiopathic arthritis, but the main ones are systemic, oligoarticular and polyarticular. Which type 

In [156]:
diseases_all_280_2=pd.DataFrame(diseases_all_280_2)
diseases_all_280_2.head(10)

Unnamed: 0,disease,Overview,Symptoms,Causes
0,Roseola,,,
1,Juvenile idiopathic arthritis,"Juvenile idiopathic arthritis, formerly known ...",The most common signs and symptoms of juvenile...,Juvenile idiopathic arthritis occurs when the ...
2,Polycythemia vera,,,
3,Reactive arthritis,,,
4,Common cold in babies,,,
5,Polycystic kidney disease,Polycystic kidney disease (PKD) is a condition...,Polycystic kidney disease symptoms can include:,Gene changes cause polycystic kidney disease. ...
6,Cerebral palsy,Cerebral palsy is a group of conditions that a...,Symptoms of cerebral palsy can vary greatly. I...,Cerebral palsy is caused by irregular brain de...
7,Vascular dementia,,,
8,Rubella,,,
9,Buerger disease,Buerger disease is a rare disease of the arter...,Buerger disease symptoms include:,The exact cause of Buerger disease is unknown....


In [157]:
lik_280_3=disease_links[560:840]
# Step 2: Scrape data from disease pages
diseases_all_280_3 = scrape_disease_data(lik_280_3)

# Print the scraped data
for disease in diseases_all_280_3:
    print(disease)

Error scraping https://www.mayo.edu/research/clinical-trials/diseases-conditions/common-variable-immunodeficiency/: 403 Client Error: Forbidden for url: https://www.mayo.edu/research/clinical-trials/diseases-conditions/common-variable-immunodeficiency/
{'disease': 'Toxoplasmosis', 'Overview': "Toxoplasmosis (tok-so-plaz-MOE-sis) is an infection with a parasite called Toxoplasma gondii. People often get the infection from eating undercooked meat. You can also get it from contact with cat feces. The parasite can pass to a baby during pregnancy. Most people infected with the parasite do not have symptoms. Some people get flu-like symptoms. Serious disease most often affects infants and people with weakened immune systems. Toxoplasmosis during pregnancy may cause miscarriage and birth defects. Most infections don't need treatment. Drug treatment is used for people with more-serious cases, pregnant people, newborns and people with weakened immune systems. Several steps to prevent toxoplasmo

In [158]:
diseases_all_280_3=pd.DataFrame(diseases_all_280_3)
diseases_all_280_3.head(10)

Unnamed: 0,disease,Overview,Symptoms,Causes
0,Toxoplasmosis,Toxoplasmosis (tok-so-plaz-MOE-sis) is an infe...,Most people infected with toxoplasmosis do not...,Toxoplasma gondii is a parasite that can infec...
1,Type 2 diabetes,,,
2,Suicide and suicidal thoughts,"Suicide, taking your own life, is a tragic rea...",Suicide warning signs or suicidal thoughts inc...,Suicidal thoughts have many causes. Most often...
3,Factor V Leiden,Factor V Leiden (FAK-tur five LIDE-n) is a mut...,The factor V Leiden mutation does not itself c...,"If you have factor V Leiden, you inherited eit..."
4,Cleft lip and cleft palate,Cleft lip and cleft palate are openings or spl...,"Usually, a split (cleft) in the lip or roof of...",Cleft lip and cleft palate occur when tissues ...
5,Epididymitis,Epididymitis (ep-ih-did-uh-MY-tis) is an infla...,Symptoms of epididymitis might include:,Causes of epididymitis include:
6,Raynaud's disease,,,
7,Urinary incontinence,Urinary incontinence — the loss of bladder con...,"Many people experience occasional, minor leaks...",Urinary incontinence can be caused by everyday...
8,Rosacea,Rosacea (roe-ZAY-she-uh) is a common skin cond...,Symptoms of rosacea include:,The cause of rosacea is not known. It could be...
9,Cyclic vomiting syndrome,,,


In [159]:
lik_280_4=disease_links[840:1120]
# Step 2: Scrape data from disease pages
diseases_all_280_4 = scrape_disease_data(lik_280_4)

# Print the scraped data
for disease in diseases_all_280_4:
    print(disease)

Error scraping https://www.mayo.edu/research/clinical-trials/diseases-conditions/immune-deficiencies/: 403 Client Error: Forbidden for url: https://www.mayo.edu/research/clinical-trials/diseases-conditions/immune-deficiencies/
{'disease': 'Nonallergic rhinitis', 'Overview': '', 'Symptoms': '', 'Causes': ''}
{'disease': 'Glioblastoma', 'Overview': "Glioblastoma is a type of cancer that starts as a growth of cells in the brain or spinal cord. It grows quickly and can invade and destroy healthy tissue. Glioblastoma forms from cells called astrocytes that support nerve cells. Glioblastoma can happen at any age. But it tends to happen more often in older adults. Glioblastoma symptoms may include headaches that keep getting worse, nausea and vomiting, blurred or double vision, trouble speaking, altered sense of touch, and seizures. There also may be trouble with balance, coordination, and moving parts of the face or body. There's no cure for glioblastoma. Treatments might slow cancer growth 

In [160]:
diseases_all_280_4=pd.DataFrame(diseases_all_280_4)
diseases_all_280_4.head()

Unnamed: 0,disease,Overview,Symptoms,Causes
0,Nonallergic rhinitis,,,
1,Glioblastoma,Glioblastoma is a type of cancer that starts a...,Signs and symptoms of glioblastoma may include:,The cause of most glioblastomas isn't known. G...
2,Geographic tongue,Geographic tongue is an inflammatory but harml...,Symptoms of geographic tongue may include: Man...,"The cause of geographic tongue is not known, a..."
3,Dissociative disorders,,,
4,Aortic valve regurgitation,Aortic valve regurgitation — also called aorti...,"Most often, aortic valve regurgitation develop...",The aortic valve is one of four valves that co...


In [162]:
lik_280_5=disease_links[1120:]
# Step 2: Scrape data from disease pages
diseases_all_280_5 = scrape_disease_data(lik_280_5)

# Print the scraped data
for disease in diseases_all_280_5:
    print(disease)

Error scraping https://www.mayo.edu/research/clinical-trials/diseases-conditions/primary-immunodeficiency: 403 Client Error: Forbidden for url: https://www.mayo.edu/research/clinical-trials/diseases-conditions/primary-immunodeficiency
Error scraping https://www.mayo.edu/research/clinical-trials/diseases-conditions/vasculitis/: 403 Client Error: Forbidden for url: https://www.mayo.edu/research/clinical-trials/diseases-conditions/vasculitis/
{'disease': 'Mitral valve stenosis', 'Overview': "Mitral valve stenosis — sometimes called mitral stenosis — is a narrowing of the valve between the two left heart chambers. The narrowed valve reduces or blocks blood flow into the lower left heart chamber. The lower left heart chamber is the heart's main pumping chamber. It also is called the left ventricle. Mitral valve stenosis can make you tired and short of breath. Other symptoms may include irregular heartbeats, dizziness, chest pain or coughing up blood. Some people don't notice symptoms. Mitra

In [163]:
diseases_all_280_5=pd.DataFrame(diseases_all_280_5)
diseases_all_280_5.head()

Unnamed: 0,disease,Overview,Symptoms,Causes
0,Mitral valve stenosis,Mitral valve stenosis — sometimes called mitra...,Mitral valve stenosis usually worsens slowly. ...,To understand the causes of mitral valve disea...
1,Hip fracture,,,
2,Generalized anxiety disorder,,,
3,History of polio: Outbreaks and vaccine timeline,,,
4,Pseudotumor cerebri (idiopathic intracranial h...,,,


In [164]:
#make all data in one data frame 

len(diseases_all_280),len(diseases_all_280_2),len(diseases_all_280_3),len(diseases_all_280_4),len(diseases_all_280_5)

(280, 280, 279, 279, 279)

In [165]:
all_nlp_diseases_df=pd.concat([diseases_all_280, diseases_all_280_2, diseases_all_280_3,diseases_all_280_4,diseases_all_280_5], ignore_index=True)

In [172]:
all_nlp_diseases_df.tail(10)

Unnamed: 0,disease,Overview,Symptoms,Causes
1387,"Multiple endocrine neoplasia, type 1 (MEN 1)","Multiple endocrine neoplasia, type 1 (MEN 1) i...","Symptoms of multiple endocrine neoplasia, type...","Multiple endocrine neoplasia, type 1 (MEN 1) i..."
1388,Kyphosis,,,
1389,Hyperglycemia in diabetes,,,
1390,Spinal stenosis,,,
1391,Sepsis,,,
1392,Aplastic anemia,,,
1393,How you hear,,,
1394,Swollen knee,,,
1395,Acute lymphocytic leukemia,,,
1396,What's the difference between H1N1 flu and inf...,,,


In [173]:
all_nlp_diseases_df.to_csv('all_nlp_diseases_df.csv',index=False)

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_diagnosis_treatment(url):
    # Set headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Fetch the webpage content
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an error for bad status codes

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Initialize a dictionary to store the scraped data
    data = {
        'Diagnosis': None,
        'Treatment': None
    }

    # Scrape the Diagnosis section
    diagnosis_section = soup.find('section', {'aria-labelledby': 'diagnosis'})
    if diagnosis_section:
        diagnosis_content = []
        for element in diagnosis_section.find_all(['p', 'h3', 'ul']):
            if element.name == 'h3':
                diagnosis_content.append(f"\n**{element.text.strip()}**\n")
            elif element.name == 'p':
                diagnosis_content.append(element.text.strip())
            elif element.name == 'ul':
                diagnosis_content.append("- " + "\n- ".join([li.text.strip() for li in element.find_all('li')]))
        data['Diagnosis'] = "\n".join(diagnosis_content)

    # Scrape the Treatment section
    treatment_section = soup.find('section', {'aria-labelledby': 'treatment'})
    if treatment_section:
        treatment_content = []
        for element in treatment_section.find_all(['p', 'h3', 'ul']):
            if element.name == 'h3':
                treatment_content.append(f"\n**{element.text.strip()}**\n")
            elif element.name == 'p':
                treatment_content.append(element.text.strip())
            elif element.name == 'ul':
                treatment_content.append("- " + "\n- ".join([li.text.strip() for li in element.find_all('li')]))
        data['Treatment'] = "\n".join(treatment_content)

    return data




In [None]:
# Example usage
url = "https://www.mayoclinic.org/diseases-conditions/infectious-diseases/diagnosis-treatment/drc-20351179"
result = scrape_diagnosis_treatment(url)

# Print the results
print("Diagnosis:")
print(result['Diagnosis'])
print("\nTreatment:")
print(result['Treatment'])

In [192]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_diagnosis_treatment_urls(url_list):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    diagnosis_treatment_urls = []

    for url in url_list:
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find any link that contains "diagnosis-treatment" in its href
            link_tag = soup.find("a", href=lambda href: href and "diagnosis-treatment" in href)

            if link_tag:
                full_url = urljoin(url, link_tag["href"])
                diagnosis_treatment_urls.append(full_url)
            else:
                print(f"Diagnosis & Treatment link not found for: {url}")
                diagnosis_treatment_urls.append(None)

        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            diagnosis_treatment_urls.append(None)

    return diagnosis_treatment_urls


In [194]:

urls =disease_links[:10]
diagnosis_treatment_links = get_diagnosis_treatment_urls(urls)
print(diagnosis_treatment_links)

Diagnosis & Treatment link not found for: https://www.mayoclinic.org/diseases-conditions/coronavirus/in-depth/coronavirus-who-is-at-risk/art-20483301
Diagnosis & Treatment link not found for: https://www.mayoclinic.org/diseases-conditions/infectious-diseases/in-depth/germs/art-20045289
Diagnosis & Treatment link not found for: https://www.mayoclinic.org/diseases-conditions/coronavirus/in-depth/coronavirus-long-term-effects/art-20490351
['https://www.mayoclinic.org/diseases-conditions/infectious-diseases/diagnosis-treatment/drc-20351179', 'https://www.mayoclinic.org/diseases-conditions/coronavirus/diagnosis-treatment/drc-20479976', 'https://www.mayoclinic.org/diseases-conditions/heart-disease/diagnosis-treatment/drc-20353124', None, 'https://www.mayoclinic.org/diseases-conditions/obesity/diagnosis-treatment/drc-20375749', 'https://www.mayoclinic.org/diseases-conditions/alzheimers-disease/diagnosis-treatment/drc-20350453', None, None, 'https://www.mayoclinic.org/diseases-conditions/commo

In [245]:
urls_280_1 =disease_links[:560]

> # collecting diagnosis and treatment

In [246]:
import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_diagnosis_treatment_url(base_url):
    """Extracts the correct 'Diagnosis & Treatment' page link from the given base URL."""
    try:
        response = requests.get(base_url, headers=HEADERS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return None, str(e)

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the link that contains "Diagnosis & Treatment"
    link_element = soup.find("a", string=lambda text: text and "Diagnosis" in text)
    
    if link_element and link_element.get("href"):
        return "https://www.mayoclinic.org" + link_element["href"], None

    return None, "Diagnosis & Treatment link not found"

def scrape_diagnosis_treatment(url):
    """Scrapes the 'Diagnosis & Treatment' details from the correct page."""
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return {'Diagnosis': 'Error', 'Treatment': 'Error', 'Error': str(e)}

    soup = BeautifulSoup(response.content, 'html.parser')
    data = {'Diagnosis': 'Not Found', 'Treatment': 'Not Found'}

    def extract_section(section_title):
        """Extracts content based on section headers."""
        section_header = soup.find(lambda tag: tag.name in ["h2", "h3"] and section_title in tag.text)
        
        if not section_header:
            return "Not Found"

        content = []
        for sibling in section_header.find_next_siblings():
            if sibling.name in ["h2", "h3"]:
                break
            if sibling.name == "p":
                content.append(sibling.text.strip())
            elif sibling.name == "ul":
                content.append("- " + "\n- ".join([li.text.strip() for li in sibling.find_all('li')]))

        return "\n".join(content) if content else "Not Found"

    data['Diagnosis'] = extract_section("Diagnosis")
    data['Treatment'] = extract_section("Treatment")

    return data

def scrape_multiple_urls(base_urls):
    """Extracts the correct links and scrapes multiple pages."""
    results = {}

    for base_url in base_urls:
        if not base_url or not base_url.startswith("http"):
            results[base_url] = {'Diagnosis': 'Error', 'Treatment': 'Error', 'Error': 'Invalid URL'}
            continue

        diagnosis_treatment_url, error = get_diagnosis_treatment_url(base_url)

        if error:
            results[base_url] = {'Diagnosis': 'Error', 'Treatment': 'Error', 'Error': error}
            continue

        results[base_url] = scrape_diagnosis_treatment(diagnosis_treatment_url)

    return results

# Example usage
base_urls = [
    "https://www.mayoclinic.org/diseases-conditions/infectious-diseases/symptoms-causes/syc-20351173",
    "https://www.mayoclinic.org/diseases-conditions/heart-disease/symptoms-causes/syc-20353118",
    "https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963",
]

results_280_1 = scrape_multiple_urls(urls_280_1)

for url, result in results_280_1.items():
    print(f"URL: {url}")
    print("Diagnosis:")
    print(result['Diagnosis'])
    print("\nTreatment:")
    print(result['Treatment'])
    print("\n" + "-"*50 + "\n")


URL: https://www.mayoclinic.org/diseases-conditions/infectious-diseases/symptoms-causes/syc-20351173
Diagnosis:
Error

Treatment:
Error

--------------------------------------------------

URL: https://www.mayoclinic.org/diseases-conditions/coronavirus/symptoms-causes/syc-20479963
Diagnosis:
If you have symptoms of coronavirus disease 2019, known as COVID-19, or you've been exposed to the COVID-19 virus, contact your healthcare team. Let them know if you've had close contact with anyone diagnosed with COVID-19.
In the United States, at-home COVID-19 tests are available. Free tests can be mailed to U.S. addresses, or you can purchase tests in stores, pharmacies or online. The U.S. Food and Drug Administration, also known as the FDA, approves or authorizes the tests. On the FDA website, you can find a list of the tests that are validated and their expiration dates. You also can check with your healthcare professional before buying a test if you have any concerns.
When taking a test at ho

In [250]:

# Convert dictionary to list
results_280_1_list = [{'Diagnosis': value['Diagnosis'], 'Treatment': value['Treatment']} for key, value in results_280_1.items()]
results_280_1_list



[{'Diagnosis': 'Error', 'Treatment': 'Error'},
 {'Diagnosis': "If you have symptoms of coronavirus disease 2019, known as COVID-19, or you've been exposed to the COVID-19 virus, contact your healthcare team. Let them know if you've had close contact with anyone diagnosed with COVID-19.\nIn the United States, at-home COVID-19 tests are available. Free tests can be mailed to U.S. addresses, or you can purchase tests in stores, pharmacies or online. The U.S. Food and Drug Administration, also known as the FDA, approves or authorizes the tests. On the FDA website, you can find a list of the tests that are validated and their expiration dates. You also can check with your healthcare professional before buying a test if you have any concerns.\nWhen taking a test at home, read the directions that come with the test carefully. Follow the instructions exactly to get as accurate a result as possible.\nCOVID-19 tests also are available from healthcare professionals, some pharmacies and clinics, o

3

In [220]:
urls_280_2 =disease_links[560:1120]

In [227]:
results_280_2 = scrape_multiple_urls(urls_280_2)

for url, result in results_280_2.items():
    print(f"URL: {url}")
    print("Diagnosis:")
    print(result['Diagnosis'])
    print("\nTreatment:")
    print(result['Treatment'])
    print("\n" + "-"*50 + "\n")

URL: https://www.mayoclinic.org/diseases-conditions/toxoplasmosis/symptoms-causes/syc-20356249
Diagnosis:
A diagnosis of toxoplasmosis is based on blood tests. Laboratory tests can detect two types of antibodies. One antibody is an immune system agent that is present during a new and active infection with the parasite. The other antibody is present if you had an infection at any time in the past. Depending on the results, your health care provider may repeat a test after two weeks.
More diagnostic tests are used depending on other symptoms, your health and other factors.

Treatment:
Medication is used to treat active infections. How much and how long you take medicine depends on different factors. These include how seriously ill you are, your immune system health and where the infection is located. Your stage of pregnancy is also a factor.
Your provider may give you a combination of prescription drugs. They include:
- Pyrimethamine (Daraprim). This fights infections caused by microscop

In [252]:

# Convert dictionary to list
results_280_2_list = [{'Diagnosis': value['Diagnosis'], 'Treatment': value['Treatment']} for key, value in results_280_2.items()]
results_280_2_list[:5]

[{'Diagnosis': 'A diagnosis of toxoplasmosis is based on blood tests. Laboratory tests can detect two types of antibodies. One antibody is an immune system agent that is present during a new and active infection with the parasite. The other antibody is present if you had an infection at any time in the past. Depending on the results, your health care provider may repeat a test after two weeks.\nMore diagnostic tests are used depending on other symptoms, your health and other factors.',
  'Treatment': "Medication is used to treat active infections. How much and how long you take medicine depends on different factors. These include how seriously ill you are, your immune system health and where the infection is located. Your stage of pregnancy is also a factor.\nYour provider may give you a combination of prescription drugs. They include:\n- Pyrimethamine (Daraprim). This fights infections caused by microscopic organisms. It can block the body's use of folic acid. Other possible side effe

In [258]:
results_280_2_list_df=pd.DataFrame(results_280_2_list)

In [230]:
urls_280_3 =disease_links[1120:]

In [231]:
results_280_3 = scrape_multiple_urls(urls_280_3)

for url, result in results_280_3.items():
    print(f"URL: {url}")
    print("Diagnosis:")
    print(result['Diagnosis'])
    print("\nTreatment:")
    print(result['Treatment'])
    print("\n" + "-"*50 + "\n")

URL: https://www.mayoclinic.org/diseases-conditions/mitral-valve-stenosis/symptoms-causes/syc-20353159
Diagnosis:
To diagnosis mitral valve stenosis, your healthcare professional examines you and asks questions about your symptoms and medical history. You also may be asked about your family's medical history.
The healthcare professional listens to your heart and lungs with a device called a stethoscope. Mitral valve stenosis often causes an irregular heart sound due to the narrowed opening. This sound is called a heart murmur. Mitral valve stenosis also can cause fluid buildup in the lungs.
If you have symptoms of mitral valve stenosis, tests are done to examine the heart.

Treatment:
Treatment for mitral valve stenosis may include:
- Medicine.
- Valve repair or replacement surgery.
- Open-heart surgery.
If you have mild to moderate mitral valve stenosis with no symptoms, you might not need immediate treatment. Instead, you need regular health checkups to see if your condition gets wor

In [253]:
# Convert dictionary to list
results_280_3_list = [{'Diagnosis': value['Diagnosis'], 'Treatment': value['Treatment']} for key, value in results_280_3.items()]
results_280_3_list[:5]

[{'Diagnosis': "To diagnosis mitral valve stenosis, your healthcare professional examines you and asks questions about your symptoms and medical history. You also may be asked about your family's medical history.\nThe healthcare professional listens to your heart and lungs with a device called a stethoscope. Mitral valve stenosis often causes an irregular heart sound due to the narrowed opening. This sound is called a heart murmur. Mitral valve stenosis also can cause fluid buildup in the lungs.\nIf you have symptoms of mitral valve stenosis, tests are done to examine the heart.",
  'Treatment': 'Treatment for mitral valve stenosis may include:\n- Medicine.\n- Valve repair or replacement surgery.\n- Open-heart surgery.\nIf you have mild to moderate mitral valve stenosis with no symptoms, you might not need immediate treatment. Instead, you need regular health checkups to see if your condition gets worse.\nA doctor trained in heart disease typically provides care for people with mitral 

In [254]:
results_280_3_list_df=pd.DataFrame(results_280_3_list)
results_280_3_list_df[:5]

Unnamed: 0,Diagnosis,Treatment
0,"To diagnosis mitral valve stenosis, your healt...",Treatment for mitral valve stenosis may includ...
1,Error,Error
2,Error,Error
3,Error,Error
4,Error,Error


In [255]:
results_280_1_list_df=pd.DataFrame(results_280_1_list)

In [256]:
len(results_280_1_list_df),len(results_280_2_list),len(results_280_3_list_df)

(560, 560, 281)

In [259]:
all_nlp_dig_trt_df=pd.concat([results_280_1_list_df, results_280_2_list_df, results_280_3_list_df], ignore_index=True)

In [261]:
all_nlp_dig_trt_df.shape

(1401, 2)

In [269]:
all_nlp_diseases_df.shape

(1397, 4)

In [270]:
diseases_df.shape

(25, 4)

In [271]:
all_nlp_dig_trt_df.shape

(1401, 2)

In [272]:
all_nlp_dig_trt_df.to_csv('all_nlp_dig_trt_df.csv',index=False)