In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
from fuzzywuzzy import process
import pandas as pd
from datetime import datetime
# ISO-style date stamp (YYYY-MM-DD) captured once when the cell runs; used to
# tag the output file name.
today_date = f"{datetime.today():%Y-%m-%d}"

In [None]:
# Disease names used as case-insensitive substring filters against the
# condition dataset. One entry per line keeps future additions diff-friendly.
tropical_diseases = [
    "Malaria",
    "Dengue fever",
    "Chikungunya",
    "Yellow fever",
    "Zika virus",
    "Rift Valley fever",
    "African trypanosomiasis",
    "Leishmaniasis",
    "Schistosomiasis",
    "Lymphatic filariasis",
    "Onchocerciasis",
    "Loiasis",
    "Dracunculiasis",
    "Buruli ulcer",
    "Yaws",
    "Cholera",
    "Typhoid fever",
    "Amoebiasis",
    "Giardiasis",
    "Cryptosporidiosis",
    "Ebola virus disease",
    "Marburg virus disease",
    "Lassa fever",
    "Rabies",
    "Anthrax",
    "Brucellosis",
    "Leptospirosis",
]

common_diseases_africa = [
    "Tuberculosis",
    "HIV/AIDS",
    "Pneumonia",
    "Meningitis",
    "COVID-19",
    "Influenza",
    "Measles",
    "Syphilis",
    "Gonorrhea",
    "Chlamydia",
    "HPV",
    "Hepatitis B",
    "Hepatitis C",
    "Diabetes",
    "Hypertension",
    "Sickle Cell Disease",
    "Cancer",
]

# Load the condition dataset, keeping only the second and third CSV columns
# and giving them the names the rest of the notebook relies on.
file_path = "condition.csv"
df = pd.read_csv(file_path, usecols=[1, 2])
df.columns = ["Disease", "Link"]

# Lowercased helper column so disease names can be matched case-insensitively;
# missing names become empty strings.
df = df.assign(Disease_lower=df["Disease"].fillna("").astype(str).str.lower())

# Drop entries whose name mentions a vaccine, prophylaxis or prevention.
df = df.loc[
    ~df["Disease"].str.contains(
        'vaccine|prophylaxis|prevention', case=False, na=False, regex=True
    )
]



# Function to filter diseases from the dataset
def filter_diseases(df, disease_list):
    """Return the rows of ``df`` whose ``Disease_lower`` mentions a listed disease.

    Matching is a plain substring test: a row is kept when any name from
    ``disease_list`` (lowercased here) occurs inside the row's
    ``Disease_lower`` value. The column itself is not lowercased by this
    function, so it is expected to already hold lowercase text.

    Args:
        df: DataFrame containing a ``Disease_lower`` column.
        disease_list: Iterable of disease names to look for.

    Returns:
        The matching rows of ``df`` with all columns and the original index
        labels preserved. A frame with no rows yields an empty frame that
        still has every column.
    """
    # Lowercase the search terms once instead of once per row.
    needles = [name.lower() for name in disease_list]

    # An explicit bool-dtype mask keeps this a row filter even when ``df`` is
    # empty; an empty object-dtype mask would be read as a list of column
    # labels and strip every column from the result. Non-string cells
    # (e.g. NaN) simply never match instead of raising TypeError.
    mask = pd.Series(
        [
            isinstance(value, str) and any(needle in value for needle in needles)
            for value in df["Disease_lower"]
        ],
        index=df.index,
        dtype=bool,
    )
    return df[mask]

# Select the tropical and the common-in-Africa conditions, discarding the
# lowercase helper column that was only needed for matching.
df_tropical_diseases = filter_diseases(df, tropical_diseases).drop(columns=["Disease_lower"])
df_common_diseases = filter_diseases(df, common_diseases_africa).drop(columns=["Disease_lower"])

# Stack both selections (tropical rows first) under a fresh index and remove
# rows that appear more than once.
africa_disease = pd.concat(
    [df_tropical_diseases, df_common_diseases],
    ignore_index=True,
).drop_duplicates()

In [None]:
import time
import random
import requests
from bs4 import BeautifulSoup

# Helper: locate the heading that introduces the symptoms section
def _find_symptoms_heading(soup, disease_name):
    """Return the first ``<h2>`` whose text names the symptoms of the disease.

    The heading's full visible text is used (not ``Tag.string``), so headings
    that wrap part of their text in nested tags are still matched. Returns
    ``None`` when no heading matches.
    """
    name = " ".join(disease_name.lower().split())
    # "what are the symptoms of X?" contains "symptoms of X", so the first
    # phrase already covers the question form.
    keywords = [f"symptoms of {name}", f"{name} symptoms"]

    headings = [
        (heading, " ".join(heading.get_text().split()).lower())
        for heading in soup.find_all("h2")
    ]
    # Keywords are tried in priority order across all headings.
    for keyword in keywords:
        for heading, text in headings:
            if keyword in text:
                return heading
    return None


# Helper: pull the symptom text out of the section that follows a heading
def _extract_section_symptoms(heading):
    """Return symptom strings from the section that follows ``heading``.

    Only siblings up to the next ``<h1>``/``<h2>`` are considered, so content
    from a later, unrelated section is never picked up. The first ``<ul>``
    with list items wins; otherwise the first ``<p>`` of the section is used.
    Returns an empty list when the section has neither.
    """
    first_paragraph = None
    for sibling in heading.next_siblings:
        # Text nodes between tags have no tag name and are skipped.
        tag_name = getattr(sibling, "name", None)
        if tag_name in ("h1", "h2"):
            break  # the next section starts here
        if tag_name == "ul":
            items = [li.get_text().strip() for li in sibling.find_all("li")]
            if items:
                return items
        elif tag_name == "p" and first_paragraph is None:
            first_paragraph = sibling.get_text().strip()
    return [] if first_paragraph is None else [first_paragraph]


# Function to scrape symptoms from a given URL
def scrape_symptoms(disease_name, url):
    """Fetch ``url`` and return the symptoms listed for ``disease_name``.

    Args:
        disease_name: Name of the condition; used to recognise the symptoms
            heading on the page and in progress messages.
        url: Page to download. Anything that is not a non-blank string
            (``None``, NaN, ``""``) is skipped without a request.

    Returns:
        The symptom entries joined with ``"; "``, or ``""`` when the link is
        missing, the request does not return HTTP 200, no symptoms section is
        found, or any error occurs. This function never raises, so one bad
        page cannot abort a batch run.
    """
    # A missing link can never succeed; bail out before paying the delay.
    if not isinstance(url, str) or not url.strip():
        print(f"Skipping {disease_name}: no valid link.")
        return ""

    try:
        # Random delay to avoid detection (between 5 to 7 seconds)
        time.sleep(random.uniform(5, 7))

        response = requests.get(url, timeout=10)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            heading = _find_symptoms_heading(soup, disease_name)
            if heading is not None:
                print(f"Found Symptoms Section for {disease_name}: {heading.text.strip()}")
                # Joining an empty list yields "", the "nothing found" value.
                return "; ".join(_extract_section_symptoms(heading))
        else:
            # Make HTTP failures distinguishable from pages without a section.
            print(f"Request for {disease_name} returned HTTP {response.status_code}.")

        print(f"No symptoms found for {disease_name}.")
        return ""  # Return empty string if no symptoms found

    except Exception as e:
        # Deliberately broad: scraping is best-effort per row.
        print(f"Error scraping {disease_name}: {e}")
        return ""  # Return empty string if an error occurs


# Apply the scraping function
# The column is built from an explicit list rather than DataFrame.apply(axis=1):
# on a frame with no rows, apply returns an empty DataFrame instead of a Series,
# which cannot be assigned to a single column.
africa_disease["Symptoms"] = [
    scrape_symptoms(disease, link)
    for disease, link in zip(africa_disease["Disease"], africa_disease["Link"])
]

# Save the updated CSV file (date-stamped so reruns on other days do not overwrite it)
updated_file_path = f"updated_condition_with_symptoms_{today_date}.csv"
africa_disease.to_csv(updated_file_path, index=False)

No symptoms found for African Trypanosomiasis.
Found Symptoms Section for Anthrax: What are the signs and symptoms of anthrax?
No symptoms found for Anthrax, Cutaneous.
No symptoms found for Anthrax, Inhalation.
No symptoms found for Anthrax, Skin.
No symptoms found for Brucellosis.
No symptoms found for Chikungunya.
No symptoms found for Chikungunya Fever.
No symptoms found for Chikungunya Hemorrhagic Fever.
No symptoms found for Chikungunya Virus Infection.
No symptoms found for Cholera.
Found Symptoms Section for Congenital Zika Virus: What are the signs and symptoms of congenital Zika virus?
No symptoms found for Cryptosporidiosis.
No symptoms found for Cutaneous Anthrax.
No symptoms found for Cutaneous Leishmaniasis.
No symptoms found for Dengue Fever.
No symptoms found for Dientamoebiasis.
No symptoms found for Dracunculiasis.
No symptoms found for Ebola Virus Disease.
Found Symptoms Section for Giardiasis: What are the signs and symptoms of giardiasis?
No symptoms found for Inha