In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
from fuzzywuzzy import process
import pandas as pd

In [None]:
# Filter specific diseases out of condition.csv.
#
# Two lookup lists drive the filtering: tropical diseases, and diseases
# that are common in Africa. One name per line keeps diffs small when a
# disease is added or removed.

tropical_diseases = [
    "Malaria",
    "Dengue fever",
    "Chikungunya",
    "Yellow fever",
    "Zika virus",
    "Rift Valley fever",
    "African trypanosomiasis",
    "Leishmaniasis",
    "Schistosomiasis",
    "Lymphatic filariasis",
    "Onchocerciasis",
    "Loiasis",
    "Dracunculiasis",
    "Buruli ulcer",
    "Yaws",
    "Cholera",
    "Typhoid fever",
    "Amoebiasis",
    "Giardiasis",
    "Cryptosporidiosis",
    "Ebola virus disease",
    "Marburg virus disease",
    "Lassa fever",
    "Rabies",
    "Anthrax",
    "Brucellosis",
    "Leptospirosis",
]

common_diseases_africa = [
    "Tuberculosis",
    "HIV/AIDS",
    "Pneumonia",
    "Meningitis",
    "COVID-19",
    "Influenza",
    "Measles",
    "Syphilis",
    "Gonorrhea",
    "Chlamydia",
    "HPV",
    "Hepatitis B",
    "Hepatitis C",
    "Diabetes",
    "Hypertension",
    "Sickle Cell Disease",
    "Cancer",
]

# Load the condition table. Only the CSV columns at positions 1 and 2 are
# read; they are relabelled "Disease" and "Link" regardless of the header
# text in the file.
file_path = "condition.csv"
df = (
    pd.read_csv(file_path, usecols=[1, 2])
    .set_axis(["Disease", "Link"], axis=1)
    # Lower-cased helper column used for case-insensitive matching;
    # missing names become "" so they never match a disease.
    .assign(Disease_lower=lambda table: table["Disease"].fillna("").astype(str).str.lower())
)

# Function to filter diseases from the dataset
def filter_diseases(df, disease_list):
    """Return the rows of ``df`` whose disease name contains any listed disease.

    Matching is a case-insensitive substring test, so "Cancer" selects
    "Bladder Cancer" as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Disease" column. If a pre-computed "Disease_lower"
        column is present it is used as the text to search; otherwise the
        lower-cased text is derived from "Disease" on the fly.
    disease_list : iterable
        Disease names to look for. Blank and non-string entries are ignored.

    Returns
    -------
    pandas.DataFrame
        The matching rows with the same columns as ``df`` (empty when nothing
        matches or ``disease_list`` has no usable names).
    """
    # Lower-case the search terms once instead of once per row. Blank terms
    # are dropped because "" is a substring of every string and would
    # select the whole table.
    needles = [d.lower() for d in disease_list if isinstance(d, str) and d.strip()]

    if "Disease_lower" in df.columns:
        haystack = df["Disease_lower"]
    else:
        haystack = df["Disease"].fillna("").astype(str).str.lower()

    # astype(bool) keeps this a boolean row mask even when the frame is
    # empty (apply() on an empty Series keeps the input dtype).
    mask = haystack.apply(lambda name: any(n in name for n in needles)).astype(bool)
    return df[mask]

# Select each disease group and discard the lowercase helper column in one step
df_tropical_diseases = filter_diseases(df, tropical_diseases).drop(columns=["Disease_lower"])
df_common_diseases = filter_diseases(df, common_diseases_africa).drop(columns=["Disease_lower"])

# Stack both groups into one table, then remove rows that appear more than once
africa_disease = pd.concat(
    [df_tropical_diseases, df_common_diseases],
    ignore_index=True,
).drop_duplicates()


In [None]:
# Function to scrape symptoms from a given URL
def scrape_symptoms(disease_name, url, delay_range=(5, 7), timeout=10):
    """Download ``url`` and return the symptoms text for ``disease_name``.

    The page is searched for an ``<h2>`` heading that mentions the symptoms
    of the disease; the first ``<ul>`` or ``<p>`` following that heading
    (within the same section) supplies the text.

    Parameters
    ----------
    disease_name : str
        Disease name used to build the heading keywords.
    url : str
        Page to download.
    delay_range : tuple of (low, high), optional
        Bounds, in seconds, of the random pause taken before each request.
        Defaults to ``(5, 7)``.
    timeout : int or float, optional
        Timeout passed to ``requests.get``. Defaults to ``10``.

    Returns
    -------
    str
        List items joined with "; ", or the paragraph text. An empty string
        is returned when the inputs are unusable, the response status is not
        200, no symptoms section is found, or an error occurs.
    """
    # Reject unusable rows (NaN / None / blank) before sleeping or touching
    # the network; there is nothing to fetch or to search for.
    if not isinstance(disease_name, str) or not disease_name.strip():
        return ""
    if not isinstance(url, str) or not url.strip():
        return ""

    try:
        # Random delay to avoid detection (between 5 to 7 seconds by default)
        time.sleep(random.uniform(*delay_range))

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f" Skipping {disease_name}: HTTP {response.status_code} for {url}")
            return ""

        soup = BeautifulSoup(response.text, "html.parser")
        return _extract_symptoms(soup, disease_name)

    except Exception as exc:
        # Deliberately best-effort so that one bad page does not abort a
        # whole batch, but the failure is reported instead of hidden.
        print(f" Failed to scrape {disease_name} ({url}): {exc!r}")
        return ""


def _extract_symptoms(soup, disease_name):
    """Return the symptoms text that follows the matching <h2> in ``soup``, or ""."""
    name = disease_name.strip().lower()
    keywords = [
        f"symptoms of {name}",
        f"what are the symptoms of {name}?",
        f"{name} symptoms",
    ]

    # get_text() is used rather than a string= filter so that headings with
    # nested markup (e.g. <h2>Symptoms of <em>X</em></h2>) are also matched.
    headings = [(h2, h2.get_text().lower()) for h2 in soup.find_all("h2")]

    # Keyword order takes priority over document order.
    symptoms_section = next(
        (h2 for keyword in keywords for h2, text in headings if keyword in text),
        None,
    )
    if symptoms_section is None:
        return ""

    print(f" Found Symptoms Section for {disease_name}: {symptoms_section.text.strip()}")

    for sibling in symptoms_section.next_siblings:
        tag_name = getattr(sibling, "name", None)  # plain text nodes carry no tag name
        if tag_name in ("h1", "h2"):
            # Reached the next section without finding content; do not
            # return text that belongs to a different heading.
            break
        if tag_name == "ul":
            items = [li.get_text().strip() for li in sibling.find_all("li")]
            return "; ".join(item for item in items if item)
        if tag_name == "p":
            return sibling.get_text().strip()

    return ""


# Destination for the enriched table
updated_file_path = "updated_condition_with_symptoms.csv"

# Scrape every row's link (one request per row) and attach the result
symptoms_by_row = africa_disease.apply(
    lambda record: scrape_symptoms(record["Disease"], record["Link"]),
    axis=1,
)
africa_disease["Symptoms"] = symptoms_by_row

# Write the table, without the index column, to the destination file
africa_disease.to_csv(updated_file_path, index=False)




 Found Symptoms Section for Anthrax: What are the signs and symptoms of anthrax?
 Found Symptoms Section for Congenital Zika Virus: What are the signs and symptoms of congenital Zika virus?
 Found Symptoms Section for Giardiasis: What are the signs and symptoms of giardiasis?
 Found Symptoms Section for Leishmaniasis: What are the signs and symptoms of leishmaniasis?
 Found Symptoms Section for Malaria: What are the signs and symptoms of malaria?
 Found Symptoms Section for Rabies: What are the early signs and symptoms of rabies?
 Found Symptoms Section for Zika Virus: What are the signs and symptoms of Zika virus?
 Found Symptoms Section for Aspiration Pneumonia: What are the signs and symptoms of aspiration pneumonia?
 Found Symptoms Section for Bacterial Meningitis: What are the signs and symptoms of bacterial meningitis?
 Found Symptoms Section for Bacterial Pneumonia: What are the signs and symptoms of bacterial pneumonia?
 Found Symptoms Section for Bladder Cancer: What are the s