In [1]:
!pip install selenium beautifulsoup4
!pip install --upgrade webdriver-manager



In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape the crop -> pest detail pages reachable from the crop-protection index
# page and store one row per (crop, pest) in crop_diseases.csv.

PAGE_LOAD_DELAY = 2  # Seconds to pause after each navigation so the page can load

base_url = "https://agritech.tnau.ac.in/crop_protection/"

data = []  # Store scraped data (one dict per pest page)
soup = None  # Most recently parsed pest page; default source for extract_list_text


def extract_list_text(section_title, page_soup=None):
    """Return the list items that follow a titled section, joined with "; ".

    Args:
        section_title: Substring to look for in the text of a <p> element.
        page_soup: Parsed page to search. When omitted, the module-level
            ``soup`` (the most recently parsed pest page) is used.

    Returns:
        The stripped text of every <li> in the first <ul> found after the
        matching <p>, joined with "; ". Returns "N/A" when no page is
        available, no <p> matches, or no <ul> follows the match.
    """
    page = soup if page_soup is None else page_soup
    if page is None:
        return "N/A"

    # `string=` only matches <p> tags whose content is a single string, so the
    # lambda guards against None for tags with mixed/nested content.
    section = page.find("p", string=lambda text: text and section_title in text)
    if section:
        next_ul = section.find_next("ul")
        if next_ul:
            return "; ".join(li.text.strip() for li in next_ul.find_all("li"))
    return "N/A"


# Initialize WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
scrape_completed = False

try:
    wait = WebDriverWait(driver, 10)

    # Open main page
    driver.get(base_url + "crop_prot_crop_insect_agri_pest.html")

    # Collect all crop links
    crop_links = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "tr.style3 a"))
    )
    crop_data = {}
    for link in crop_links:
        crop_name = link.text.strip()
        crop_href = link.get_attribute("href")
        # Anchors without an href yield None; skip them instead of raising
        # TypeError on the substring check.
        if crop_name and crop_href and "http" in crop_href:
            crop_data[crop_name] = crop_href

    print(f"Found {len(crop_data)} crop links...")

    # Loop through each crop link
    for crop, crop_url in crop_data.items():
        print(f"Processing crop: {crop}")

        driver.get(crop_url)
        time.sleep(PAGE_LOAD_DELAY)  # Allow page to load

        # Collect pest links (filter out irrelevant ones)
        pest_links = driver.find_elements(By.CSS_SELECTOR, "tr a")

        pest_data = {}
        for link in pest_links:
            pest_name = link.text.strip()
            pest_href = link.get_attribute("href")

            # NOTE(review): every URL under base_url already contains
            # "crop_prot" (via "crop_protection"), so this check mostly
            # excludes links outside that directory -- confirm that is the
            # intended filter.
            if pest_name and pest_href and "crop_prot" in pest_href:
                pest_data[pest_name] = pest_href

        print(f"  Found {len(pest_data)} pests for {crop}...")

        for pest, pest_url in pest_data.items():
            print(f"    Scraping pest: {pest}")

            driver.get(pest_url)
            time.sleep(PAGE_LOAD_DELAY)  # Allow content to load

            # Parse page content
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract details safely ("N/A" when a section is missing)
            symptoms = extract_list_text("Symptoms of damage", soup)
            identification = extract_list_text("Identification of insect pest", soup)
            management = extract_list_text("Management", soup)

            # Store data
            data.append({
                "Crop": crop,
                "Disease": pest,
                "Symptoms": symptoms,
                "Identification": identification,
                "Management": management
            })

    scrape_completed = True
finally:
    try:
        # Convert to DataFrame (also on failure, so partial results survive)
        df = pd.DataFrame(data)
        print(df.head())

        # Save to CSV. After a failed run, only write when some rows were
        # collected so an earlier good file is not replaced by an empty one.
        if scrape_completed or data:
            df.to_csv("crop_diseases.csv", index=False)
    finally:
        # Close WebDriver even if scraping or saving raised
        driver.quit()

Found 20 crop links...
Processing crop: Rice
  Found 17 pests for Rice...
    Scraping pest: Thrips
    Scraping pest: Brown Plant Hopper
    Scraping pest: Yellow Stemborer
    Scraping pest: Gall midge
    Scraping pest: Leaf folder
    Scraping pest: Case worm
    Scraping pest: Whorl maggot
    Scraping pest: Swarming caterpillar
    Scraping pest: Skipper
    Scraping pest: Green horned caterpillar
    Scraping pest: Yellow hairy caterpillar
    Scraping pest: Grasshopper
    Scraping pest: Mealy bug
    Scraping pest: Spiny beetle / Hispa
    Scraping pest: Green leafhopper
    Scraping pest: White backed plant hopper
    Scraping pest: Earhead bug
Processing crop: Sorghum
  Found 0 pests for Sorghum...
Processing crop: Maize
  Found 11 pests for Maize...
    Scraping pest: Fall armyworm
    Scraping pest: Shoot fly
    Scraping pest: Stem borer
    Scraping pest: Pink stem borer
    Scraping pest: Corn worm / Earworm
    Scraping pest: Ear head bug
    Scraping pest: Web worm
  

In [11]:
# Report the bit-width / linkage format of the interpreter backing this kernel.
# Querying in-process avoids `!python`, which runs whichever `python` is first
# on PATH and may not be the kernel's interpreter.
import platform

print(platform.architecture())


('64bit', 'WindowsPE')
