In [3]:
!pip install requests beautifulsoup4 pandas



In [8]:
import requests
from bs4 import BeautifulSoup
import csv
import string
import time

# Listing endpoint of the exhibitor directory. The crawler requests this URL
# with event_name / event_year / filter / page query parameters.
BASE_URL = "https://startupmahakumbh.org/exhibitor_directory/exhi_list_pub.php"
# Headers sent with every request: a browser-style User-Agent instead of the
# default one used by `requests` (presumably so the site serves the normal
# page — confirm whether it is actually required).
HEADERS = {"User-Agent": "Mozilla/5.0"}

def extract_startup_info(detail_div):
    """Build a flat record for one exhibitor from its details element.

    Args:
        detail_div: BeautifulSoup tag for one exhibitor. The name is read
            from its first ``<h2>`` (and the website from an ``<a>`` inside
            that heading); the other fields are read from ``<p>`` rows whose
            ``<strong>`` child holds a label such as ``Contact Person:``.

    Returns:
        dict with the keys ``Name``, ``Contact Person``, ``Designation``,
        ``Email``, ``Profile`` and ``Website``. Any piece that is not present
        in the markup is returned as an empty string.
    """

    def get_value(detail_div, field):
        """Return the text that follows the ``<strong>`` label *field*, or ""."""
        for p in detail_div.find_all("p"):
            strong = p.find("strong")
            if strong and strong.text.strip().startswith(field):
                # Use the label exactly as it appears inside the stripped
                # paragraph text; the raw ``strong.text`` may carry whitespace
                # that ``get_text(strip=True)`` has already removed, in which
                # case it would never match and the label would leak into
                # the value.
                label = strong.get_text(strip=True)
                text = p.get_text(strip=True)
                # Drop only the first occurrence so a value that happens to
                # repeat the label text is left intact.
                return text.replace(label, "", 1).strip()
        return ""

    name_tag = detail_div.find("h2")
    name = name_tag.get_text(strip=True) if name_tag else ""

    # An anchor without an href must not abort the whole crawl.
    link = name_tag.find("a") if name_tag else None
    website = link.get("href", "") if link else ""

    return {
        "Name": name,
        "Contact Person": get_value(detail_div, "Contact Person:"),
        "Designation": get_value(detail_div, "Designation:"),
        # NOTE(review): the Email column is filled from the "Contact Details:"
        # row — confirm that row only ever holds an e-mail address.
        "Email": get_value(detail_div, "Contact Details:"),
        "Profile": get_value(detail_div, "Profile:"),
        "Website": website,
    }


def get_total_pages(soup):
    """Return the page count advertised by a parsed listing page.

    The count is read from the ``max`` attribute of the element matched by
    ``soup.find("input", {"id": "currentPage"})``.

    Args:
        soup: Parsed listing page (BeautifulSoup document).

    Returns:
        int: the ``max`` attribute converted to an integer, or 1 when the
        input is missing, has no ``max`` attribute, or the attribute is not
        a valid integer.
    """
    page_input = soup.find("input", {"id": "currentPage"})
    if page_input is None or not page_input.has_attr("max"):
        return 1
    try:
        return int(page_input["max"])
    except (TypeError, ValueError):
        # Only a malformed attribute falls back to a single page; anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 1

def _fetch_listing(letter, page, timeout):
    """Download and parse one listing page; return None on an HTTP error status."""
    query = {
        "event_name": "sm",
        "event_year": "2025",
        "filter": letter,
        "page": page,
    }
    # Passing the query as ``params`` lets requests URL-encode the values.
    response = requests.get(BASE_URL, params=query, headers=HEADERS, timeout=timeout)
    if not response.ok:
        # Do not parse an error page as if it were a listing.
        print(f"   ⚠️ HTTP {response.status_code} for letter '{letter}' page {page}, skipping")
        return None
    return BeautifulSoup(response.text, "html.parser")


def crawl_letter(letter, timeout=30):
    """Collect every exhibitor listed under one filter letter.

    The first page is fetched once and used both to read the page count and
    as page 1 of the results; remaining pages are fetched one by one with a
    short pause between requests.

    Args:
        letter: Value sent as the ``filter`` query parameter.
        timeout: Seconds to wait on each HTTP request before requests raises
            a timeout error (without it a stalled connection blocks forever).

    Returns:
        list of dicts as produced by ``extract_startup_info``; entries with
        an empty ``Name`` are dropped. Pages answered with an HTTP error
        status are reported and skipped.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    # NOTE(review): a recorded run printed "1 pages" and 15 startups for every
    # letter, which suggests the page count (or the letter filter) may not be
    # picked up from the markup — confirm against the live page.
    startups = []
    first_soup = _fetch_listing(letter, 1, timeout)
    total_pages = get_total_pages(first_soup) if first_soup is not None else 1

    print(f"\n🔤 Crawling letter '{letter}' with {total_pages} pages")

    for page in range(1, total_pages + 1):
        print(f"📄  Page {page}/{total_pages}")
        # Page 1 is already in hand; downloading it again would only add load.
        soup = first_soup if page == 1 else _fetch_listing(letter, page, timeout)

        if soup is not None:
            detail_blocks = soup.select("div.col-md-10.col-sm-12 > div.details")
            print(f"   🧱 Found {len(detail_blocks)} startups")

            for detail_div in detail_blocks:
                startup = extract_startup_info(detail_div)
                if startup["Name"]:  # Avoid blank entries
                    startups.append(startup)

        # Pause between requests to keep the request rate low.
        time.sleep(0.3)

    return startups

def crawl_all_letters(output_path="startup_directory.csv", letters=string.ascii_uppercase):
    """Crawl the directory for each filter letter and save the results as CSV.

    Args:
        output_path: Destination CSV file; it is overwritten if it exists.
            Defaults to ``startup_directory.csv`` in the working directory.
        letters: Iterable of filter values to crawl, in order. Defaults to
            the uppercase ASCII letters A-Z.

    Returns:
        None. The collected rows are written to *output_path* and the total
        count is printed.
    """
    all_startups = []
    for letter in letters:
        all_startups.extend(crawl_letter(letter))

    fieldnames = ["Name", "Contact Person", "Designation", "Email", "Profile", "Website"]
    # newline="" is what the csv module expects so it controls line endings.
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_startups)

    print(f"\n✅ Finished. Extracted total {len(all_startups)} startups")

# 🚀 Run the full A-Z crawl. This performs live HTTP requests and writes
# startup_directory.csv into the current working directory.
crawl_all_letters()



🔤 Crawling letter 'A' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'B' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'C' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'D' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'E' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'F' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'G' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'H' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'I' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'J' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'K' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'L' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'M' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Crawling letter 'N' with 1 pages
📄  Page 1/1
   🧱 Found 15 startups

🔤 Cra