In [25]:
import pandas as pd
import numpy as np
import overturemaps

In [26]:
from google.colab import files
uploaded = files.upload()  # prompts you to upload again

Saving web_scraper.py to web_scraper (1).py


In [27]:
def fetch_overture_poi_data(theme: str, bbox: tuple) -> pd.DataFrame:
    """
    Fetches POI data from Overture Maps for a given theme and bounding box.

    Parameters:
        theme (str): The theme to fetch (e.g., 'place').
        bbox (tuple): Bounding box as (min_lon, min_lat, max_lon, max_lat).

    Returns:
        pd.DataFrame: DataFrame containing the POI data.
    """
    print(f"\nBounding box: {bbox}")
    print(f"Theme: '{theme}'")
    print("Fetching data from Overture Maps...")

    try:
        reader = overturemaps.record_batch_reader(theme, bbox=bbox)
        df = reader.read_all().to_pandas()

        print(f"Successfully loaded {len(df)} POIs.")
        return df

    except Exception as e:
        print(f"Error: {e}")
        print("Check your internet connection or Overture Maps availability.")
        return pd.DataFrame()

In [60]:
bbox_sf = (-122.5136, 37.7079, -122.3569, 37.8324)  # San Francisco bounding box
df_sf_pois = fetch_overture_poi_data("place", bbox_sf)


Bounding box: (-122.5136, 37.7079, -122.3569, 37.8324)
Theme: 'place'
Fetching data from Overture Maps...
Successfully loaded 39492 POIs.


In [64]:
df_websties = df_sf_pois[['names', 'categories']][df_sf_pois['websites'].notna()]
df_websites.head(30)

Unnamed: 0,names,categories,websites
3,"{'primary': 'Fort Funston', 'common': None, 'r...","{'primary': 'national_park', 'alternate': ['pa...",[https://www.nps.gov/goga/planyourvisit/fortfu...
4,"{'primary': 'Fort Funston', 'common': None, 'r...","{'primary': 'park', 'alternate': ['beach', 'la...",[http://en.wikipedia.org/wiki/Fort_Funston]
6,"{'primary': 'Pacific Rowing Club', 'common': N...","{'primary': 'lake', 'alternate': ['sports_club...",[http://www.pacificrowingclub.org]
7,"{'primary': 'Harding Park Picnic Area', 'commo...","{'primary': 'park', 'alternate': ['lake']}",[http://www.tpc.com/tpc-harding-park]
8,{'primary': 'Ripple Effect Dragon Boating Team...,"{'primary': 'sports_club_and_league', 'alterna...",[http://www.ripplesf.org/]
9,"{'primary': 'Lake Merced', 'common': None, 'ru...","{'primary': 'attractions_and_activities', 'alt...",[http://sfrecpark.org/destination/lake-merced-...
10,"{'primary': 'Lake Course - The Olympic Club', ...","{'primary': 'sports_club_and_league', 'alterna...",[http://www.olyclub.com/]
11,"{'primary': 'The Olympic Club', 'common': None...","{'primary': 'golf_course', 'alternate': ['comm...",[http://www.olyclub.com]
13,"{'primary': 'Lake Merced - Bike Ride', 'common...","{'primary': 'lake', 'alternate': ['active_life...",[http://sfrecpark.org/destination/lake-merced-...
15,"{'primary': 'Broderick-Terry Duel Site', 'comm...",{'primary': 'landmark_and_historical_building'...,[http://www.nps.gov/goga/historyculture/broder...


In [82]:
x = 20  # set the maximum number of iterations

for i, raw in enumerate(df_websites['websites']):
    if i >= x:
        break  # stop the loop after x iterations
    url = raw[0]  # Get the first item in the list

https://www.nps.gov/goga/planyourvisit/fortfunston.htm
http://en.wikipedia.org/wiki/Fort_Funston
http://www.pacificrowingclub.org
http://www.tpc.com/tpc-harding-park
http://www.ripplesf.org/
http://sfrecpark.org/destination/lake-merced-park
http://www.olyclub.com/
http://www.olyclub.com
http://sfrecpark.org/destination/lake-merced-park/
http://www.nps.gov/goga/historyculture/broderick-terry-duel.htm
http://shop.sprint.com/mysprint/shop_landing.jsp?pagename=whysprint&INTNAV=ATG:HE:WS
http://sftremors.com/
http://sanfranciscosystema.com/
http://www.slipperyfish-sf.com/
http://www.golflink.com/golf-courses/course.aspx?course=101645
http://www.holytrinitysf.org/
http://www.kzv.org/
http://lodge120.wpfreemason.net
http://cal1mason.com
http://www.columbiabrotherhoodlodge.org/
http://www.lakemercedchurch.com/
http://www.sfbrandeis.org/
http://calvaryarmenianchurch.org/
https://www.jccsf.org/program/brotherhood-way-preschool
https://www.newnorth.church/
http://www.lakemercedchurch.com/
http://

In [92]:
x = 20  # number of URLs to scrape

for i, raw in enumerate(df_websites['websites']):
    if i >= x:
        break

    url = raw[0]
    print(f"\nScraping: {url}")

    try:
        info = webScraper.extract_meta_and_title(url)

        # Check if meta_description starts with "[ERROR]"
        meta = info.get("meta_description", "")
        if meta.startswith("[ERROR]"):
            print(f"[SKIPPED] ERROR for {url}: {meta}")
            continue

        # Print if no error
        print("Title:", info.get("title"))
        print("Meta Description:", meta)

    except Exception as e:
        print(f"[EXCEPTION] Skipping {url}: {e}")

print("\nScraping completed.")


Scraping: https://www.nps.gov/goga/planyourvisit/fortfunston.htm
Title: Fort Funston - Golden Gate National Recreation Area (U.S. National Park Service)
Meta Description: 

Scraping: http://en.wikipedia.org/wiki/Fort_Funston
Title: Fort Funston - Wikipedia
Meta Description: 

Scraping: http://www.pacificrowingclub.org
Title: Pacific Rowing Club
Meta Description: 

Scraping: http://www.tpc.com/tpc-harding-park
Title: 403 Forbidden
Meta Description: 

Scraping: http://www.ripplesf.org/
Title: Ripple Effect Dragon Boat Team
Meta Description: We are a dragon boat team based in San Francisco, California, United 
States. Besides paddling, we also do outreach both in and outside the 
dragon boat community.

Scraping: http://sfrecpark.org/destination/lake-merced-park
Title: Custom404 • San Francisco Recreation and Parks, CA • CivicEn
Meta Description: 

Scraping: http://www.olyclub.com/
Title: The Olympic Club | Members
Meta Description: The Olympic Club

Scraping: http://www.olyclub.com
Titl

In [95]:
x = 100  # how many sites to process'
fail_count = 0


for i, raw in enumerate(df_websites['websites']):
        if i >= x:
            break

        base_url = raw[0]
        print(f"\n🔍 Searching for 'About' page on: {base_url}")

        try:
            about_url = webScraper.find_about_page(base_url)
            if not about_url:
                print("❌ No 'About' page found.")
                fail_count += 1
                continue

            print(f"📄 Found About Page: {about_url}")

            main_text = webScraper.extract_main_text(about_url)
            if main_text.startswith("[ERROR]"):
                print(f"⚠️ Error extracting text: {main_text}")
            else:
                print("✅ Extracted Text:\n", main_text[:1000], "...\n")  # limit preview

        except Exception as e:
            print(f"💥 Exception for {base_url}: {e}")
            fail_count += 1

success_count = x - fail_count
success_rate = (success_count / x) * 100
fail_rate = 100 - success_rate

print(f"\n📊 Out of {x} websites:")
print(f"✅ Succeeded: {success_count}")
print(f"❌ Failed: {fail_count}")
print(f"🔢 Success Rate: {success_rate:.2f}%")
print(f"🔢 Failure Rate: {fail_rate:.2f}%")


🔍 Searching for 'About' page on: https://www.nps.gov/goga/planyourvisit/fortfunston.htm
📄 Found About Page: https://www.nps.gov/aboutus
✅ Extracted Text:
 Contact UsImportant LinksNewsPhotos & MultimediaDo Business With UsManagementTransparency & AccountabilityWork for UsSince 1916, the National Park Service has been entrusted with the care of our national parks. With the help of volunteers and partners, we safeguard these special places and share their stories with more than 318 million visitors every year. But our work doesn't stop there.We are proud that tribes, local governments, nonprofit organizations, businesses, and individual citizens ask for our help inrevitalizing their communities, preserving local history, celebrating local heritage, and creating close-to-home opportunities for kids and families to get outside, be active, and have fun.Taking care of the national parks and helping Americans take care of their communities is a job we love, and we need—and welcome—your help 

In [111]:
# === CONFIGURABLE PARAMETERS ===
x = 100  # number of websites to scrape
min_meta_len = 75
min_about_text_len = 100

# === RESULTS TRACKING ===
results = []

for i, raw in enumerate(df_websites['websites']):
    if i >= x:
        break

    base_url = raw[0]
    print(f"\n🔍 Scraping: {base_url}")
    record = {"url": base_url, "source": None, "text": None, "status": None}

    try:
      # Step 1: Try meta extraction
        info = webScraper.extract_meta_and_title(base_url)
        meta = info.get("meta_description", "")

        if meta.startswith("[ERROR]"):
            print(f"⚠️ Meta extraction error: {meta}")
        elif len(meta.strip()) < min_meta_len:
            print(f"⚠️ Meta too short ({len(meta)} chars): {meta}")
        else:
            print("✅ Using meta description")
            record["source"] = "meta"
            record["text"] = meta
            record["status"] = "success"
            results.append(record)
            continue  # Skip to next URL if meta is good
    except Exception as e:
        print(f"[EXCEPTION] Meta extraction failed: {e}")

    try:
        about_url = webScraper.find_about_page(base_url)
        if not about_url:
            print("❌ No About page found.")
            record["status"] = "about_page_not_found"
        else:
            print(f"📄 About page found: {about_url}")
            about_text = webScraper.extract_main_text(about_url)

            if about_text.startswith("[ERROR]"):
                print(f"⚠️ About page scrape error: {about_text}")
                record["status"] = "about_page_error"
            elif len(about_text.strip()) < min_about_text_len:
                print(f"⚠️ About page text too short ({len(about_text)} chars)")
                record["status"] = "about_page_too_short"
            else:
                print("✅ Using About page text")
                record["source"] = "about"
                record["text"] = about_text
                record["status"] = "success"

    except Exception as e:
        print(f"[EXCEPTION] About page fallback failed: {e}")
        record["status"] = "exception_about"

    results.append(record)

print("\n✅ Scraping completed.\n")


🔍 Scraping: https://www.nps.gov/goga/planyourvisit/fortfunston.htm
⚠️ Meta too short (0 chars): 
📄 About page found: https://www.nps.gov/aboutus
✅ Using About page text

🔍 Scraping: http://en.wikipedia.org/wiki/Fort_Funston
⚠️ Meta too short (0 chars): 
📄 About page found: http://en.wikipedia.org/wiki/Wikipedia:About
✅ Using About page text

🔍 Scraping: http://www.pacificrowingclub.org
⚠️ Meta too short (0 chars): 
📄 About page found: http://www.pacificrowingclub.org/about-us
✅ Using About page text

🔍 Scraping: http://www.tpc.com/tpc-harding-park
⚠️ Meta too short (0 chars): 
❌ No About page found.

🔍 Scraping: http://www.ripplesf.org/
✅ Using meta description

🔍 Scraping: http://sfrecpark.org/destination/lake-merced-park
⚠️ Meta too short (0 chars): 
📄 About page found: http://sfrecpark.org/388/ABOUT-US
✅ Using About page text

🔍 Scraping: http://www.olyclub.com/
⚠️ Meta too short (16 chars): The Olympic Club
❌ No About page found.

🔍 Scraping: http://www.olyclub.com
⚠️ Meta too sho

In [117]:
df_results = pd.DataFrame(results)
df_results

Unnamed: 0,url,source,text,status
0,https://www.nps.gov/goga/planyourvisit/fortfun...,about,Contact UsImportant LinksNewsPhotos & Multimed...,success
1,http://en.wikipedia.org/wiki/Fort_Funston,about,"From Wikipedia, the free encyclopediaIntroduct...",success
2,http://www.pacificrowingclub.org,about,0What Pacific Rowing Club is all aboutMISSION ...,success
3,http://www.tpc.com/tpc-harding-park,,,about_page_not_found
4,http://www.ripplesf.org/,meta,We are a dragon boat team based in San Francis...,success
...,...,...,...,...
95,https://cpage.sfsu.edu/prehealth,about,SAN FRANCISCO STATE UNIVERSITY|College of Prof...,success
96,https://icce.sfsu.edu/,about,SAN FRANCISCO STATE UNIVERSITY|Institute for C...,success
97,http://sfstategators.com/,,,about_page_not_found
98,http://member.campusrec.sfsu.edu,about,SAN FRANCISCO STATE UNIVERSITY|Campus Recreati...,success


In [116]:
total = len(df_results)
successes = df_results['status'].value_counts().get("success", 0)
failure_rate = 100 * (1 - successes / total)

print(f"\n🔢 Success rate: {successes}/{total} = {100 - failure_rate:.2f}%")



🔢 Success rate: 59/100 = 59.00%
