In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import os
import re

# Base URL of the Birda species guide.
# NOTE(review): not referenced by any code visible in this file — confirm
# whether it is still needed.
BASE_URL = "https://app.birda.org/species-guide"
# TwicPics base domain used for Birda images. extract_birds_from_html()
# substitutes it for the 'media:' prefix found in 'data-twic-src' attributes.
TWIC_BASE = "https://birda.twic.pics/"

def get_headers():
    """Build the browser-like HTTP headers sent with every request.

    Returns:
        A new dict on each call, so callers may modify it freely.
    """
    # Desktop Chrome identity, assembled from its three conventional parts.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    # Image-oriented Accept list: modern formats first, anything else last.
    accepted_types = ",".join([
        "image/avif",
        "image/webp",
        "image/apng",
        "image/svg+xml",
        "image/*",
        "*/*;q=0.8",
    ])

    headers = {}
    headers["User-Agent"] = user_agent
    headers["Accept"] = accepted_types
    headers["Accept-Language"] = "en-US,en;q=0.9"
    headers["Referer"] = "https://app.birda.org/"
    return headers

def extract_birds_from_html(html_content):
    """Parse a species-guide HTML page and extract one record per bird card.

    A bird card is an ``<a>`` element whose ``href`` contains
    ``/species-guide/`` and which has an ``<h4>`` descendant; the ``<h4>``
    text is taken as the common name. Matching anchors without an ``<h4>``
    are ignored.

    Args:
        html_content: HTML markup of the page (anything BeautifulSoup accepts).

    Returns:
        A list of dicts, one per card, with the keys:
            common_name: text of the card's ``<h4>``.
            scientific_name: text of the ``div.css-xnfnhl`` element, or
                "Unknown" when the card has none.
            image_url: absolute image URL built from the ``data-twic-src``
                attribute of the card's first ``<img>``, or "N/A" when there
                is no image or the attribute does not start with 'media:'.
            guide_url: the anchor's ``href`` exactly as found in the page.
    """
    media_prefix = 'media:'

    soup = BeautifulSoup(html_content, 'html.parser')
    birds = []

    bird_cards = soup.find_all('a', href=lambda x: x and '/species-guide/' in x)

    for card in bird_cards:
        try:
            name_tag = card.find('h4')
            if not name_tag:
                continue
            common_name = name_tag.get_text(strip=True)

            # NOTE(review): 'css-xnfnhl' looks like a generated class name and
            # may change when the site is rebuilt — confirm against a fresh page.
            sci_tag = card.find('div', class_='css-xnfnhl')
            scientific_name = sci_tag.get_text(strip=True) if sci_tag else "Unknown"

            image_url = "N/A"
            img_tag = card.find('img')
            if img_tag:
                # 'data-twic-src' contains the internal media path
                # e.g., 'media:species/9a2db496.../Acacia_Pied_Barbet.jpg'
                twic_data = img_tag.get('data-twic-src')
                if twic_data and twic_data.startswith(media_prefix):
                    # Swap only the leading 'media:' for the TwicPics base so a
                    # later 'media:' inside the path is left untouched.
                    image_url = TWIC_BASE + twic_data[len(media_prefix):]

            birds.append({
                "common_name": common_name,
                "scientific_name": scientific_name,
                "image_url": image_url,
                "guide_url": card.get('href'),
            })
        except Exception as exc:
            # Best-effort parsing: one malformed card must not abort the whole
            # page, but report it instead of dropping it silently.
            print(f"Skipping a bird card that could not be parsed: {exc}")
            continue

    return birds

def download_bird_images(bird_list, folder="bird_images"):
    """Download the image of every bird in ``bird_list`` into ``folder``.

    Birds whose ``image_url`` is "N/A" are skipped, as are birds whose target
    file already exists. Failures (non-200 status or any exception) are
    printed and do not stop the remaining downloads.

    Args:
        bird_list: iterable of dicts with at least the keys ``image_url`` and
            ``common_name`` (the shape produced by extract_birds_from_html).
        folder: destination directory; created if it does not exist.

    Returns:
        None. Files are written as ``<folder>/<sanitised common name>.jpg``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.headers.update(get_headers())

        for bird in bird_list:
            url = bird['image_url']
            if url == "N/A":
                continue

            # Clean filename: drop special chars, replace spaces with underscores
            safe_name = re.sub(r'[^\w\s-]', '', bird['common_name']).strip().replace(' ', '_')
            # NOTE(review): get_headers() advertises avif/webp in Accept, so the
            # server may answer with a non-JPEG payload that is still saved
            # under a .jpg suffix — confirm the delivered format.
            file_path = os.path.join(folder, f"{safe_name}.jpg")

            if os.path.exists(file_path):
                print(f"Skipping {bird['common_name']}, already exists.")
                continue

            try:
                print(f"Downloading {bird['common_name']}...")
                # Append a resize parameter to ensure we get a high-quality version
                # TwicPics syntax: ?twic=v1/resize=1200
                separator = '&' if '?' in url else '?'
                download_url = f"{url}{separator}twic=v1/resize=1200"

                response = session.get(download_url, timeout=20)
                if response.status_code == 200:
                    # Write to a temporary name and rename afterwards, so an
                    # interrupted write never leaves a truncated image that a
                    # later run would skip as "already exists".
                    partial_path = f"{file_path}.part"
                    with open(partial_path, 'wb') as f:
                        f.write(response.content)
                    os.replace(partial_path, file_path)
                else:
                    print(f"Failed to download {bird['common_name']} (Status: {response.status_code})")
            except Exception as e:
                print(f"Error downloading {bird['common_name']}: {e}")

            # Small delay after every attempted request, successful or not, so
            # a run of failures does not hit the server back-to-back.
            time.sleep(random.uniform(1, 2))

if __name__ == "__main__":
    # Saved species-guide page to parse. Defined once so the status messages
    # and the file access always refer to the same path.
    source_path = '/content/page_1.html'

    # 1. Extract data from the saved HTML page
    print(f"Extracting data from {source_path}...")
    birds = []
    if os.path.exists(source_path):
        with open(source_path, 'r', encoding='utf-8') as f:
            birds = extract_birds_from_html(f.read())
    else:
        # Distinguish a missing input file from a page with no bird cards.
        print(f"Input file not found: {source_path}")

    # 2. Save JSON data
    if birds:
        with open("birds_data.json", "w", encoding="utf-8") as f:
            json.dump(birds, f, indent=4)
        print(f"Saved data for {len(birds)} birds to birds_data.json")

        # 3. Download the images
        # Warning: This will start downloading files to your environment
        download_bird_images(birds)
    else:
        print("No birds found to process.")



Processing local file: source.html
