# **NOTE THAT THE 21 PAGES ARE DOWNLOADED MANUALLY!**

In [1]:
pip install -U requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
from importlib.metadata import files
import requests
from bs4 import BeautifulSoup
import json
import time
import random
import os
import re
import pandas as pd

# Base URL for the species guide
BASE_URL = "https://app.birda.org/species-guide"
# The TwicPics base domain used for Birda images
TWIC_BASE = "https://birda.twic.pics/"

def get_headers():
    """Build a fresh dict of HTTP request headers imitating a desktop Chrome browser."""
    browser_headers = {}
    # Browser identity string sent with every request.
    browser_headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    # Content negotiation: prefer image formats, fall back to anything.
    browser_headers["Accept"] = "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"
    browser_headers["Accept-Language"] = "en-US,en;q=0.9"
    # Present the requests as coming from the Birda web app.
    browser_headers["Referer"] = "https://app.birda.org/"
    return browser_headers

def extract_birds_from_html(html_content):
    """Parse one saved species-guide page and extract a record per bird card.

    A "card" is any ``<a>`` element whose ``href`` contains ``/species-guide/``.
    Cards without an ``<h4>`` (the common name) are ignored.

    Args:
        html_content: The HTML text of a saved listing page.

    Returns:
        A list of dicts with the keys ``common_name``, ``scientific_name``
        (``"Unknown"`` when the name element is absent), ``image_url``
        (``"N/A"`` when no usable image reference is present) and
        ``guide_url`` (the card's ``href``).
    """
    media_prefix = 'media:'
    soup = BeautifulSoup(html_content, 'html.parser')
    birds = []

    bird_cards = soup.find_all('a', href=lambda x: x and '/species-guide/' in x)

    for card in bird_cards:
        try:
            name_tag = card.find('h4')
            if not name_tag:
                continue
            common_name = name_tag.get_text(strip=True)

            sci_tag = card.find('div', class_='css-xnfnhl')
            scientific_name = sci_tag.get_text(strip=True) if sci_tag else "Unknown"

            img_tag = card.find('img')
            raw_path = "N/A"

            if img_tag:
                # 'data-twic-src' contains the internal media path
                # e.g., 'media:species/9a2db496.../Acacia_Pied_Barbet.jpg'
                twic_data = img_tag.get('data-twic-src')
                if twic_data and twic_data.startswith(media_prefix):
                    # Swap only the leading 'media:' for the image host;
                    # str.replace() would also rewrite any later occurrence
                    # of 'media:' inside the path.
                    raw_path = TWIC_BASE + twic_data[len(media_prefix):]

            birds.append({
                "common_name": common_name,
                "scientific_name": scientific_name,
                "image_url": raw_path,
                "guide_url": str(card.get('href')),
            })
        except Exception as exc:
            # Best effort: one malformed card must not abort the whole page,
            # but report it so dropped records are not silently lost.
            print(f"Skipping malformed bird card: {exc}")
            continue

    return birds

def download_bird_images(bird_list, folder="bird_images"):
    """Download the image of every bird in ``bird_list`` into ``folder``.

    Each image is saved as ``<common name with underscores>.jpg``. Birds whose
    ``image_url`` is missing, empty or ``"N/A"`` are skipped, as are birds
    whose target file already exists. Failures are printed and do not stop
    the remaining downloads.

    Args:
        bird_list: Iterable of dicts with at least ``common_name`` and
            ``image_url`` keys (as produced by ``extract_birds_from_html``).
        folder: Directory to write the images to; created if necessary.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.headers.update(get_headers())

        for bird in bird_list:
            url = bird.get('image_url')
            if not url or url == "N/A":
                continue

            # Clean filename: drop special chars, then turn spaces into underscores
            safe_name = re.sub(r'[^\w\s-]', '', bird['common_name']).strip().replace(' ', '_')
            file_path = os.path.join(folder, f"{safe_name}.jpg")

            if os.path.exists(file_path):
                print(f"Skipping {bird['common_name']}, already exists.")
                continue

            try:
                print(f"Downloading {bird['common_name']}...")
                # Append a resize parameter to ensure we get a high-quality version
                # TwicPics syntax: ?twic=v1/resize=1200
                download_url = f"{url}?twic=v1/resize=1200"

                response = session.get(download_url, timeout=20)
                if response.status_code == 200:
                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                    time.sleep(random.uniform(1, 2))  # Small delay between downloads
                else:
                    print(f"Failed to download {bird['common_name']} (Status: {response.status_code})")
            except (requests.RequestException, OSError) as e:
                # Network problems and file-system errors are reported per
                # bird so one failure does not stop the rest of the batch.
                print(f"Error downloading {bird['common_name']}: {e}")

if __name__ == "__main__":

    # Folder holding the manually saved listing pages (page_<n>.html).
    script_dir = 'C:\\Users\\scuba\\Downloads\\BIRD APP\\bird-trainer\\2 ALL BIRD DATA\\BIRDA\\BIRDA PAGES'

    # 1. Collect the saved pages and order them by page number.
    # Working from the files that actually exist (instead of guessing numbers
    # from the file count) means gaps in the numbering cannot cause trailing
    # pages to be skipped, and a plain alphabetical sort would put
    # page_10 before page_2.
    page_pattern = re.compile(r'page_(\d+)\.html')
    numbered_pages = []
    for file in os.listdir(script_dir):
        found = page_pattern.fullmatch(file)
        if found:
            numbered_pages.append((int(found.group(1)), file))
    numbered_pages.sort()
    pages = [file for _, file in numbered_pages]

    birds = []
    for page in pages:
        print(f"Processing {page}...")
        with open(os.path.join(script_dir, page), 'r', encoding='utf-8') as f:
            birds.extend(extract_birds_from_html(f.read()))
        print(f"Extracted data from {page}")

    # 2. Save JSON data
    if birds:
        with open("birds_data.json", "w", encoding="utf-8") as f:
            json.dump(birds, f, indent=4)
        print(f"Saved data for {len(birds)} birds to birds_data.json")

        # 3. Download the images
        # Left disabled on purpose: uncommenting the call below starts
        # downloading files to your environment.
        # download_bird_images(birds)
    else:
        print("No birds found to process.")



Processing page_1.html...
Extracted data from page_1.html
Processing page_2.html...
Extracted data from page_2.html
Processing page_3.html...
Extracted data from page_3.html
Processing page_4.html...
Extracted data from page_4.html
Processing page_5.html...
Extracted data from page_5.html
Processing page_6.html...
Extracted data from page_6.html
Processing page_7.html...
Extracted data from page_7.html
Processing page_8.html...
Extracted data from page_8.html
Processing page_9.html...
Extracted data from page_9.html
Processing page_10.html...
Extracted data from page_10.html
Processing page_11.html...
Extracted data from page_11.html
Processing page_12.html...
Extracted data from page_12.html
Processing page_13.html...
Extracted data from page_13.html
Processing page_14.html...
Extracted data from page_14.html
Processing page_15.html...
Extracted data from page_15.html
Processing page_16.html...
Extracted data from page_16.html
Processing page_17.html...
Extracted data from page_17.htm

In [3]:
# Number of bird records collected from all processed pages.
len(birds)

993

In [4]:
# Inspect the first record to check the shape of the extracted data.
birds[0]

{'common_name': 'Acacia Pied Barbet',
 'scientific_name': 'Tricholaema leucomelas',
 'image_url': 'https://birda.twic.pics/species/9a2db496-7033-439a-8333-4ef05d269a46/reference_images/Acacia_Pied_Barbet.jpg',
 'guide_url': 'https://app.birda.org/species-guide/11404/Acacia_Pied_Barbet'}

## **NOTE THAT SOME BIRD_IMAGES ARE MISSING!**

In [8]:
# Build the table in a single constructor call: one row per bird with its
# common name, scientific name and image URL. Concatenating a one-row frame
# per bird copies the whole table on every iteration, and assigning
# `df.columns` afterwards fails when `birds` is empty.
df = pd.DataFrame(
    [(bird['common_name'], bird['scientific_name'], bird['image_url']) for bird in birds],
    columns=['name', 'scientific', 'image'],
)
df

Unnamed: 0,name,scientific,image
0,Acacia Pied Barbet,Tricholaema leucomelas,https://birda.twic.pics/species/9a2db496-7033-...
1,Black-collared Barbet,Lybius torquatus,https://birda.twic.pics/species/ecd51f7d-78f8-...
2,Crested Barbet,Trachyphonus vaillantii,https://birda.twic.pics/species/40bffa5c-f735-...
3,Green Barbet,Stactolaema olivacea,
4,Green Tinkerbird,Pogoniulus simplex,
...,...,...,...
988,Little Spotted Woodpecker,Campethera cailliautii,
989,Olive Woodpecker,Dendropicos griseocephalus,https://birda.twic.pics/species/fe103628-67ff-...
990,Red-throated Wryneck,Jynx ruficollis,https://birda.twic.pics/species/8f928302-b1e6-...
991,Speckle-throated Woodpecker,Campethera scriptoricauda,


In [7]:
# Row count of the table; should equal the number of bird records.
len(df)

993

In [11]:
# Export the table to CSV in the working directory, without the index column.
df.to_csv("birda_birdlist (+images).csv", index=False)

# SCRATCH

In [70]:
# Scratch: dump the full list of extracted records for inspection.
birds

[{'common_name': 'Acacia Pied Barbet',
  'scientific_name': 'Tricholaema leucomelas',
  'image_url': 'https://birda.twic.pics/species/9a2db496-7033-439a-8333-4ef05d269a46/reference_images/Acacia_Pied_Barbet.jpg',
  'guide_url': 'https://app.birda.org/species-guide/11404/Acacia_Pied_Barbet'},
 {'common_name': 'Black-collared Barbet',
  'scientific_name': 'Lybius torquatus',
  'image_url': 'https://birda.twic.pics/species/ecd51f7d-78f8-4246-b1e4-7359b13ab704/reference_images/Black-collared_Barbet_-_Adult.jpg',
  'guide_url': 'https://app.birda.org/species-guide/11447/Black-collared_Barbet'},
 {'common_name': 'Crested Barbet',
  'scientific_name': 'Trachyphonus vaillantii',
  'image_url': 'https://birda.twic.pics/species/40bffa5c-f735-4d85-add0-6cee03396e6f/reference_images/Crested_Barbet_-_Adult.jpg',
  'guide_url': 'https://app.birda.org/species-guide/11479/Crested_Barbet'},
 {'common_name': 'Green Barbet',
  'scientific_name': 'Stactolaema olivacea',
  'image_url': 'N/A',
  'guide_url'

In [69]:
# Scratch: number of records currently held in `birds`.
len(birds)


945

In [61]:
# Scratch: list the saved .html pages in the folder, in os.listdir order (not sorted by page number).
pages = [file for file in os.listdir('C:\\Users\\scuba\\Downloads\\BIRD APP\\bird-trainer\\BIRDA\\BIRDA PAGES') if file.endswith('.html')] 
pages

['page_1.html',
 'page_10.html',
 'page_11.html',
 'page_12.html',
 'page_13.html',
 'page_14.html',
 'page_15.html',
 'page_16.html',
 'page_17.html',
 'page_18.html',
 'page_19.html',
 'page_2.html',
 'page_20.html',
 'page_21.html',
 'page_3.html',
 'page_4.html',
 'page_5.html',
 'page_6.html',
 'page_7.html',
 'page_9.html']

In [67]:
# List of local HTML files to process 
pages = [file for file in os.listdir('C:\\Users\\scuba\\Downloads\\BIRD APP\\bird-trainer\\BIRDA\\BIRDA PAGES') if file.endswith('.html')] 

birds = []
# Folder containing the saved pages (hard-coded absolute path, not derived from the script location)
script_dir = "C:\\Users\\scuba\\Downloads\\BIRD APP\\bird-trainer\\BIRDA\\BIRDA PAGES\\"

# Probe page numbers 1..len(pages)+1. The bound goes one past the file count,
# presumably so that a single gap in the numbering is still covered; numbers
# without a matching file are skipped by the exists() check below.
for n in range(1, len(pages)+2):
    page = f"page_{n}.html"
    # Join the pages folder with the filename
    new_page = os.path.join(script_dir, page)
    # print(f"Checking: {new_page}")
    if os.path.exists(new_page):
        print(f"Processing {page}...")

Processing page_1.html...
Processing page_2.html...
Processing page_3.html...
Processing page_4.html...
Processing page_5.html...
Processing page_6.html...
Processing page_7.html...
Processing page_9.html...
Processing page_10.html...
Processing page_11.html...
Processing page_12.html...
Processing page_13.html...
Processing page_14.html...
Processing page_15.html...
Processing page_16.html...
Processing page_17.html...
Processing page_18.html...
Processing page_19.html...
Processing page_20.html...
Processing page_21.html...


In [64]:
# Scratch: number of .html files found in the pages folder.
len(pages)

20