# WIKIPEDIA BIRDLIST

In [4]:
pip install -U requests


Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_birds_of_South_Africa"

# Use pandas to read all tables from the URL
# Note: Wikipedia requires a User-Agent header for some scraping tools to prevent blocking
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(url, headers=header)

tables = pd.read_html(StringIO(r.text))

# The bird data starts after the table of contents and introductory tables.
# We filter for tables that contain the column "Common name" and "Binomial"
bird_tables = [df for df in tables if "Common name" in df.columns and "Binomial" in df.columns]

# Concatenate all matching tables into one DataFrame
df_birds = pd.concat(bird_tables, ignore_index=True)

# Clean up the data: Remove any rows that might be sub-headers or empty
df_birds = df_birds.dropna(subset=['Common name', 'Binomial'])
df_birds.drop('Status', axis=1, errors='ignore', inplace=True)
df_birds.reset_index(drop=True, inplace=True)

# Removes citations like [1] or [notes 1] from the names
df_birds['Common name'] = df_birds['Common name'].str.replace(r'\[.*\]', '', regex=True).str.strip()

# Display the first few rows and the total count
print(f"Total birds found: {len(df_birds)}")
print(df_birds.head())

# Optional: Save to CSV
folder = "C:\\Users\\scuba\\Downloads\\BIRD APP\\bird-trainer\\2 ALL BIRD DATA\\WIKIPEDIA\\"
df_birds.to_csv(folder + 'wikipedia_birdlist.csv')

Total birds found: 880
                              Common name                            Binomial
0  Common ostrich (South African ostrich)  Struthio camelus (S. c. australis)
1              White-faced whistling-duck                 Dendrocygna viduata
2                  Fulvous whistling-duck                 Dendrocygna bicolor
3                       White-backed duck             Thalassornis leuconotus
4                               Mute swan                         Cygnus olor


In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_birds_of_South_Africa"
base_url = "https:"

# 1. Fetch the page
header = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')

bird_data = []

# 2. Wikipedia bird lists usually group by 'h2' or 'h3' families
# Most bird entries are within tables with class 'wikitable'
tables = soup.find_all('table', class_='wikitable')

for table in tables:
    rows = table.find_all('row') if table.find('row') else table.find_all('tr')
    
    for row in rows[1:]:  # Skip header
        cols = row.find_all('td')
        if len(cols) >= 2:
            common_name = cols[0].get_text(strip=True)
            binomial = cols[1].find('i').get_text(strip=True) if cols[1].find('i') else cols[1].get_text(strip=True)
            
            # --- Extract Image ---
            # Images are often in the first column or a dedicated column
            img_tag = row.find('img')
            img_url = base_url + img_tag['src'] if img_tag else "No Image"
            
            # --- Extract Audio (Calls) ---
            # Calls on Wikipedia are usually 'audio' tags or '.ogg' links
            audio_tag = row.find('audio')
            audio_url = ""
            if audio_tag:
                source = audio_tag.find('source')
                audio_url = base_url + source['src'] if source else "No Audio"
            else:
                # Sometimes calls are just links to Wikimedia Commons files
                audio_link = row.find('a', href=True)
                if audio_link and '.ogg' in audio_link['href']:
                    audio_url = base_url + audio_link['href']
                else:
                    audio_url = "No Audio"

            bird_data.append({
                "Common Name": common_name,
                "Binomial": binomial,
                "Image URL": img_url,
                "Call URL": audio_url
            })

# 3. Create DataFrame
df_media = pd.DataFrame(bird_data)

# Display a sample of birds that have calls
birds_with_calls = df_media[df_media['Call URL'] != "No Audio"]
print(birds_with_calls.head())



Empty DataFrame
Columns: [Common Name, Binomial, Image URL, Call URL]
Index: []


In [2]:
len(bird_data)

880

In [3]:
df_media

Unnamed: 0,Common Name,Binomial,Image URL,Call URL
0,Common ostrich(South African ostrich),Struthio camelus,No Image,No Audio
1,White-faced whistling-duck,Dendrocygna viduata,No Image,No Audio
2,Fulvous whistling-duck,Dendrocygna bicolor,No Image,No Audio
3,White-backed duck,Thalassornis leuconotus,No Image,No Audio
4,Mute swan,Cygnus olor,No Image,No Audio
...,...,...,...,...
875,Ortolan bunting,Emberiza hortulana,No Image,No Audio
876,Golden-breasted bunting,Emberiza flaviventris,No Image,No Audio
877,Cape bunting,Emberiza capensis,No Image,No Audio
878,Lark-like bunting,Emberiza impetuani,No Image,No Audio


In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import StringIO

url = "https://en.wikipedia.org/wiki/List_of_birds_of_South_Africa"
header = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')

# 1. First, get the text data using the method we used before
tables = pd.read_html(StringIO(response.text))
bird_tables = [df for df in tables if "Common name" in df.columns]
df_birds = pd.concat(bird_tables, ignore_index=True)

# 2. Scrape all Image and Audio links from the page
# Wikipedia stores these in 'thumb' divs and 'media' spans
images = []
for img in soup.find_all('img'):
    if 'upload.wikimedia.org' in img.get('src', ''):
        # We clean the URL to get a usable link
        images.append("https:" + img['src'])

# Audio files are usually in 'audio' tags or specific .ogg links
audio_links = []
for a in soup.find_all('a', href=True):
    if '.ogg' in a['href']:
        audio_links.append("https://en.wikipedia.org" + a['href'])

# 3. Create a summary of what we found
print(f"Total Bird Species: {len(df_birds)}")
print(f"Total Images Found: {len(images)}")
print(f"Total Audio Files Found: {len(audio_links)}")

# 4. Display the first few birds with their names
print(df_birds[['Common name', 'Binomial']].head(10))

Total Bird Species: 880
Total Images Found: 50
Total Audio Files Found: 0
                              Common name                            Binomial
0  Common ostrich (South African ostrich)  Struthio camelus (S. c. australis)
1              White-faced whistling-duck                 Dendrocygna viduata
2                  Fulvous whistling-duck                 Dendrocygna bicolor
3                       White-backed duck             Thalassornis leuconotus
4                               Mute swan                         Cygnus olor
5                        Knob-billed duck              Sarkidiornis melanotos
6                          Egyptian goose                Alopochen aegyptiaca
7                  South African shelduck                        Tadorna cana
8                       Spur-winged goose             Plectropterus gambensis
9                     African pygmy-goose                    Nettapus auritus


In [7]:
# for Xeno-Canto

import pandas as pd
import requests
import time

def get_xeno_canto_media(scientific_name):
    """
    Queries Xeno-Canto for Quality 'A' recordings between 30-60 seconds.
    """
    # API Search string: 
    # qlt:A (Quality A)
    # len:30-60 (Length in seconds)
    query = f"{scientific_name} qlt:A len:30-60"
    api_url = f"https://xeno-canto.org/api/2/recordings?query={query}"
    
    try:
        response = requests.get(api_url, timeout=10)
        data = response.json()
        
        results = []
        if int(data.get('numRecordings', 0)) > 0:
            # We iterate through the recordings to categorize types
            for rec in data['recordings']:
                results.append({
                    "Scientific Name": scientific_name,
                    "Type": rec.get('type', 'Unknown'),
                    "Quality": rec.get('q'),
                    "Length": rec.get('length'),
                    "Audio URL": rec.get('file'),
                    "Location": rec.get('loc'),
                    "Recordist": rec.get('rec')
                })
        return results
    except Exception as e:
        print(f"Error fetching {scientific_name}: {e}")
        return []

# --- MAIN EXECUTION ---
# Assuming 'df_birds' is your DataFrame from the previous step
# Let's test with the first 5 birds to avoid hitting the API too hard
sample_birds = df_birds['Binomial'].unique()[:5]

all_recordings = []

print("Fetching high-quality calls from Xeno-Canto...")
for bird in sample_birds:
    print(f"Searching for: {bird}")
    media = get_xeno_canto_media(bird)
    all_recordings.extend(media)
    time.sleep(1) # Polite delay for API rate limits

# Create the final Media DataFrame
df_calls = pd.DataFrame(all_recordings)

# Display results
if not df_calls.empty:
    print("\n--- Found Recordings ---")
    print(df_calls[['Scientific Name', 'Type', 'Length', 'Audio URL']].head(10))
else:
    print("No matches found for the criteria.")

Fetching high-quality calls from Xeno-Canto...
Searching for: Struthio camelus (S. c. australis)
Searching for: Dendrocygna viduata
Searching for: Dendrocygna bicolor
Searching for: Thalassornis leuconotus
Searching for: Cygnus olor
No matches found for the criteria.


In [8]:
import os
import pandas as pd
import requests
import time
from io import StringIO
from pathlib import Path

def get_recordings(scientific_name, quality, length_range):
    """Helper to query the API with specific parameters."""
    query = f"{scientific_name} qlt:{quality} len:{length_range}"
    api_url = f"https://xeno-canto.org/api/2/recordings?query={query}"
    try:
        response = requests.get(api_url, timeout=15)
        return response.json().get('recordings', [])
    except:
        return []

def download_bird_calls_robust(df_birds, target_folder="SA_Bird_Library"):
    main_path = Path(target_folder)
    main_path.mkdir(exist_ok=True)
    
    for _, bird in df_birds.iterrows():
        name = bird['Common name']
        sci_name = bird['Binomial']
        bird_dir = main_path / name.replace("/", "-")
        bird_dir.mkdir(exist_ok=True)

        print(f"Searching for {name}...")

        # Strategy 1: Your Ideal (Quality A, 30-60s)
        recs = get_recordings(sci_name, "A", "30-60")

        # Strategy 2: Fallback (Quality A or B, 10-90s)
        if not recs:
            print(f"  No ideal match. Trying broader search (Q:A-B, 10-90s)...")
            recs = get_recordings(sci_name, "B", "10-90")

        # Strategy 3: Final Fallback (Any Quality, any length)
        if not recs:
            print(f"  Still no match. Pulling any available recording...")
            recs = get_recordings(sci_name, "D", "1-500")[:2] # Grab whatever is there

        if recs:
            for rec in recs[:3]: # Limit to 3 files per bird
                file_url = rec.get('file')
                rec_id = rec.get('id')
                # Categorize by type (song, alarm, etc.)
                call_type = rec.get('type', 'call').split(',')[0].replace(" ", "_")
                file_name = f"{call_type}_Q{rec.get('q')}_{rec_id}.mp3"
                
                # Download
                if not (bird_dir / file_name).exists():
                    audio_data = requests.get(file_url)
                    with open(bird_dir / file_name, 'wb') as f:
                        f.write(audio_data.content)
                    print(f"    Downloaded: {file_name}")
        else:
            print(f"  [!] No recordings found on Xeno-Canto for {sci_name}")
            
        time.sleep(1.2) # Be kind to the API

# Run the robust downloader
download_bird_calls_robust(df_birds)

Searching for Common ostrich (South African ostrich)...
  No ideal match. Trying broader search (Q:A-B, 10-90s)...
  Still no match. Pulling any available recording...
  [!] No recordings found on Xeno-Canto for Struthio camelus (S. c. australis)
Searching for White-faced whistling-duck...
  No ideal match. Trying broader search (Q:A-B, 10-90s)...
  Still no match. Pulling any available recording...
  [!] No recordings found on Xeno-Canto for Dendrocygna viduata
Searching for Fulvous whistling-duck...
  No ideal match. Trying broader search (Q:A-B, 10-90s)...
  Still no match. Pulling any available recording...
  [!] No recordings found on Xeno-Canto for Dendrocygna bicolor
Searching for White-backed duck...
  No ideal match. Trying broader search (Q:A-B, 10-90s)...
  Still no match. Pulling any available recording...
  [!] No recordings found on Xeno-Canto for Thalassornis leuconotus
Searching for Mute swan...
  No ideal match. Trying broader search (Q:A-B, 10-90s)...
  Still no match

KeyboardInterrupt: 