In [1]:
import os
import requests
from pathlib import Path

To use Selenium with ChromeDriver, you will have to:

- `conda install -c conda-forge selenium`

Then download ChromeDriver from here: <br>
https://sites.google.com/a/chromium.org/chromedriver/downloads

Be sure to download the ChromeDriver release that matches your Chrome version. <br>
If you don't know your Chrome version, open the Chrome menu, go to Help → About Google Chrome, and the version will be shown there.

Once you have downloaded chromedriver, run these lines:

```
sudo mv chromedriver /usr/bin/chromedriver
sudo chown root:root /usr/bin/chromedriver
sudo chmod +x /usr/bin/chromedriver
```

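If you are using a Windows environment, you need to add the chromedriver executable to your `PATH` instead.

Note: the setup cell below starts Firefox through geckodriver (expected at `/usr/bin/geckodriver`), so geckodriver must also be available at that path.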

In [8]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By

# Pass the '-headless' flag so Firefox runs without opening a visible window
options = webdriver.FirefoxOptions()
options.add_argument('-headless')
# Configure the WebDriver: point the service at the geckodriver binary
webdriver_service = Service('/usr/bin/geckodriver')
driver = webdriver.Firefox(service=webdriver_service, options=options)

# Load the wikiaves page that the following test cells inspect
driver.get("https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11644&o=mp")

# Test - Downloading the bird's audio:

- I've selected a random bird to check how the download can be done

In [9]:
# Testing if selenium is working properly

title = driver.find_element(By.CLASS_NAME, 'wa-registros-titulo')

# str.strip() takes a *set of characters*, not a prefix: strip('Sons de ')
# would also eat leading/trailing letters of the name itself (e.g. the "s"
# in "suindara"). Remove the literal heading prefix instead.
heading_prefix = 'Sons de '
complete_name = title.text.strip()
if complete_name.startswith(heading_prefix):
    complete_name = complete_name[len(heading_prefix):]

# The heading has the form "<popular name> (<scientific name>)"
popular_name = complete_name.split(' ')[0]
cientific_name = complete_name[complete_name.find('(') + 1: complete_name.find(')')]

print(complete_name)
print(popular_name)
print(cientific_name)

canário-da-terra (Sicalis flaveola)
canário-da-terra
Sicalis flaveola


In [14]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor

# Verify if the folder exists, if not, it will create it
folder_path = "../audio/"
os.makedirs(folder_path, exist_ok=True)

# Testing the download
audios = driver.find_elements(By.TAG_NAME, 'audio')

# Limit the number of downloaded files to 20
max_files = 20

# Define a function to download a single file
def download_file(n, audio_src):
    """Download one audio file into ``folder_path``.

    Args:
        n: Index appended to the file name.
        audio_src: Either the audio URL as a string, or a Selenium element
            whose ``src`` attribute holds the URL.

    Raises:
        requests.RequestException: On network errors, timeouts, or when the
            server answers with an HTTP error status.
    """
    url = audio_src if isinstance(audio_src, str) else audio_src.get_attribute('src')
    # A timeout keeps a stalled connection from blocking a worker forever
    r = requests.get(url, allow_redirects=True, timeout=30)
    # Do not save an HTML error page under an .mp3 name
    r.raise_for_status()
    filename = folder_path + popular_name + "_" + str(n) + ".mp3"
    # The context manager guarantees the file is flushed and closed
    with open(filename, 'wb') as audio_file:
        audio_file.write(r.content)

# Read the URLs on the main thread: a WebDriver session should not be used
# from several threads at once, so the workers only receive plain strings.
audio_urls = [audio.get_attribute('src') for audio in audios[:max_files]]

# Use a ThreadPoolExecutor to download the files in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [
        executor.submit(download_file, n, url)
        for n, url in enumerate(audio_urls, 0)
    ]

# Exceptions raised inside a worker stay hidden until result() is called,
# so surface them here instead of silently losing failed downloads.
for n, future in enumerate(futures, 0):
    try:
        future.result()
    except Exception as exc:
        print(f"Download {n} failed: {exc}")

# Test - Getting the link to the bird page based on a region: 

- The following link runs a search that lists only the birds of the Lages - SC (BR) region

In [16]:
# Open the region's species listing and grab the first <table> element on the page
driver.get("https://www.wikiaves.com.br/especies.php?t=c&c=4209300&o=5")
table = driver.find_element(By.TAG_NAME, 'table')

In [17]:
# Collect every row (<tr>) found inside the table body
table_body = table.find_element(By.TAG_NAME, 'tbody')
table_rows = table_body.find_elements(By.TAG_NAME, 'tr')

In [19]:
# Checking the elements in the table: for the first 10 rows, print the href of
# every link whose class is exactly "font-green-dark"
for row in table_rows[:10]:
    for anchor in row.find_elements(By.TAG_NAME, 'a'):
        if anchor.get_attribute("class") != "font-green-dark":
            continue
        print(anchor.get_attribute('href'))

https://www.wikiaves.com.br/wiki/curicaca
https://www.wikiaves.com.br/wiki/canario-da-terra
https://www.wikiaves.com.br/wiki/sabia-do-campo
https://www.wikiaves.com.br/wiki/caboclinho-de-barriga-preta
https://www.wikiaves.com.br/wiki/veste-amarela


**As we can see, links can repeat multiple times, so we will store them in a set to avoid duplicates**

In [20]:
# Gather the hrefs of all "font-green-dark" links in the table into a set,
# so a link that appears in several rows is stored only once
bird_page_set = {
    anchor.get_attribute('href')
    for row in table_rows
    for anchor in row.find_elements(By.TAG_NAME, 'a')
    if anchor.get_attribute("class") == "font-green-dark"
}

print("First 10 links of our search:\n", list(bird_page_set)[:10])

First 10 links of our search:
 ['https://www.wikiaves.com.br/wiki/suindara', 'https://www.wikiaves.com.br/wiki/arapacu-verde', 'https://www.wikiaves.com.br/wiki/gaviao-bombachinha-grande', 'https://www.wikiaves.com.br/wiki/gralha-picaca', 'https://www.wikiaves.com.br/wiki/papa-moscas-cinzento', 'https://www.wikiaves.com.br/wiki/tovaca-campainha', 'https://www.wikiaves.com.br/wiki/pica-pau-do-campo', 'https://www.wikiaves.com.br/wiki/martim-pescador-verde', 'https://www.wikiaves.com.br/wiki/galinha-d_agua', 'https://www.wikiaves.com.br/wiki/papa-lagarta-acanelado']


## Getting into the audio page

In [21]:
# Open the wiki page of the first link the set yields
first_bird_page = list(bird_page_set)[0]
driver.get(first_bird_page)

In [23]:
# Locate the "Sons" link on the bird page and show the URL it points to
sons_link = driver.find_element(By.LINK_TEXT, "Sons")
sons_link.get_attribute('href')

'https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10512'

# Doing the real thing

Now that we have the list of links, we can run the following code to download the audios into the `audios` folder.

In [24]:
# Materialise the set as a list so the links can be indexed and sliced.
# NOTE(review): set iteration order is arbitrary, so the order of this list
# is not guaranteed to be the same in another kernel session.
links_list = list(bird_page_set)

In [26]:
# Show the collected links (a bare last expression is rendered as the cell output)
links_list

['https://www.wikiaves.com.br/wiki/suindara',
 'https://www.wikiaves.com.br/wiki/arapacu-verde',
 'https://www.wikiaves.com.br/wiki/gaviao-bombachinha-grande',
 'https://www.wikiaves.com.br/wiki/gralha-picaca',
 'https://www.wikiaves.com.br/wiki/papa-moscas-cinzento',
 'https://www.wikiaves.com.br/wiki/tovaca-campainha',
 'https://www.wikiaves.com.br/wiki/pica-pau-do-campo',
 'https://www.wikiaves.com.br/wiki/martim-pescador-verde',
 'https://www.wikiaves.com.br/wiki/galinha-d_agua',
 'https://www.wikiaves.com.br/wiki/papa-lagarta-acanelado',
 'https://www.wikiaves.com.br/wiki/beija-flor-de-topete-azul',
 'https://www.wikiaves.com.br/wiki/pica-pau-verde-barrado',
 'https://www.wikiaves.com.br/wiki/pica-pau-verde-carijo',
 'https://www.wikiaves.com.br/wiki/tauato-miudo',
 'https://www.wikiaves.com.br/wiki/abre-asa-de-cabeca-cinza',
 'https://www.wikiaves.com.br/wiki/besourinho-de-bico-vermelho',
 'https://www.wikiaves.com.br/wiki/mariquita',
 'https://www.wikiaves.com.br/wiki/figuinha-d

In [31]:
import concurrent.futures

def download_audio(audio_src, n, complete_name):
    """Download one audio file into the ``../audios/`` folder.

    Args:
        audio_src: Either the audio URL as a string, or a Selenium element
            whose ``src`` attribute holds the URL.
        n: Index appended to the file name.
        complete_name: Bird name used as the file-name stem.

    Raises:
        requests.RequestException: On network errors, timeouts, or when the
            server answers with an HTTP error status.
    """
    url = audio_src if isinstance(audio_src, str) else audio_src.get_attribute('src')
    # A timeout keeps a stalled connection from blocking a worker forever
    r = requests.get(url, allow_redirects=True, timeout=30)
    # Do not save an HTML error page under an .mp3 name
    r.raise_for_status()
    filename = '../audios/' + complete_name + "_" + str(n) + ".mp3"
    # The context manager guarantees the file is flushed and closed
    with open(filename, 'wb') as audio_file:
        audio_file.write(r.content)

# Verify if the folder exists, if not, it will create it.
# This must be the same folder download_audio() writes into ('../audios/');
# creating "audios" in the current directory would leave every write failing.
Path("../audios").mkdir(parents=True, exist_ok=True)

# Maximum number of audio files downloaded per bird
max_files = 20

for bird_page in list(bird_page_set)[:30]:
    # Opens the bird wiki page
    driver.get(bird_page)
    # Searches for the link that takes into the bird's audios
    audio_page = driver.find_element(By.LINK_TEXT, "Sons").get_attribute('href')
    print(audio_page)
    # Opens the audio's page
    driver.get(audio_page)

    title = driver.find_element(By.CLASS_NAME, 'wa-registros-titulo')
    complete_name = title.text.replace('Sons de ', '')

    # Remove special characters so the name is usable as a file-name stem
    complete_name = complete_name.replace('(', '').replace(')', '').replace(' ', '_')

    print(complete_name)

    audios = driver.find_elements(By.TAG_NAME, 'audio')
    # Read the URLs on the main thread: a WebDriver session should not be
    # used from several threads at once, so workers only receive strings.
    audio_urls = [audio.get_attribute('src') for audio in audios[:max_files]]

    print("Downloading audios")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(download_audio, url, n, complete_name)
            for n, url in enumerate(audio_urls, 0)
        ]

    # Exceptions raised inside a worker stay hidden until result() is called,
    # so surface them here instead of silently losing failed downloads.
    for n, future in enumerate(futures, 0):
        try:
            future.result()
        except Exception as exc:
            print(f"Failed to download {complete_name}_{n}.mp3: {exc}")

https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10512
suindara_Tyto_furcata
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11011
arapaçu-verde_Sittasomus_griseicapillus
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10212
gavião-bombachinha-grande_Accipiter_bicolor
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11473
gralha-picaça_Cyanocorax_chrysops
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11287
papa-moscas-cinzento_Contopus_cinereus
Downloading audios
