In [3]:
import os
import requests
import re
from pathlib import Path

To use the selenium and the ChromeDriver, you will have to:

- `conda install -c conda-forge selenium`

Then you need to download ChromeDriver, install it here: <br>
https://sites.google.com/a/chromium.org/chromedriver/downloads

Be sure you download the right Chrome version. <br>
If you don't know your chrome version, you just need to click at options, help, about chrome and then you will find the right version for you. 

Once you have the chromedriver downloaded, run thoses lines:

```
sudo mv chromedriver /usr/bin/chromedriver
sudo chown root:root /usr/bin/chromedriver
sudo chmod +x /usr/bin/chromedriver
```

In case you are using a Windows Environment, you need to add the chromedriver to `path`

In [4]:
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('window-size=1920x1080')
driver = webdriver.Chrome(options=options)
driver.get("https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11644&o=mp")

# Test - Downloading the bird's audio:

- I've selected a random bird to check how the download can be done

In [5]:
# Testing if selenium is working properly 

title = driver.find_element_by_class_name('wa-registros-titulo')
complete_name = title.text.strip('Sons de ')
popular_name = complete_name.split(' ')[0]
cientific_name = complete_name[complete_name.find('(') + 1: complete_name.find(')')] 

print(complete_name)
print(popular_name)
print(cientific_name)

canário-da-terra (Sicalis flaveola)
canário-da-terra
Sicalis flaveola


In [6]:
# Testing the download

audios = driver.find_elements_by_tag_name('audio')
# Verify if the folder exists, if not, it will create it
Path("audios").mkdir(parents=True, exist_ok=True)

for n, audio_src in enumerate(audios, 0):
    url = audio_src.get_attribute('src')
    r = requests.get(url, allow_redirects=True)
    filename = '../audios/' + popular_name + "_" + str(n) + ".mp3"
    open(filename, 'wb').write(r.content)

# Test - Getting the link to the bird page based on a region: 

- The following link has a search to show only the birds in the Lages - SC (BR) region

In [7]:
driver.get("https://www.wikiaves.com.br/especies.php?t=c&c=4209300&o=5")
table = driver.find_element_by_tag_name('table')

In [8]:
table_body = table.find_element_by_tag_name('tbody')
table_rows = table_body.find_elements_by_tag_name('tr')

In [9]:
# Checking the the elements in the table
for row_n, tr in enumerate(table_rows[:10], start= 1):
    for td_n, td in enumerate(tr.find_elements_by_tag_name('a'), start= 1):
        if(td.get_attribute("class") == "font-green-dark"):
            print(td.get_attribute('href'))

https://www.wikiaves.com.br/wiki/curicaca
https://www.wikiaves.com.br/wiki/canario-da-terra
https://www.wikiaves.com.br/wiki/sabia-do-campo
https://www.wikiaves.com.br/wiki/carrapateiro
https://www.wikiaves.com.br/wiki/coruja-buraqueira


**As we can see, links can repeat multiple time, so we will have to save it as a set, so we don't have repeated links**

In [10]:
# Creates a set, so links will not repeat
bird_page_set = set([])

# Checking the the elements in the table
for row_n, tr in enumerate(table_rows, start= 1):
    for td_n, td in enumerate(tr.find_elements_by_tag_name('a'), start= 1):
        if(td.get_attribute("class") == "font-green-dark"):
            bird_page_set.add(td.get_attribute('href'))

print("First 10 links of our search:\n", list(bird_page_set)[:10])

First 10 links of our search:
 ['https://www.wikiaves.com.br/wiki/aguia-serrana', 'https://www.wikiaves.com.br/wiki/sabia-laranjeira', 'https://www.wikiaves.com.br/wiki/pica-pau-rei', 'https://www.wikiaves.com.br/wiki/arapacu-grande', 'https://www.wikiaves.com.br/wiki/surucua-variado', 'https://www.wikiaves.com.br/wiki/avoante', 'https://www.wikiaves.com.br/wiki/curiango-do-banhado', 'https://www.wikiaves.com.br/wiki/marreca-de-coleira', 'https://www.wikiaves.com.br/wiki/coro-coro', 'https://www.wikiaves.com.br/wiki/coruja-listrada']


## Getting into the audio page

In [11]:
driver.get(list(bird_page_set)[0])

In [12]:
driver.find_element_by_link_text("Sons").get_attribute('href')

'https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10231'

# Doing the real thing

Now that we have the list of the links, we can now run the following code to download the audios into the audios folder

In [13]:
links_list = list(bird_page_set)

In [14]:
for bird_page in list(bird_page_set):
    # Opens the bird wiki page
    driver.get(bird_page)
    # Searches for the link that takes into the bird's audios
    audio_page = driver.find_element_by_link_text("Sons").get_attribute('href')
    print(audio_page)
    # Opens the audio's page
    driver.get(audio_page)

    title = driver.find_element_by_class_name('wa-registros-titulo')
    complete_name = title.text.strip('Sons de ')
    popular_name = complete_name.split(' ')[0]

    print(popular_name)
    # Testing the download

    audios = driver.find_elements_by_tag_name('audio')
    # Verify if the folder exists, if not, it will create it
    Path("audios").mkdir(parents=True, exist_ok=True)

    print("Downloading audios")
    
    for n, audio_src in enumerate(audios, 0):
        url = audio_src.get_attribute('src')
        r = requests.get(url, allow_redirects=True)
        filename = '../audios/' + popular_name + "_" + str(n) + ".mp3"
        open(filename, 'wb').write(r.content)

https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10231
águia-serrana
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11523
abiá-laranjeira
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10799
pica-pau-rei
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=11028
arapaçu-grande
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10666
urucuá-variado
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10401
avoante
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10563
curiango-do-banhado
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10035
marreca-de-coleira
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10175
coró-coró
Downloading audios
https://www.wikiaves.com.br/midias.php?tm=s&t=s&s=10523
coruja-listrada
Downloading audios


KeyboardInterrupt: 