In [1]:
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [129]:
class URLScraper():
    """
        Collect the URLs of every item shown on a paginated listing page.

        Starting from a listing URL such as
        http://www.animalia.bio/social-animals?page=1, the scraper repeatedly
        clicks the link to the next page and stores the ``href`` of every
        ``item-animal`` element observed along the way. URLs accumulate in
        ``self.urls`` across calls to ``traverse`` and are persisted to ``file``.
    """

    def __init__(self, file):
        """
            file -- path the URLs are written to, one per line. If the file
                    already exists its contents seed ``self.urls`` so a scrape
                    can be resumed; a missing file simply means "start empty".
        """
        self.urls = set()
        self.file = file
        self.browser = None
        # Seconds to wait for page content / page transitions.
        self.MAX_DELAY = 10
        try:
            with open(file) as f:
                for line in f:
                    url = line.strip()
                    # Skip blank lines so an empty string never becomes a "URL".
                    if url:
                        self.urls.add(url)
        except FileNotFoundError:
            # First run: nothing to resume from.
            pass

    def write(self):
        """Write all collected URLs to ``self.file``, one per line, sorted for stable output."""
        with open(self.file, 'w') as f:
            for u in sorted(self.urls):
                f.write(u + "\n")

    def _quit_browser(self):
        """Shut down the current browser session, if any, and forget it."""
        browser = getattr(self, "browser", None)
        if browser is not None:
            try:
                browser.quit()
            finally:
                self.browser = None

    def traverse(self, base_url):
        """
            Start at the first page and keep clicking to the next page until we run out of pages.

            base_url -- listing page to start from, e.g.
                        http://www.animalia.bio/social-animals?page=1

            The collected URLs are written to ``self.file`` and the browser is
            closed when the traversal ends, whether it finished normally or
            was interrupted by an error.
        """
        # Make sure a browser left over from an earlier call is not leaked.
        self._quit_browser()
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        self.browser = webdriver.Chrome(options=options)
        try:
            self.browser.get(base_url)
            page_number = 1
            while True:
                # Wait for the Javascript-rendered items to appear, then harvest their links.
                try:
                    WebDriverWait(self.browser, self.MAX_DELAY).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'item-animal')))
                    for e in self.browser.find_elements(By.CLASS_NAME, "item-animal"):
                        link = e.get_attribute("href")
                        # get_attribute returns None when the element has no href.
                        if link:
                            self.urls.add(link.strip())
                    print(f"Number of total species scraped: {len(self.urls)}")
                except TimeoutException:
                    # Nothing showed up in time on this page; still try to move on.
                    pass

                # Attempt to click on the next page; no such link means we are done.
                next_label = str(page_number + 1)
                try:
                    self.browser.find_element(By.LINK_TEXT, next_label).click()
                except Exception:
                    return

                # Wait for the new page to appear. The clicked pagination link is
                # polled until it carries a non-empty "style" attribute
                # (presumably the site's marker for the active page -- TODO confirm).
                # The wait is bounded so a page that never updates cannot hang the scrape.
                try:
                    for _ in range(self.MAX_DELAY):
                        if self.browser.find_element(By.LINK_TEXT, next_label).get_attribute("style"):
                            break
                        time.sleep(1)
                except Exception:
                    # Best effort: the link could not be read (e.g. the DOM was
                    # being re-rendered), so just give the page time to settle.
                    time.sleep(self.MAX_DELAY)

                page_number += 1
        finally:
            # Persist progress even when the traversal was cut short.
            self.write()
            self._quit_browser()

    def __del__(self):
        # The browser may never have been created, or may already be closed.
        try:
            self._quit_browser()
        except Exception:
            pass

In [130]:
# Listing pages to crawl, one per animalia.bio category; each starts at page 1.
to_traverse = [
    "http://animalia.bio/reptiles?page=1",
    "http://animalia.bio/mammals?page=1",
    "http://animalia.bio/social-animals?page=1",
    "http://animalia.bio/solitary-animals?page=1",
    "http://animalia.bio/carnivore?page=1",
    "http://animalia.bio/cursorial?page=1",
    "http://animalia.bio/tropical-moist-forests?page=1",
    "http://animalia.bio/monogamy?page=1",
    "http://animalia.bio/temperate-broadleaf-and-mixed-forest?page=1",
    "http://animalia.bio/birds?page=1",
    "http://animalia.bio/temperate?page=1",
]

In [131]:
# A single scraper instance is shared by every category, so all discovered
# URLs accumulate in one set backed by links.txt.
scraper = URLScraper("links.txt")
for listing_url in to_traverse:
    print(f"Traversing: {listing_url}")
    scraper.traverse(listing_url)

Traversing: http://animalia.bio/reptiles?page=1
Number of total species scraped: 42
Number of total species scraped: 84
Number of total species scraped: 126
Number of total species scraped: 168
Number of total species scraped: 210
Number of total species scraped: 252
Number of total species scraped: 259
Traversing: http://animalia.bio/mammals?page=1
Number of total species scraped: 301
Number of total species scraped: 343
Number of total species scraped: 385
Number of total species scraped: 427
Number of total species scraped: 469
Number of total species scraped: 511
Number of total species scraped: 553
Number of total species scraped: 595
Number of total species scraped: 637
Traversing: http://animalia.bio/social-animals?page=1
Number of total species scraped: 654
Number of total species scraped: 671
Number of total species scraped: 678
Number of total species scraped: 678
Number of total species scraped: 718
Number of total species scraped: 740
Number of total species scraped: 740
Nu

In [4]:
import json

# Load the annotation file. The context manager guarantees the file handle
# is closed once parsing finishes (or fails).
with open('train.json') as f:
    train_annotations = json.load(f)
train_annotations

{'info': {'year': 2021,
  'verion': 1,
  'description': 'iNaturalist Species Classification Dataset Training Split.',
  'contributor': 'Grant Van Horn and the Visipedia Team.',
  'url': 'https://github.com/visipedia/inat_comp',
  'date_created': '2021-03-01 12:34:38'},
 'images': [{'id': 0,
   'width': 500,
   'height': 500,
   'file_name': 'train/02912_Animalia_Chordata_Actinopterygii_Siluriformes_Ictaluridae_Ameiurus_nebulosus/d615f184-8af4-4c60-b9f8-3081c1607644.jpg',
   'license': 0,
   'rights_holder': 'Ken-ichi Ueda',
   'date': '2010-07-14 20:19:00+00:00',
   'latitude': 43.83486,
   'longitude': -71.22231,
   'location_uncertainty': 77},
  {'id': 1,
   'width': 500,
   'height': 333,
   'file_name': 'train/04831_Animalia_Chordata_Mammalia_Rodentia_Sciuridae_Marmota_flaviventris/4653466f-f7b5-4e6c-985e-e059c1493b8f.jpg',
   'license': 0,
   'rights_holder': 'Michelle S. Koo',
   'date': '2010-07-06 22:17:00+00:00',
   'latitude': 38.86995,
   'longitude': -120.19326,
   'locatio

In [20]:
print(train_annotations.keys())
# Lower-cased 'name' of every entry under 'categories', kept in a set for
# fast membership checks.
animalia_species = {
    category['name'].lower() for category in train_annotations['categories']
}

dict_keys(['info', 'images', 'categories', 'annotations', 'licenses'])


In [101]:
# Site root and a few page slugs to try against it.
base_url = "http://www.animalia.bio/"
test_species = [
    "pine-siskin",
    "radiated-tortoise",
    "gharial",
]

In [30]:
from tqdm.notebook import tqdm
import csv

# Visit every URL in links.txt and record one CSV row per page:
#   full_animalia_scrape.csv              -- every page scraped
#   intersection_animalia_inaturalist.csv -- only pages whose species name is
#                                            also present in animalia_species
titles = ["Common name", "Species", "Full taxonomy", "Description"]

with open("links.txt") as f:
    # Drop blank lines so the browser is never asked to load an empty URL.
    links = [line.strip() for line in f if line.strip()]

options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('disable-gpu')
browser = webdriver.Chrome(options=options)

count = 0
try:
    # newline='' is what the csv module expects for files it writes to
    # (avoids doubled line endings on some platforms); both files are opened
    # once instead of being reopened for every row.
    with open('intersection_animalia_inaturalist.csv', 'w', newline='', encoding='utf-8') as intersection_file, \
            open('full_animalia_scrape.csv', 'w', newline='', encoding='utf-8') as full_file:
        intersection_writer = csv.writer(intersection_file)
        full_writer = csv.writer(full_file)
        intersection_writer.writerow(titles)
        full_writer.writerow(titles)

        for url in tqdm(links):
            browser.get(url)
            els = [e.text for e in browser.find_elements(By.CLASS_NAME, "s-char-kinds__name")]
            if not els:
                # No taxonomy entries on this page: there is no species name
                # to record, so skip it rather than fail on els[-1].
                print(f"Skipping {url}: no taxonomy entries found")
                continue
            # The last taxonomy entry is used as the species name.
            species = els[-1].lower()
            description = browser.find_element(By.CLASS_NAME, "s-char-text").text
            common_name = browser.find_element(By.CLASS_NAME, "a-h1").text.lower()
            row = [common_name, species, "_".join(els), description]
            if species in animalia_species:
                count += 1
                intersection_writer.writerow(row)
            full_writer.writerow(row)
finally:
    # Always release the browser, even if a page failed part-way through.
    browser.quit()
print(count)

  0%|          | 0/1570 [00:00<?, ?it/s]

674


In [46]:
# compute number of images per species
map_species_count = {}
for i in train_annotations['images']:
    species = " ".join(i['file_name'].split("/")[1].split("_")[-2:]).lower()
    if not(species in map_species_count):
        map_species_count[species] = 1
    else:
        map_species_count[species] = map_species_count[species] + 1

In [55]:
import matplotlib.pyplot as plt
import pandas as pd

# Look up the image count of every species listed in the intersection CSV,
# then report the total, smallest and largest count.
df = pd.read_csv('intersection_animalia_inaturalist.csv')
count = 0
map_species_count_intersection = {
    name: map_species_count[name] for name in df['Species']
}

vls = map_species_count_intersection.values()
print(f"{sum(vls)}, {min(vls)}, {max(vls)}")

195605, 156, 300
