In [None]:
# !pip install torch torchvision
# !pip install matplotlib pandas scikit-learn
# !pip install pillow
# !pip install opencv-python

# Scrape https://www.indianodonata.org for butterfly dataset

In [1]:
dataset_dir = "insect-dataset/src/indianodonata.org"
class_details_json_path = "insect-dataset/src/class_details.indianodonata.org.json"

website_url = "https://www.indianodonata.org"
initial_path = "/odonata"
first_page = 0
last_page = 26

batch_size = 3
max_workers = 50
page_timeout = 120
image_timeout = 30

ignore_image_regex = r"^(imgs10|.*(iucn-red-list|mobileapp|butterfliesofurbangreeneries|webheader|headerlogo|WPA-[IVX]+).*)\.(png|jpg|jpeg)$"
skip_downloaded_species = False

early_stage_suffix = '-early'

In [2]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor

def log_header():
    return f"[ {threading.current_thread().name:24} ]  "

def download_image(img_url, output_dir):
    try:
        img_data = requests.get(img_url, timeout=image_timeout).content
        img_name = img_url.split("/")[-1]
        img_path = os.path.join(output_dir, img_name)
        with open(img_path, 'wb') as file:
            file.write(img_data)
        return True
    except Exception as e:
        # print(f"{log_header()}{e}")
        return False

def has_parent_with_prop(tag, prop, value, max_parents):
    parent = tag
    for i in range(0, max_parents):
        parent = parent.parent
        if parent.get(prop) == value:
            return True
    return False
    
def scrape_images(url, output_dir):
    try:
        print(f"{log_header()}    Scraping URL: {url}")
        response = requests.get(url, timeout=page_timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        img_tags = soup.find_all('img')
        success_cnt = 0
        failure_cnt = 0
        already_downloaded_cnt = 0
        for img in img_tags:
            img_url = img.get('src')
            if has_parent_with_prop(img, 'id', 'laraval', 5):
                # larval host plants photo
                # print (f"Ignoring larval host plants image {img_url}")
                continue
            class_suffix = ''
            if has_parent_with_prop(img, 'id', 'early', 6):
                # early stages
                # print (f"Found early stages image {img_url}")
                class_suffix = early_stage_suffix
            if img_url:
                img_url = urljoin(url, img_url)
                img_name = img_url.split("/")[-1]
                if re.search(ignore_image_regex, img_name):
                    continue
                if os.path.exists(f"{output_dir}{class_suffix}/{img_name}"):
                    already_downloaded_cnt = already_downloaded_cnt + 1
                    continue
                if not os.path.exists(output_dir+class_suffix):
                    os.makedirs(output_dir+class_suffix)
                if download_image(img_url, output_dir+class_suffix):
                    success_cnt = success_cnt + 1
                else:
                    failure_cnt = failure_cnt + 1
        if success_cnt > 0:
            print(f"{log_header()}      Downloaded {success_cnt}(+{already_downloaded_cnt}) / {success_cnt+already_downloaded_cnt+failure_cnt} image(s) in {output_dir}")
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

def crawl(base_url, root, output_dir):
    try:
        url = urljoin(base_url, root)
        print(f"{log_header()}Crawling URL: {url}")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        response = requests.get(url, timeout=page_timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        img_tags = soup.find_all('img')
        for img in img_tags:
            img_url = img.get('src')
            if img_url:
                img_url = urljoin(url, img_url)
                # print (f"Found image {img_url}")
                img_name = img_url.split("/")[-1]
                if re.search(ignore_image_regex, img_name):
                    continue
                species_path = img.parent.parent.get('href')
                species_dir = output_dir+species_path.lower()
                if not (skip_downloaded_species and os.path.exists(species_dir) and os.path.isdir(species_dir) and os.listdir(species_dir)):
                    scrape_images(urljoin(base_url, species_path), species_dir)
        return True
    except Exception as e:
        print(f"{log_header()}{e}")
        return False

def crawl_in_batch(batch_start):
    print(f"Starting batch from page {batch_start} on thread {threading.current_thread().name}")
    for page in range(batch_start, min(batch_start+batch_size, last_page+1), 1):
        crawl(website_url, f"{initial_path}?page={page}", f"{dataset_dir}")

In [4]:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(crawl_in_batch, batch_start) for batch_start in range(first_page, last_page+1, batch_size)]
    for future in futures:
        print(f"Thread completed with result {future.result()}")

print("Scraping completed!")

Starting batch from page 0 on thread ThreadPoolExecutor-0_0
[ ThreadPoolExecutor-0_0   ]  Crawling URL: https://www.indianodonata.org/odonata?page=0
Starting batch from page 3 on thread ThreadPoolExecutor-0_1
[ ThreadPoolExecutor-0_1   ]  Crawling URL: https://www.indianodonata.org/odonata?page=3
Starting batch from page 6 on thread ThreadPoolExecutor-0_2
[ ThreadPoolExecutor-0_2   ]  Crawling URL: https://www.indianodonata.org/odonata?page=6
Starting batch from page 9 on thread ThreadPoolExecutor-0_3
[ ThreadPoolExecutor-0_3   ]  Crawling URL: https://www.indianodonata.org/odonata?page=9
Starting batch from page 12 on thread ThreadPoolExecutor-0_4
[ ThreadPoolExecutor-0_4   ]  Crawling URL: https://www.indianodonata.org/odonata?page=12
Starting batch from page 15 on thread ThreadPoolExecutor-0_5
[ ThreadPoolExecutor-0_5   ]  Crawling URL: https://www.indianodonata.org/odonata?page=15
Starting batch from page 18 on thread ThreadPoolExecutor-0_6
[ ThreadPoolExecutor-0_6   ]  Crawling UR

# Remove corrupted images & empty folders

In [5]:
import os
from PIL import Image
from pathlib import Path

def check_image(file_path):
    try:
        with Image.open(file_path) as img:
            img.verify()
        return True
    except (IOError, SyntaxError):
        return False

for species_dir in Path(f"{dataset_dir}").iterdir():
    if species_dir.is_dir() and os.listdir(species_dir):
        for file in Path(f"{species_dir}").iterdir():
            if file.is_file() and not check_image(file):
                os.remove(file)
                print(f"Corrupted file {file} removed")
            elif re.search(ignore_image_regex, file.name):
                os.remove(file)
                print(f"Unwanted file {file} removed")
    # if species_dir.is_dir() and not os.listdir(species_dir):
    #     os.rmdir(species_dir)
    #     print(f"Empty folder {species_dir} removed")

Corrupted file insect-dataset\src\indianodonata.org\protosticta-mortoni\Protosticta%20mortoni_1655551867_237852.jpg removed
Corrupted file insect-dataset\src\indianodonata.org\protosticta-mortoni\Protosticta%20mortoni_1655551867_237853.jpg removed


# Fetch species names

In [6]:
classes = { class_dir: len([ img for img in os.listdir(f"{dataset_dir}/{class_dir}") ]) for class_dir in os.listdir(f"{dataset_dir}") }

In [7]:
class_details = {}

In [8]:
import json

# read json
with open(class_details_json_path, "r", encoding="utf-8") as file:
    class_details = json.load(file)

In [9]:
for class_name in [ c for c in classes.keys() if not c.endswith("-early") ]:
    try:
        if class_name in class_details and "name" in class_details[class_name]:
            continue
        print()
        print(f"class_name: {class_name}")
        url = f"{website_url}/{class_name}"
        response = requests.get(url, timeout=page_timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find(lambda tag: tag.find("span", class_="spices-title") is not None, class_="field field--name-title field--type-string field--label-hidden").get_text(strip=True)
        print(f"title: {title}")
        species_name = re.split(r"\s+-\s+", title)
        if len(species_name) > 1:
            species_name = species_name[-1]
        else:
            print(f"species_name: NOT FOUND")
            continue
        print(f"species_name: {species_name}")
        class_details[class_name] = {"name": species_name}
    except Exception:
        print("ERROR")


class_name: aciagrion-azureum
title: Aciagrion azureumFraser, 1922  -
species_name: NOT FOUND

class_name: aciagrion-hisopa
title: Aciagrion hisopaSelys, 1876  - Violet-striped Slender Dartlet
species_name: Violet-striped Slender Dartlet

class_name: aciagrion-olympicum
title: Aciagrion olympicumLaidlaw, 1919  -
species_name: NOT FOUND

class_name: aciagrion-spp
title: Aciagrion spp- Slims
species_name: NOT FOUND

class_name: aciagrion-tillyardi
title: Aciagrion tillyardiLaidlaw, 1919  -
species_name: NOT FOUND

class_name: acrogomphus-fraseri
title: Acrogomphus fraseriLaidlaw, 1925  -
species_name: NOT FOUND

class_name: acrogomphus-mohani
title: Acrogomphus mohaniSahani, 1964  -
species_name: NOT FOUND

class_name: aeshna-donaldi
title: Aeshna donaldiFraser, 1922b  - Darner
species_name: Darner

class_name: aeshna-spp
title: Aeshna spp- Hawker
species_name: NOT FOUND

class_name: agriocnemis-clauseni
title: Agriocnemis clauseniFraser, 1922  -
species_name: NOT FOUND

class_name: agr

In [10]:
# replace unicodes 

import json

for class_name, details in class_details.items():
    if 'name' in details:
        # details['name'] = re.sub("\\u2019", "'", details['name'])
        for unicode in range(0, 9999):
            details['name'] = re.sub(f"\\u{unicode:04}", f"\\u{unicode:04}".encode().decode("unicode_escape"), details['name'])

In [11]:
import json

# dump json
with open(class_details_json_path, "w", encoding="utf-8") as file:
        json.dump(class_details, file, indent=4)

# Fetch image urls

In [3]:
import json

# read json
with open(class_details_json_path, "r", encoding="utf-8") as file:
    class_details = json.load(file)

In [13]:
for class_name, _ in classes.items():
    if class_name not in class_details and not class_name.endswith(early_stage_suffix):
        class_details[class_name] = {}

In [14]:
max_images = 10

for class_name, data in class_details.items():
    try:
        if 'images' in class_details[class_name]:
            continue
        print(f"processing {class_name}")
        early_class_name = class_name + early_stage_suffix
        url = f"{website_url}/{class_name}"
        response = requests.get(url, timeout=page_timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        img_tags = soup.find_all('img')
        for img in img_tags:
            if 'images' in class_details[class_name] and len(class_details[class_name]['images']) >= max_images:
                break
            if early_class_name in class_details:
                if 'images' in class_details[early_class_name] and len(class_details[early_class_name]['images']) >= max_images:
                    break
            img_url = img.get('src')
            if has_parent_with_prop(img, 'id', 'laraval', 5):
                continue
            if img_url:
                img_url = urljoin(url, img_url)
                img_name = img_url.split("/")[-1]
                if re.search(ignore_image_regex, img_name):
                    continue
                if has_parent_with_prop(img, 'id', 'early', 6):
                    if 'images' not in class_details[early_class_name]:
                        class_details[early_class_name]['images'] = []
                    if len(class_details[early_class_name]['images']) < max_images:
                        class_details[early_class_name]['images'] += [img_url]
                else:
                    if 'images' not in class_details[class_name]:
                        class_details[class_name]['images'] = []
                    if len(class_details[class_name]['images']) < max_images:
                        class_details[class_name]['images'] += [img_url]
        with open(class_details_json_path, "w", encoding="utf-8") as file:
            json.dump(class_details, file, indent=4)
    except Exception:
        print("ERROR")

processing aciagrion-hisopa
processing aeshna-donaldi
processing anax-imperator
processing nesoxenia-lineata
processing orthetrum-brunneum
processing orthetrum-cancellatum
processing pseudagrion-andamanicum
processing pseudagrion-pruinosum
processing selysiothemis-nigra
processing sympetrum-meridionale
processing sympetrum-vulgatum
processing zygonyx-torrida
processing zyxomma-obtusum
processing aciagrion-azureum
processing aciagrion-olympicum
processing aciagrion-tillyardi
processing acrogomphus-mohani
processing agriocnemis-corbeti
processing agriocnemis-dabreui
processing agriocnemis-nana
processing agrionoptera-dorothea
processing amphithemis-curvistyla
processing anax-panybeus
processing anisogomphus-orites
processing anormogomphus-heteropterus
processing anormogomphus-kiritschenkoi
processing aristocypha-immaculata
processing asiagomphus-personatus
processing bayadera-kali
processing bayadera-longicauda
processing burmargiolestes-laidlawi
processing caconeura-gomphoides
processin

In [9]:
import json
# read json
with open(class_details_json_path, "r", encoding="utf-8") as file:
    class_details = json.load(file)
# remove duplicates
for class_name, data in class_details.items():
    if 'images' in data:
        data['images'] = list(set(data['images']))
# dump json
with open(class_details_json_path, "w", encoding="utf-8") as file:
    json.dump(class_details, file, indent=4)

In [10]:
import json
# read json
with open(class_details_json_path, "r", encoding="utf-8") as file:
    class_details = json.load(file)
# remove bad images
bad_image_regex = r"(?i).*(butterfliy_5\.jpg).*"
for class_name, data in class_details.items():
    if 'images' in data:
        data['images'] = [ image for image in data['images'] if not re.match(bad_image_regex, image) ]
# dump json
with open(class_details_json_path, "w", encoding="utf-8") as file:
    json.dump(class_details, file, indent=4)