# Scrape https://indiabiodiversity.org

In [3]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
from pathlib import Path
import requests
import time
import datetime

In [4]:
dataset_dir = "insect-dataset/indiabiodiversity.org"

In [5]:
page_timeout = 120
image_timeout = 30

def log_header():
    return f"[ {threading.current_thread().name:24} ]  "

def check_image(file_path):
    try:
        with Image.open(file_path) as img:
            img.verify()
        return True
    except (IOError, SyntaxError):
        return False

def download_image(img_url, output_dir):
    try:
        # print(f"{log_header()} Downloading {img_url} into {output_dir}")
        # print(f"{log_header()} Downloading {img_url.split("/")[-1].split("?")[0]} into {output_dir.split("/")[-1]}")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        img_name = img_url.split("/")[-1]
        img_path = os.path.join(output_dir, img_name.split("?")[0])
        if Path(img_path).is_file() and check_image(img_path):
            # skipping, already downloaded
            return 'EXISTS'
        img_data = requests.get(img_url, timeout=image_timeout).content
        with open(img_path, 'wb') as file:
            file.write(img_data)
        if not check_image(img_path):
            print(f"{log_header()}Removing corrupted image {file}")
            os.remove(Path(img_path))
            return 'FAILURE'
        return 'SUCCESS'
    except Exception as e:
        print(f"{log_header()}{e}")
        return 'FAILURE'

def fetch_json(url, method, headers, params):
    if method == 'get':
        return requests.get(url, headers=headers, params=params).json()
    else:
        return requests.post(url, headers=headers, params=params, json={}).json()

img_base_url = "https://indiabiodiversity.org/files-api/api/get/crop/"
context_map = {
    "OBSERVATION": "observations",
    "SPECIES": "img"
}
headers = {
    "accept": "application/json, text/plain, */*",
}
def scrape(url, method, list_key, img_src_key, get_class_name, params, start_offset=0, step=10, steps_in_batch=100000):
    print(f"{log_header()}Starting scraping from {url} with offset={start_offset}, step={step}, steps_in_batch={steps_in_batch}")
    start_time = time.time()
    total_count = fetch_json(url, method, headers=headers, params=params)['totalCount']
    # print(f"{log_header()}Total count: {total_count}")
    img_cnt = 0
    success_cnt = 0
    failure_cnt = 0
    for offset in range(start_offset, min(total_count, start_offset+step*steps_in_batch), step):
        params['offset'] = f"{offset}"
        response = fetch_json(url, method, headers=headers, params=params)
        for obj in response[list_key]:
            try:
                class_name = get_class_name(obj)
                if len(class_name.split('-')) == 1:
                    class_name += '-spp'
                if 'factValuePair' in obj:
                    life_stage = [ item for item in obj['factValuePair'] if item["name"] == "Life Stage (Complete Metamorphosis)" ]
                    if len(life_stage) != 1 or life_stage[0]['value'] != 'Adult':
                        # skip non-adult life stage
                        continue
                context_path = context_map[obj['context']] if 'context' in obj else "observations"
                img_url = f"{img_base_url}/{context_path}/{obj[img_src_key]}?h=300"
                target_dir = f"{dataset_dir}/{class_name}"
                download_status = download_image(img_url, target_dir)
                if download_status == 'SUCCESS':
                    success_cnt += 1
                    img_cnt += 1
                elif download_status == 'EXISTS':
                    img_cnt += 1
                else:
                    failure_cnt += 1
            except Exception as e:
                print(f"{log_header()}{e}")
        class_cnt = sum(1 for entry in os.scandir(dataset_dir) if entry.is_dir())
        print(f"{log_header()}Offset: {offset:5} | Success: {success_cnt:5} / {img_cnt:5} | Failure: {failure_cnt:5} | Class count: {class_cnt:5}")
    print(f"{log_header()}Scraping completed in {datetime.timedelta(seconds=(time.time() - start_time))}")

In [16]:
scrape("https://indiabiodiversity.org/species-api/api/v1/species/list/extended_species/_doc", 'get',
      list_key = 'speciesTiles', img_src_key='reprImage',
      get_class_name = lambda obj: '-'.join(re.sub(r"<i>|</i>.*", "", obj['name']).lower().split(' ')[0:2]),
      params = {
          "view": "grid",
          "max": "16",
          "mediaFilter": "IMAGE",
          "offset": "0",
          "sGroup": "839",
          "sort": "species.lastUpdated",
          "taxon": "70513"
      })

[ MainThread               ]  Starting scraping from https://indiabiodiversity.org/species-api/api/v1/species/list/extended_species/_doc with offset=0, step=10, steps_in_batch=100000
[ MainThread               ]  Offset:     0 | Success:    16 /    16 | Failure:     0 | Class count:    16
[ MainThread               ]  Offset:    10 | Success:    26 /    32 | Failure:     0 | Class count:    26
[ MainThread               ]  Offset:    20 | Success:    36 /    48 | Failure:     0 | Class count:    36
[ MainThread               ]  Offset:    30 | Success:    46 /    64 | Failure:     0 | Class count:    46
[ MainThread               ]  Offset:    40 | Success:    56 /    80 | Failure:     0 | Class count:    56
[ MainThread               ]  Offset:    50 | Success:    66 /    96 | Failure:     0 | Class count:    66
[ MainThread               ]  Offset:    60 | Success:    76 /   112 | Failure:     0 | Class count:    76
[ MainThread               ]  Offset:    70 | Success:    86 /   128

In [6]:
max_workers = 50
step = 8
steps_in_batch = 100

def scrape_observations(offset, step, batch_size):
    scrape("https://indiabiodiversity.org/observation-api/api/v1/observation/list/extended_observation/_doc", 'post',
          list_key = 'observationList', img_src_key='reprImageUrl',
          get_class_name = lambda obj: '-'.join(re.sub(r"<i>|</i>.*", "", obj['recoShow']['recoIbp']['scientificName']).lower().split(' ')[0:2]),
          params = {
              "max": "8",
              "mediaFilter": "no_of_images",
              "offset": f"{offset}",
              "sort": "created_on",
              "taxon": "70513",
              "view": "list",
              "speciesName": "IDENTIFIED"
          },
          start_offset = offset, step = step, steps_in_batch = steps_in_batch)

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(scrape_observations, offset, step, steps_in_batch) for offset in range(0, 72037, step * steps_in_batch)]
    for future in futures:
        print(f"Thread completed with result {future.result()}")
print("All threads completed")

[ ThreadPoolExecutor-0_0   ]  Starting scraping from https://indiabiodiversity.org/observation-api/api/v1/observation/list/extended_observation/_doc with offset=0, step=8, steps_in_batch=100
[ ThreadPoolExecutor-0_1   ]  Starting scraping from https://indiabiodiversity.org/observation-api/api/v1/observation/list/extended_observation/_doc with offset=800, step=8, steps_in_batch=100
[ ThreadPoolExecutor-0_2   ]  Starting scraping from https://indiabiodiversity.org/observation-api/api/v1/observation/list/extended_observation/_doc with offset=1600, step=8, steps_in_batch=100
[ ThreadPoolExecutor-0_3   ]  Starting scraping from https://indiabiodiversity.org/observation-api/api/v1/observation/list/extended_observation/_doc with offset=2400, step=8, steps_in_batch=100
[ ThreadPoolExecutor-0_4   ]  Starting scraping from https://indiabiodiversity.org/observation-api/api/v1/observation/list/extended_observation/_doc with offset=3200, step=8, steps_in_batch=100
[ ThreadPoolExecutor-0_5   ]  Star

# Remove corrupted images & empty folders

In [7]:
import os
from PIL import Image
from pathlib import Path

def check_image(file_path):
    try:
        with Image.open(file_path) as img:
            img.verify()
        return True
    except (IOError, SyntaxError):
        return False

for species_dir in Path(dataset_dir).iterdir():
    if species_dir.is_dir() and os.listdir(species_dir):
        for file in Path(f"{species_dir}").iterdir():
            if file.is_file() and not check_image(file):
                os.remove(file)
                print(f"Corrupted file {file} removed")
    if not os.listdir(species_dir):
        os.rmdir(species_dir)
        print(f"Empty folder {species_dir} removed")

Empty folder insect-dataset\indiabiodiversity.org\celastrina-oreana removed
Empty folder insect-dataset\indiabiodiversity.org\lethe-goalpara removed


# ZIP the data

In [9]:
import shutil
import time
import datetime

shutil.make_archive(f"{dataset_dir}/../lepidoptera.indiabiodiversity.org.{datetime.datetime.now().strftime("%Y.%m.%d")}", 
                    'zip', f"{dataset_dir}")

'D:\\Projects\\my-jupyter-notebook\\insect-species-identification\\insect-dataset\\lepidoptera.indiabiodiversity.org.2025.02.10.zip'