# Download Empty Images From the LILA Datasets

## Wellington Dataset

In [None]:
import pandas as pd
import random
import os
import requests
from tqdm import tqdm

# Download a random sample of empty ('NOTHINGHERE') images from the Wellington
# camera-trap dataset. The metadata CSV is read for its 'label' and 'file'
# columns, and each sampled file is fetched over HTTPS into download_dir.
file_path = '../data/detector_tests/Wellington_empty_images/wellington_camera_traps.csv'
print(f"Loading CSV file from {file_path}")

# Load the CSV file
data = pd.read_csv(file_path)
print("CSV file loaded successfully.")

# Filter the rows with label 'NOTHINGHERE'
print("Filtering rows with label 'NOTHINGHERE'.")
nothinghere_files = data[data['label'] == 'NOTHINGHERE']['file']
print(f"Found {len(nothinghere_files)} files with label 'NOTHINGHERE'.")

# Sample up to 20,000 file names. Series.sample raises ValueError when n is
# larger than the population, so cap the request at what is available.
sample_size = min(20000, len(nothinghere_files))
print(f"Sampling {sample_size:,} file names randomly.")
sampled_files = nothinghere_files.sample(n=sample_size, random_state=42)
print(f"Sampled {sample_size:,} file names.")


# Create a directory to save the downloaded images
download_dir = '../data/detector_tests/Wellington_empty_images/'
os.makedirs(download_dir, exist_ok=True)
print(f"Created directory {download_dir} for downloading images.")


# Download the images with a progress bar
base_url = "https://storage.googleapis.com/public-datasets-lila/wellington-unzipped/images/"
print("Starting to download images.")
for file_name in tqdm(sampled_files, desc="Downloading images"):
    # The CSV lists lower-case extensions; the URLs are built with '.JPG'.
    file_name = file_name.replace('.jpg', '.JPG')
    url = base_url + file_name
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        # Best effort: report the failure and carry on with the next image.
        print(f"Failed to download {file_name}: {exc}")
        continue
    if response.status_code == 200:
        target_path = os.path.join(download_dir, file_name)
        # NOTE(review): guards against file names that contain a
        # sub-directory; harmless when they are flat.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        with open(target_path, 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download {file_name}")

print("Download complete.")


## WCS Dataset

In [None]:
import json
import os
import requests
import random
from tqdm import tqdm

# Download a sample of images annotated as "empty" from the WCS camera-trap
# dataset. The metadata JSON is read for its 'categories', 'annotations' and
# 'images' lists; each sampled image is saved as "<sample index>.jpg".
json_file_path = '../data/detector_tests/WCS_empty_images/wcs_camera_traps.json'
print(f"Loading JSON file from {json_file_path}")

# Load the JSON file
with open(json_file_path, 'r') as f:
    data = json.load(f)
print("JSON file loaded successfully.")

# Extract "empty" category id (None when the category is absent)
empty_category_id = next(
    (category['id'] for category in data['categories'] if category['name'] == 'empty'),
    None,
)
if empty_category_id is None:
    print("No 'empty' category found in JSON.")
else:
    # Extract images with "empty" category
    print("Extracting images with 'empty' category.")
    empty_image_ids = set()
    for annotation in tqdm(data['annotations'], desc="Extracting image IDs"):
        if annotation['category_id'] == empty_category_id:
            empty_image_ids.add(annotation['image_id'])

    empty_images = [img for img in data['images'] if img['id'] in empty_image_ids]
    print(f"Found {len(empty_images)} images with 'empty' category.")

    # Randomly sample 20,000 images. A fixed seed keeps the sample (and so the
    # index-based output names and the start_index resume point below)
    # identical from one run to the next.
    if len(empty_images) > 20000:
        empty_images = random.Random(42).sample(empty_images, 20000)
    print(f"Sampled {len(empty_images)} images for download.")

    # Create a directory to save the downloaded images
    download_dir = '../data/detector_tests/WCS_empty_images/'
    os.makedirs(download_dir, exist_ok=True)
    print(f"Created directory {download_dir} for downloading images.")

    # Download the images with a progress bar
    base_url = "https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/"
    print("Starting to download images.")

    # Raise start_index to resume an interrupted run; entries before it are
    # skipped without being downloaded again.
    start_index = 0
    for i, image in enumerate(tqdm(empty_images, desc="Downloading images")):
        if i < start_index:
            continue
        file_name = image['file_name'].lower()
        url = f"{base_url}{file_name}"
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, timeout=60)
        except requests.RequestException as exc:
            # Best effort: report the failure and move on to the next image.
            print(f"Failed to download {file_name} from {url}: {exc}")
            continue
        if response.status_code == 200:
            with open(os.path.join(download_dir, f"{i}.jpg"), 'wb') as f:
                f.write(response.content)
        else:
            print(f"Failed to download {file_name} from {url}")


## Fauna and Flora Database

In [None]:
import pandas as pd
import random
import os
import requests
from tqdm import tqdm

# Download a random sample of 'Blank' images listed in the Fauna and Flora
# image database CSV. The 'common_name' column is used as the label and the
# 'location' column as the download URL; files are saved as "<index>.jpg".
file_path = '../data/detector_tests/Fauna_Flora_Image_Database.csv'
print(f"Loading CSV file from {file_path}")

# Load the CSV file
data = pd.read_csv(file_path)
print("CSV file loaded successfully.")

# Filter the rows with label 'Blank'
print("Filtering rows with label 'Blank'.")
blank_file_urls = data[data['common_name'] == 'Blank']['location']
print(f"Found {len(blank_file_urls)} files with label 'Blank'.")

# Sample up to 19,000 URLs. Series.sample raises ValueError when n is larger
# than the population, so cap the request at what is available.
sample_size = min(19000, len(blank_file_urls))
print(f"Sampling {sample_size:,} file names randomly.")
sampled_files = blank_file_urls.sample(n=sample_size, random_state=42)
print(f"Sampled {sample_size:,} file names.")

# Create a directory to save the downloaded images
download_dir = '../data/detector_tests/Flower_Fauna_Empty_Images'
os.makedirs(download_dir, exist_ok=True)
print(f"Created directory {download_dir} for downloading images.")

# Download the images with a progress bar
print("Starting to download images.")

# Raise start_index to resume an interrupted run. The sample is seeded
# (random_state=42), so index i refers to the same URL on every run.
start_index = 0
for i, url in enumerate(tqdm(sampled_files, desc="Downloading images")):
    if i < start_index:
        continue
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        # Best effort: report the failure and move on to the next image.
        print(f"Failed to download {url}: {exc}")
        continue
    if response.status_code == 200:
        with open(os.path.join(download_dir, f"{i}.jpg"), 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download {url}")


In [None]:
# Peek at the first sampled entry. The sampled Series keeps the row labels of
# the source DataFrame, so select by position rather than by label 0 (which
# is only present if that row happened to be drawn).
sampled_files.iloc[0]

## Idaho Camera Traps

In [None]:
import json
import random
import os
from tqdm import tqdm

# Download a random sample of Idaho camera-trap images whose annotation has
# category_id 0, using gsutil. The commands are also written to
# 'download_images.sh' so the transfer can be repeated outside the notebook.
import shlex
import subprocess

# Load the JSON data
file_path = '../data/detector_tests/Idaho_Empty_Images/idaho-camera-traps.json'
save_path = '../data/detector_tests/Idaho_Empty_Images/downloaded_images'
print(f"Loading JSON data from {file_path}...")
with open(file_path, 'r') as file:
    data = json.load(file)
print("JSON data loaded successfully.")

# Create a list of file_name for all images where category_id = 0
category_id = 0
image_file_names = []

# Create a dictionary for quick lookup of image file names by image id
print("Creating a dictionary for quick lookup of image file names by image id...")
image_dict = {image['id']: image['file_name'] for image in data['images']}
print("Dictionary created successfully.")

# Filter annotations for the required category_id and get the corresponding image file names
print(f"Filtering annotations for category_id = {category_id}...")
for annotation in data['annotations']:
    if annotation['category_id'] == category_id:
        image_id = annotation['image_id']
        if image_id in image_dict:
            image_file_names.append(image_dict[image_id])
print(f"Found {len(image_file_names)} images with category_id = {category_id}.")

# Take a random sample of 19,000 images (or all of them if fewer exist)
sample_size = min(19000, len(image_file_names))
print(f"Taking a random sample of {sample_size} images...")
sampled_file_names = random.sample(image_file_names, sample_size)
print("Random sample taken successfully.")

# Generate gsutil commands to download these images
print("Generating gsutil download commands...")
os.makedirs(save_path, exist_ok=True)

# (source, destination) pairs; the destination drops the directory part of
# the dataset-relative file name.
bucket_prefix = "gs://public-datasets-lila/idaho-camera-traps/public/"
transfers = [
    (f"{bucket_prefix}{file_name}", f"{save_path}/{os.path.basename(file_name)}")
    for file_name in sampled_file_names
]

with open('download_images.sh', 'w') as file:
    file.write("#!/bin/bash\n")
    for source, destination in transfers:
        # Quote both paths so names with spaces or shell metacharacters
        # cannot break or alter the command.
        file.write(f"gsutil -q cp {shlex.quote(source)} {shlex.quote(destination)}\n")
print("Script 'download_images.sh' has been created with gsutil download commands.")

# Run the transfers with a progress bar. Each command is executed directly
# as an argument list (no shell), and only real transfers advance the bar so
# it ends exactly at total_files.
print("Starting the download process with progress bar...")
total_files = len(sampled_file_names)
with tqdm(total=total_files, desc="Downloading images", unit="file") as pbar:
    for source, destination in transfers:
        try:
            completed = subprocess.run(
                ["gsutil", "-q", "cp", source, destination], check=False
            )
        except FileNotFoundError:
            # Without gsutil every remaining command would fail the same way.
            print("gsutil executable not found; aborting download.")
            break
        if completed.returncode != 0:
            print(f"Failed to download {source}")
        pbar.update(1)

print("Download process completed.")


In [None]:
import json
import random
import os
import requests
from tqdm import tqdm

# Read the Idaho camera-trap metadata and draw the sample of images to fetch.
file_path = '../data/detector_tests/Idaho_Empty_Images/idaho-camera-traps.json'
save_path = '../data/detector_tests/Idaho_Empty_Images/downloaded_images'
print(f"Loading JSON data from {file_path}...")
with open(file_path, 'r') as json_file:
    data = json.load(json_file)
print("JSON data loaded successfully.")

# Only annotations carrying this category id are kept.
category_id = 0

# Map every image id to its file name so annotations can be resolved quickly.
print("Creating a dictionary for quick lookup of image file names by image id...")
image_dict = {}
for image in data['images']:
    image_dict[image['id']] = image['file_name']
print("Dictionary created successfully.")

# Collect the file name behind each matching annotation whose image is known.
print(f"Filtering annotations for category_id = {category_id}...")
image_file_names = [
    image_dict[annotation['image_id']]
    for annotation in data['annotations']
    if annotation['category_id'] == category_id and annotation['image_id'] in image_dict
]
print(f"Found {len(image_file_names)} images with category_id = {category_id}.")

# Draw at most 19,000 file names at random.
sample_size = min(19000, len(image_file_names))
print(f"Taking a random sample of {sample_size} images...")
sampled_file_names = random.sample(image_file_names, sample_size)
print("Random sample taken successfully.")

# Make sure the output folder is there before anything is written to it.
if not os.path.exists(save_path):
    os.makedirs(save_path)

# Function to download a single image
def download_image(file_name, save_path):
    """Download one Idaho camera-trap image into ``save_path``.

    Args:
        file_name: Dataset-relative file name; it is appended to the public
            bucket URL, and its base name is used for the local file.
        save_path: Existing directory that receives the image.

    Failures (non-200 status or a network error) are reported with ``print``
    and do not raise, so a bulk download can carry on.
    """
    base_file_name = os.path.basename(file_name)
    url = f"https://storage.googleapis.com/public-datasets-lila/idaho-camera-traps/public/{file_name}"
    destination = f"{save_path}/{base_file_name}"
    # Stream into a temporary name so an interrupted transfer never leaves a
    # truncated image under the final file name.
    partial_path = f"{destination}.part"
    try:
        # The timeout stops a stalled connection from blocking forever; the
        # context manager releases the streamed connection when done.
        with requests.get(url, stream=True, timeout=60) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}")
                return
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_path, destination)
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
    finally:
        # Only present when the transfer did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Fetch every sampled image, letting tqdm drive the loop so the bar advances
# once per attempted file.
print("Starting the download process with progress bar...")
total_files = len(sampled_file_names)
for file_name in tqdm(sampled_file_names, total=total_files, desc="Downloading images", unit="file"):
    download_image(file_name, save_path)

print("Download process completed.")


In [None]:
import json
import random
import os
import requests
from tqdm import tqdm

# Resume the Idaho download: load the metadata, draw a sample of images with
# category_id 0, drop those already on disk, and top the folder up to a total
# of 19,000 files.
file_path = '../data/detector_tests/Idaho_Empty_Images/idaho-camera-traps.json'
save_path = '../data/detector_tests/Idaho_Empty_Images/downloaded_images'
print(f"Loading JSON data from {file_path}...")
with open(file_path, 'r') as file:
    data = json.load(file)
print("JSON data loaded successfully.")

# Create a list of file_name for all images where category_id = 0
category_id = 0
image_file_names = []

# Create a dictionary for quick lookup of image file names by image id
print("Creating a dictionary for quick lookup of image file names by image id...")
image_dict = {image['id']: image['file_name'] for image in data['images']}
print("Dictionary created successfully.")

# Filter annotations for the required category_id and get the corresponding image file names
print(f"Filtering annotations for category_id = {category_id}...")
for annotation in data['annotations']:
    if annotation['category_id'] == category_id:
        image_id = annotation['image_id']
        if image_id in image_dict:
            image_file_names.append(image_dict[image_id])
print(f"Found {len(image_file_names)} images with category_id = {category_id}.")

# Take a random sample of 19,000 images (or all of them if fewer exist)
sample_size = min(19000, len(image_file_names))
print(f"Taking a random sample of {sample_size} images...")
sampled_file_names = random.sample(image_file_names, sample_size)
print("Random sample taken successfully.")

# Ensure the save path exists
os.makedirs(save_path, exist_ok=True)

# Check which images have already been downloaded
downloaded_files = set(os.listdir(save_path))
remaining_files = [file_name for file_name in sampled_file_names if os.path.basename(file_name) not in downloaded_files]

# Adjust the number of remaining files to ensure the total is 19,000.
# Clamp at zero: once the folder already holds 19,000 or more files, a
# negative count would make the slice below keep most of the list instead of
# none of it.
total_downloaded = len(downloaded_files)
needed_files = max(0, 19000 - total_downloaded)
remaining_files = remaining_files[:needed_files]

print(f"{total_downloaded} images already downloaded. {len(remaining_files)} images remaining.")

# Function to download a single image
def download_image(file_name, save_path):
    """Download one Idaho camera-trap image into ``save_path``.

    Args:
        file_name: Dataset-relative file name; it is appended to the public
            bucket URL, and its base name is used for the local file.
        save_path: Existing directory that receives the image.

    Returns:
        True when the image was fetched and stored, False when the server
        answered with a non-200 status or the request failed. Failures are
        reported with ``print`` and never raise.
    """
    base_file_name = os.path.basename(file_name)
    url = f"https://storage.googleapis.com/public-datasets-lila/idaho-camera-traps/public/{file_name}"
    destination = f"{save_path}/{base_file_name}"
    # Stream into a temporary name so an interrupted transfer never leaves a
    # truncated image under the final file name, where the resume logic
    # would count it as already downloaded.
    partial_path = f"{destination}.part"
    try:
        # The timeout stops a stalled connection from blocking forever; the
        # context manager releases the streamed connection when done.
        with requests.get(url, stream=True, timeout=60) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}")
                return False
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_path, destination)
        return True
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return False
    finally:
        # Only present when the transfer did not complete.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Work through the outstanding files; the bar only moves for downloads that
# report success.
print("Starting the download process with progress bar...")
total_files = len(remaining_files)
with tqdm(total=total_files, desc="Downloading images", unit="file") as pbar:
    for file_name in remaining_files:
        succeeded = download_image(file_name, save_path)
        if not succeeded:
            continue
        pbar.update(1)

print("Download process completed.")


## Run inference on the images using MegaDetector and DeepFaune

### MegaDetector

In [None]:
import pandas as pd
from megadetector.detection.run_detector_batch import load_and_run_detector_batch, write_results_to_file
from megadetector.utils import path_utils
import os
import pandas as pd
import io
import sys


# Run MegaDetector over the downloaded Idaho images and store, per file, the
# category of the highest-confidence detection and the maximum confidence in
# '../data/detector_tests/data.csv'.
import contextlib

# Define the directory
empty_folder = '../data/detector_tests/Idaho_Empty_Images/downloaded_images'

# Get the list of files in the directory
file_names = os.listdir(empty_folder)

# Create a DataFrame with one row per file
df = pd.DataFrame(file_names, columns=["File Name"])

# Recursively find images in the folder
image_file_names = path_utils.find_images(empty_folder, recursive=True)

confidence_threshold = 0.005

# Run MegaDetector. Console output is silenced only while the detector runs,
# so later prints in the notebook stay visible. The threshold is passed by
# keyword so it cannot be bound to a different positional parameter.
with contextlib.redirect_stdout(io.StringIO()):
    results = load_and_run_detector_batch(
        'MDV5A', image_file_names, confidence_threshold=confidence_threshold
    )

# Dictionary to map category numbers to names
category_map = {'1': 'Animal', '2': 'Human', '3': 'Vehicle'}

# Summarise each result entry, keyed by base file name. Collecting into
# dicts and mapping once avoids scanning the whole DataFrame for every entry.
category_by_file = {}
confidence_by_file = {}
for entry in results:
    file_name = os.path.basename(entry['file'])
    detections = entry.get('detections') or []
    if detections:
        highest_conf_detection = max(detections, key=lambda x: x['conf'])
        category = category_map.get(highest_conf_detection['category'], 'Unknown')
        fallback_confidence = highest_conf_detection['conf']
    elif entry.get('failure'):
        # NOTE(review): assumes images MegaDetector could not process carry
        # a 'failure' key instead of detections - confirm against the
        # installed version.
        category = 'Error'
        fallback_confidence = 0.0
    else:
        category = 'Empty'
        fallback_confidence = 0.0
    category_by_file[file_name] = category
    # Prefer the detector's own summary value; derive it when it is absent.
    confidence_by_file[file_name] = entry.get('max_detection_conf', fallback_confidence)

# Populate the DataFrame (files without a result entry stay empty)
df['MD_Category'] = df['File Name'].map(category_by_file)
df['MD_Confidence'] = df['File Name'].map(confidence_by_file)

df.to_csv('../data/detector_tests/data.csv', index=False)


### DeepFaune

In [None]:
import pandas as pd
from ultralytics import YOLO
from tqdm import tqdm
import logging
import os

# Results table that the DeepFaune columns will be added to.
df = pd.read_csv('../data/detector_tests/data.csv')


# Image folder and DeepFaune detector weights.
empty_folder = '../data/detector_tests/Idaho_Empty_Images/downloaded_images'
model_path = "../models/deepfaune-yolov8s_960.pt"

# Instantiate the YOLOv8 detector from the weights file.
model = YOLO(model_path, verbose=False)
# NOTE(review): presumably this attribute is not read by Ultralytics YOLOv8
# at predict time (the threshold is normally passed as conf= in the call, as
# a later cell does) - confirm.
model.conf = 0.005

# Keep Ultralytics quiet: only warnings and above are emitted.
ultralytics_logger = logging.getLogger('ultralytics')
ultralytics_logger.setLevel(logging.WARNING)

# Function to process a batch of images
def process_batch(image_paths):
    """Run the DeepFaune detector on a batch and summarise each image.

    Args:
        image_paths: Paths of the images to run through the module-level
            ``model``.

    Returns:
        A ``(labels, confidences)`` pair of lists with one entry per result
        returned by the model. An image without boxes is reported as
        ``"Empty"`` with confidence ``0``; otherwise the label and confidence
        of its highest-confidence box are used.
    """
    # Class index -> label, as used throughout this notebook.
    class_names = {0: "Animal", 1: "Human", 2: "Vehicle"}

    # Pass the threshold with the call so low-confidence boxes are kept.
    results = model(image_paths, conf=0.005)

    batch_detections = []
    batch_confidences = []
    for result in results:
        boxes = result.boxes
        if boxes is None or len(boxes) == 0:
            batch_detections.append("Empty")
            batch_confidences.append(0)
            continue

        # Only the single most confident box decides the image's label.
        highest_conf_detection = max(boxes, key=lambda x: x.conf[0])
        category = highest_conf_detection.cls[0].item()
        batch_detections.append(class_names.get(category, "Unknown"))
        batch_confidences.append(highest_conf_detection.conf[0].item())
    return batch_detections, batch_confidences

# Batch configuration and result accumulators for the DeepFaune run.
batch_size = 32  # 32 = 3.6GB of VRAM
batch_results, batch_confidences = [], []
# Full path of every image listed in the results table.
image_paths = [os.path.join(empty_folder, name) for name in df['File Name']]


In [5]:
import pandas as pd
from ultralytics import YOLO
from tqdm import tqdm
import logging
import os
from PIL import Image, UnidentifiedImageError

# Results table that the DeepFaune columns will be added to.
df = pd.read_csv('../data/detector_tests/Idaho_Empty_Images/data (3rd copy).csv')

# Image folder and DeepFaune detector weights.
empty_folder = '../data/detector_tests/Idaho_Empty_Images/downloaded_images'
model_path = "../models/deepfaune-yolov8s_960.pt"

# Instantiate the YOLOv8 detector from the weights file.
model = YOLO(model_path, verbose=False)

# Keep Ultralytics quiet: only warnings and above are emitted.
ultralytics_logger = logging.getLogger('ultralytics')
ultralytics_logger.setLevel(logging.WARNING)

# Function to process a batch of images
def process_batch(image_paths):
    """Run the DeepFaune detector on a batch and summarise each image.

    Args:
        image_paths: Paths of the images to run through the module-level
            ``model``.

    Returns:
        A ``(labels, confidences)`` pair of lists. On success there is one
        entry per result returned by the model: ``"Empty"`` with confidence
        ``0`` for an image without boxes, otherwise the label and confidence
        of its highest-confidence box. If anything raises while the batch is
        processed, the whole batch is reported as ``"Error"`` with confidence
        ``0`` - exactly one entry per input path.
    """
    # Class index -> label, as used throughout this notebook.
    class_names = {0: "Animal", 1: "Human", 2: "Vehicle"}

    try:
        results = model(image_paths, conf=0.005, task='detect')
        batch_detections = []
        batch_confidences = []
        for result in results:
            boxes = result.boxes
            if boxes is None or len(boxes) == 0:
                batch_detections.append("Empty")
                batch_confidences.append(0)
                continue

            # Only the single most confident box decides the image's label.
            highest_conf_detection = max(boxes, key=lambda x: x.conf[0])
            category = highest_conf_detection.cls[0].item()
            batch_detections.append(class_names.get(category, "Unknown"))
            batch_confidences.append(highest_conf_detection.conf[0].item())
    except Exception as e:
        # Deliberately broad: any failure is logged and the batch is skipped.
        # Results gathered before the failure are discarded so the output
        # keeps one entry per input image.
        print(f"Error processing batch: {e}")
        return ["Error"] * len(image_paths), [0] * len(image_paths)
    return batch_detections, batch_confidences

# Run DeepFaune over every image listed in df, in batches, and store the
# per-image label and confidence in the 'DF_Detector' and 'DF_Detector_Conf'
# columns before saving the table.
batch_size = 32  # 32 = 3.6GB of VRAM
image_paths = [os.path.join(empty_folder, fname) for fname in df['File Name'].tolist()]

# One slot per DataFrame row, pre-filled with the failure marker. Results are
# written back by position, so rows stay aligned with df even when an image
# is skipped as unreadable.
batch_results = ["Error"] * len(image_paths)
batch_confidences = [0] * len(image_paths)

# Initialize progress bar
with tqdm(total=len(image_paths), desc="Processing images", unit="image") as pbar:
    for i in range(0, len(image_paths), batch_size):
        batch = image_paths[i:i + batch_size]
        valid_indices = []
        for offset, image_path in enumerate(batch):
            try:
                # Check if the image can be opened
                with Image.open(image_path) as img:
                    img.verify()  # Verify that it's an image
            except (OSError, UnidentifiedImageError) as e:
                # Leave the pre-filled "Error" row in place for this image.
                print(f"Skipping corrupted image: {image_path} - {str(e)}")
                continue
            valid_indices.append(i + offset)

        if valid_indices:
            valid_batch = [image_paths[idx] for idx in valid_indices]
            detections, confidences = process_batch(valid_batch)
            for idx, detection, confidence in zip(valid_indices, detections, confidences):
                batch_results[idx] = detection
                batch_confidences[idx] = confidence

        # Skipped and processed images both count as handled.
        pbar.update(len(batch))

# Add or update the 'DF_Detector' and 'DF_Detector_Conf' columns in the DataFrame
df['DF_Detector'] = batch_results
df['DF_Detector_Conf'] = batch_confidences

# Save the updated DataFrame back to CSV
df.to_csv('../data/detector_tests/data.csv', index=False)

print("YOLO detection results added to the DataFrame and saved successfully.")


Processing images: 100%|██████████| 18945/18945 [21:34<00:00, 14.64image/s]

YOLO detection results added to the DataFrame and saved successfully.





In [6]:
# Display the results table as the cell output.
df

Unnamed: 0,File Name,MD_Category,MD_Confidence,DF_Detector,DF_Detector_Conf,Ground_Truth_Detect
0,loc_0067_im_011902.jpg,Empty,0.00000,Empty,0.000000,Empty
1,loc_0067_im_010802.jpg,Animal,0.12200,Animal,0.382068,Empty
2,loc_0008_im_009370.jpg,Empty,0.00000,Animal,0.012061,Empty
3,loc_0033_im_009075.jpg,Empty,0.00000,Empty,0.000000,Empty
4,loc_0041_im_006823.jpg,Animal,0.01480,Animal,0.470771,Empty
...,...,...,...,...,...,...
18940,loc_0130_im_010631.jpg,Animal,0.00548,Animal,0.251582,Empty
18941,loc_0018_im_013482.jpg,Animal,0.15300,Animal,0.049951,Empty
18942,loc_0048_im_006344.jpg,Animal,0.01370,Empty,0.000000,Empty
18943,loc_0027_im_006340.jpg,Animal,0.02860,Animal,0.016934,Empty


In [None]:
# Write the current results table to the shared CSV, without the index column.
df.to_csv('../data/detector_tests/data.csv', index=False)
