# Download Empty Images From the LILA Datasets

## Wellington Dataset

In [None]:
import pandas as pd
import random
import os
import requests
from tqdm import tqdm

# Define the file path
file_path = '../data/detector_tests/wellington_camera_traps.csv'
print(f"Loading CSV file from {file_path}")

# Load the CSV file
data = pd.read_csv(file_path)
print("CSV file loaded successfully.")

# Filter the rows with label 'NOTHINGHERE'
print("Filtering rows with label 'NOTHINGHERE'.")
nothinghere_files = data[data['label'] == 'NOTHINGHERE']['file']
print(f"Found {len(nothinghere_files)} files with label 'NOTHINGHERE'.")

# Sample 20,000 file names randomly
print("Sampling 20,000 file names randomly.")
sampled_files = nothinghere_files.sample(n=20000, random_state=42)
print("Sampled 20,000 file names.")


# Create a directory to save the downloaded images
download_dir = 'downloaded_images'
os.makedirs(download_dir, exist_ok=True)
print(f"Created directory {download_dir} for downloading images.")


# Download the images with a progress bar
base_url = "https://storage.googleapis.com/public-datasets-lila/wellington-unzipped/images/"
print("Starting to download images.")
for file_name in tqdm(sampled_files, desc="Downloading images"):
    file_name = file_name.replace('.jpg', '.JPG')
    url = base_url + file_name
    response = requests.get(url)
    if response.status_code == 200:
        with open(os.path.join(download_dir, file_name), 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download {file_name}")

print("Download complete.")


## WCS Dataset

In [None]:
import json
import os
import requests
import random
from tqdm import tqdm

# Define the JSON file path
json_file_path = '../data/detector_tests/wcs_camera_traps.json'
print(f"Loading JSON file from {json_file_path}")

# Load the JSON file
with open(json_file_path, 'r') as f:
    data = json.load(f)
print("JSON file loaded successfully.")

# Extract "empty" category id
empty_category_id = None
for category in data['categories']:
    if category['name'] == 'empty':
        empty_category_id = category['id']
        break
if empty_category_id is None:
    print("No 'empty' category found in JSON.")
else:
    # Extract images with "empty" category
    print("Extracting images with 'empty' category.")
    empty_image_ids = set()
    for annotation in tqdm(data['annotations'], desc="Extracting image IDs"):
        if annotation['category_id'] == empty_category_id:
            empty_image_ids.add(annotation['image_id'])

    empty_images = [img for img in data['images'] if img['id'] in empty_image_ids]
    print(f"Found {len(empty_images)} images with 'empty' category.")

    # Randomly sample 20,000 images
    if len(empty_images) > 20000:
        empty_images = random.sample(empty_images, 20000)
    print(f"Sampled {len(empty_images)} images for download.")

    # Create a directory to save the downloaded images
    download_dir = '../data/detector_tests/downloaded_empty_images'
    os.makedirs(download_dir, exist_ok=True)
    print(f"Created directory {download_dir} for downloading images.")

    # Download the images with a progress bar
    base_url = "https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/"
    print("Starting to download images.")

    # Modify the download loop to start from image 1093
    start_index = 0
    for i, image in enumerate(tqdm(empty_images, desc="Downloading images")):
        if i < start_index:
            continue
        file_name = image['file_name'].lower()
        url = f"{base_url}{file_name}"
        response = requests.get(url)
        if response.status_code == 200:
            with open(os.path.join(download_dir, f"{i}.jpg"), 'wb') as f:
                f.write(response.content)
        else:
            print(f"Failed to download {file_name} from {url}")


## Fauna and Flora Database

In [None]:
import pandas as pd
import random
import os
import requests
from tqdm import tqdm

# Define the file path
file_path = '../data/detector_tests/Fauna_Flora_Image_Database.csv'
print(f"Loading CSV file from {file_path}")

# Load the CSV file
data = pd.read_csv(file_path)
print("CSV file loaded successfully.")

# Filter the rows with label 'Blank'
print("Filtering rows with label 'Blank'.")
blank_file_urls = data[data['common_name'] == 'Blank']['location']
print(f"Found {len(blank_file_urls)} files with label 'Blank'.")

# Sample 20,000 file names randomly
print("Sampling 19,000 file names randomly.")
sampled_files = blank_file_urls.sample(n=19000, random_state=42)
print("Sampled 19,000 file names.")

# Create a directory to save the downloaded images
download_dir = '../data/detector_tests/Flower_Fauna_Empty_Images'
os.makedirs(download_dir, exist_ok=True)
print(f"Created directory {download_dir} for downloading images.")

# Download the images with a progress bar
print("Starting to download images.")

# Modify the download loop to start from image 1093
start_index = 0
for i, url in enumerate(tqdm(sampled_files, desc="Downloading images")):
    if i < start_index:
        continue
    response = requests.get(url)
    if response.status_code == 200:
        with open(os.path.join(download_dir, f"{i}.jpg"), 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download {url}")


In [4]:
sampled_files[0]

'https://app.wildlifeinsights.org/download/2005775/data-files/28594a77-236b-44ff-8c51-1b1f68fb3f87'

## Run inferrence on the images using MegaDetector and DeepFaune

### MegaDetector

In [None]:
import pandas as pd
from megadetector.detection.run_detector_batch import load_and_run_detector_batch, write_results_to_file
from megadetector.utils import path_utils

# Recursively find images in the processed folder

image_file_names = path_utils.find_images(processed_folder, recursive=True)

sys.stdout = io.StringIO()

confidence_threshold = 0.005

# Run MegaDetector, default confidence threshold 0.005
results = load_and_run_detector_batch('MDV5A', image_file_names, confidence_threshold)

# Write results to output file
write_results_to_file(results, output_file, relative_path_base=processed_folder, detector_file='MDV5A')

### DeepFaune

In [None]:
import pandas as pd
from ultralytics import YOLO
from tqdm import tqdm

# Define paths
excel_path = "../data/detector_tests/original_labelled_photos_file_paths_detections.xlsx"
model_path = "../models/deepfaune-yolov8s_960.pt"

# Load the Excel file
df = pd.read_excel(excel_path)

# Ensure the path components are treated as strings
df['Original Path'] = df['Original Path'].astype(str)
df['Original Filename'] = df['Original Filename'].astype(str)
df['File Extension'] = df['File Extension'].astype(str)

# Combine file path, name, and extension to get the full image path
df['Full Path'] = df['Original Path'] + '/' + df['Original Filename'] + '.' + df['File Extension']

# Load the YOLOv8 model
model = YOLO(model_path, verbose=False)
model.conf = 0.2  # Set confidence threshold

# Suppress YOLO logs
import logging
logging.getLogger('ultralytics').setLevel(logging.WARNING)

# Function to process a batch of images
def process_batch(image_paths):
    results = model(image_paths)
    batch_detections = []
    batch_confidences = []
    for result in results:
        if result.boxes is None or len(result.boxes) == 0:
            batch_detections.append("Empty")
            batch_confidences.append(0)
        else:
            highest_conf_detection = max(result.boxes, key=lambda x: x.conf[0])  # Access the highest confidence detection
            category = highest_conf_detection.cls[0].item()  # Access the class/category index
            confidence = highest_conf_detection.conf[0].item()  # Access the confidence score
            
            if category == 0:
                batch_detections.append("Animal")
            elif category == 1:
                batch_detections.append("Human")
            elif category == 2:
                batch_detections.append("Vehicle")
            else:
                batch_detections.append("Unknown")
                
            batch_confidences.append(confidence)
    return batch_detections, batch_confidences

# Process images in batches with progress bar
batch_size = 32  # 32 = 3.6GB of VRAM
image_paths = df['Full Path'].tolist()
batch_results = []
batch_confidences = []

# Initialize progress bar
with tqdm(total=len(image_paths), desc="Processing images", unit="image") as pbar:
    for i in range(0, len(image_paths), batch_size):
        batch = image_paths[i:i + batch_size]
        detections, confidences = process_batch(batch)
        batch_results.extend(detections)
        batch_confidences.extend(confidences)
        pbar.update(len(batch))

# Add or update the 'DF_Detector' and 'DF_Detector_Conf' columns in the DataFrame
df['DF_Detector'] = batch_results
df['DF_Detector_Conf'] = batch_confidences

# Save the updated DataFrame back to Excel
df.to_excel(excel_path, index=False)
