# Download the images to be analyzed.

### Imports

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import time

### Read in GBIF export table

In [2]:
# load the tab-separated GBIF multimedia table into a DataFrame
table = pd.read_csv('./raw_data/0002206-250218110819086/multimedia.txt', sep='\t')

### Show the first five rows

In [3]:
# preview the first five rows
table.head()

Unnamed: 0,gbifID,type,format,identifier,references,title,description,source,audience,created,creator,contributor,publisher,license,rightsHolder
0,923911394,StillImage,image/jpeg,https://inaturalist-open-data.s3.amazonaws.com...,https://www.inaturalist.org/photos/643856,,,,,2014-01-20T15:34:44Z,Susan Elliott,,iNaturalist,http://creativecommons.org/licenses/by-nc/4.0/,Susan Elliott
1,923910407,StillImage,image/jpeg,https://inaturalist-open-data.s3.amazonaws.com...,https://www.inaturalist.org/photos/479985,,,,,2013-08-31T14:37:44Z,Susan Elliott,,iNaturalist,http://creativecommons.org/licenses/by-nc/4.0/,Susan Elliott
2,923910407,StillImage,image/jpeg,https://inaturalist-open-data.s3.amazonaws.com...,https://www.inaturalist.org/photos/479981,,,,,2013-08-31T14:55:07Z,Susan Elliott,,iNaturalist,http://creativecommons.org/licenses/by-nc/4.0/,Susan Elliott
3,911492230,StillImage,image/jpeg,https://inaturalist-open-data.s3.amazonaws.com...,https://www.inaturalist.org/photos/763481,,,,,2009-07-23T13:57:54Z,Matthew O'Donnell,,iNaturalist,http://creativecommons.org/licenses/by-nc-sa/4.0/,Matthew O'Donnell
4,899970365,StillImage,image/jpeg,https://inaturalist-open-data.s3.amazonaws.com...,https://www.inaturalist.org/photos/793926,,,,,2013-07-16T21:00:34Z,Matt Flower,,iNaturalist,http://creativecommons.org/licenses/by-nc/4.0/,Matt Flower


In [4]:
# number of rows in the table
table.shape[0]

41069

### Download the images

In [5]:
# folder the images are saved into, and the text file that records URLs that failed to download
output_dir = "/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/images"
failed_log = "/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/failed_download_urls.txt"

# create the image folder (and any missing parents); no error if it already exists
os.makedirs(output_dir, exist_ok=True)

# start from an empty failed-download log: opening in "w" mode creates or truncates the file
open(failed_log, "w").close()

In [None]:
# First pass: download every image URL in the table.
# Each image is saved in output_dir as "<row label>.jpg"; a URL that fails is
# printed and appended to failed_log (one URL per line) so it can be retried later.
for row_label, url in table.identifier.items():
    # filename using the df's index value
    file_path = os.path.join(output_dir, f"{row_label}.jpg")
    # download into a temporary ".part" file first, so a transfer that dies
    # half-way never leaves a truncated image under the final name
    tmp_path = file_path + ".part"

    try:
        # request with a timeout; the context manager closes the streamed
        # response so its connection is released
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Check for HTTP errors

            # write the image to the temporary file in chunks
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)

        # the whole body arrived: move it to its final name in one step
        os.replace(tmp_path, file_path)

    except requests.exceptions.RequestException as e:
        error_message = f"Failed to download {url}. Error: {e}\n"
        print(error_message)
        # add only the failed url to the log file immediately (the log is
        # parsed back as a plain list of URLs, so the message is not written)
        with open(failed_log, "a") as f:
            f.write(f"{url}\n")

    finally:
        # drop any partial download, whatever interrupted it
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # space out requests to be nice (after failures too)
    time.sleep(0.2)

### Retry any failed downloads (failures can happen after a momentary disconnection, etc.)

In [7]:
# dir for dataset and filepath for logging failed downloads
# (same values as in the download section, but without truncating the log;
# presumably repeated so the retry can run without re-running the cell that
# empties the log -- TODO confirm)
output_dir = "/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/images"
failed_log = "/Volumes/My Passport/monarda_fistulosa_segmentation/image_dataset/failed_download_urls.txt"

In [8]:
# read the failed-download log back in as a flat list of whitespace-separated
# entries (the download loops write one URL per line)
failed = []
with open(failed_log, 'r') as log_file:
    for line in log_file:
        failed.extend(line.split())

In [10]:
# how many entries were read from the failed-download log
len(failed)

2610

In [14]:
# Row labels of every table row whose URL appears in the failed-download log.
# One vectorized membership test replaces a full scan of the table per URL, and:
#   - each row is listed once, even if its URL was logged several times
#   - every row sharing a failed URL is retried, not just the first match
#   - log entries that match no row are ignored instead of raising IndexError
# Labels come out in table order rather than log order.
failed_idxs = table.index[table.identifier.isin(failed)].tolist()

In [16]:
# Second pass: retry the download for each row label in failed_idxs.
# URLs that fail again are collected and, once the loop finishes, written back
# to failed_log in place of its old contents, so the log only ever lists
# downloads that are still outstanding (no stale or duplicated entries).
still_failed = []

for idx in failed_idxs:
    # idx is a row label, so use it for both the lookup and the filename
    url = table.identifier.loc[idx]
    file_path = os.path.join(output_dir, f"{idx}.jpg")
    # download into a temporary ".part" file first, so a transfer that dies
    # half-way never leaves a truncated image under the final name
    tmp_path = file_path + ".part"

    try:
        # request with a timeout; the context manager closes the streamed
        # response so its connection is released
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Check for HTTP errors

            # write the image to the temporary file in chunks
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)

        # the whole body arrived: move it to its final name in one step
        os.replace(tmp_path, file_path)

    except requests.exceptions.RequestException as e:
        error_message = f"Failed to download {url}. Error: {e}\n"
        print(error_message)
        # remember the url; the log is rewritten after the loop
        still_failed.append(url)

    finally:
        # drop any partial download, whatever interrupted it
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    # space out requests to be nice (after failures too)
    time.sleep(0.1)

# replace the log with the urls that are still failing (one per line);
# if the loop is interrupted, the previous log is left untouched
with open(failed_log, "w") as f:
    f.writelines(f"{url}\n" for url in still_failed)

### Make sure all the images were downloaded

In [43]:
# list what is actually in the image folder; reuse output_dir (set in the
# cells above) so this check always inspects the folder the downloads went to
files = os.listdir(output_dir)

In [56]:
# sorted numeric stems of the downloaded images ("123.jpg" -> 123);
# hidden files (e.g. ".DS_Store", "._123.jpg") and non-.jpg entries are skipped
# so that they cannot crash the int() conversion
nums = np.sort([
    int(os.path.splitext(name)[0])
    for name in files
    if name.endswith('.jpg') and not name.startswith('.')
])

In [58]:
# True only if there is exactly one image per table row, numbered 0..len(table)-1.
# np.array_equal also returns False (instead of misbehaving) when the lengths
# differ, and replaces np.alltrue, which was removed in NumPy 2.0.
np.array_equal(nums, np.arange(len(table)))

True