# This notebook contains code to download the GPT-filtered image dataset to a specified directory, using the identifiers in the composite dataframe `filtered_labeled_data.csv`.

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import time

# specify the directory in which to save the images and the file in which to log failed downloads

In [2]:
# destination directory for the downloaded images, and the text file that
# collects the urls of any downloads that fail
output_dir = "/Volumes/My Passport/monarda_fistulosa_segmentation/tmp_test_download/"
failed_log = "/Volumes/My Passport/monarda_fistulosa_segmentation/failed_download_urls.txt"

# create the destination (and any missing parents); no-op when it already exists
os.makedirs(output_dir, exist_ok=True)

# start every run with an empty failure log: opening in "w" mode creates the
# file or truncates whatever a previous run left behind
open(failed_log, "w").close()

# open the dataset

In [3]:
# load the composite dataframe; its `identifier` column holds the image urls
# and its `image_idx` column holds the values used for the file names
color_df = pd.read_csv(filepath_or_buffer='./filtered_labeled_data.csv')
color_df

Unnamed: 0,image_idx,hex,rgb,hsl,lab,gbifID,identifier,latitude,longitude
0,0,#eeddf6,"(238, 221, 246)","(0.7799999999999999, 0.5813953488372098, 0.915...","(230, 138, 118)",923911394,https://inaturalist-open-data.s3.amazonaws.com...,48.826305,-102.092171
1,2,#c7aac4,"(199, 170, 196)","(0.8505747126436781, 0.20567375886524827, 0.72...","(186, 143, 119)",923910407,https://inaturalist-open-data.s3.amazonaws.com...,43.613086,-73.057076
2,4,#bb9ed5,"(187, 158, 213)","(0.7545454545454545, 0.3956834532374102, 0.727...","(177, 149, 104)",899970365,https://inaturalist-open-data.s3.amazonaws.com...,43.066871,-87.890565
3,7,#9175a0,"(145, 117, 160)","(0.7751937984496124, 0.18454935622317595, 0.54...","(136, 147, 109)",891778924,https://inaturalist-open-data.s3.amazonaws.com...,42.140556,-87.831643
4,9,#b796e2,"(183, 150, 226)","(0.7390350877192983, 0.5671641791044776, 0.737...","(172, 155, 94)",891760719,https://inaturalist-open-data.s3.amazonaws.com...,38.679240,-97.990035
...,...,...,...,...,...,...,...,...,...
20623,41059,#b3a5d8,"(179, 165, 216)","(0.7124183006535948, 0.39534883720930225, 0.74...","(180, 143, 104)",1024218211,https://inaturalist-open-data.s3.amazonaws.com...,41.569419,-88.150552
20624,41060,#ae66b9,"(174, 102, 185)","(0.8112449799196787, 0.3721973094170403, 0.562...","(138, 170, 96)",1024202810,https://inaturalist-open-data.s3.amazonaws.com...,45.057871,-87.168277
20625,41061,#a88dc4,"(168, 141, 196)","(0.7484848484848484, 0.3179190751445088, 0.660...","(160, 149, 103)",1024200169,https://inaturalist-open-data.s3.amazonaws.com...,42.921738,-88.026752
20626,41063,#baada2,"(186, 173, 162)","(0.076388888888889, 0.14814814814814817, 0.682...","(182, 131, 135)",1024198670,https://inaturalist-open-data.s3.amazonaws.com...,40.791723,-80.492498


# download the images using the URLs in the `identifier` column, naming each file after its `image_idx` value

In [None]:
# Loop through the (image_idx, url) pairs and download each image to
# `<output_dir>/<image_idx>.jpg`. Any request-level failure (bad URL, HTTP
# error status, timeout, dropped connection) is printed and the URL is appended
# to `failed_log`; the loop then moves on to the next image.
for idx, url in zip(color_df.image_idx, color_df.identifier):
    # filename using the df's image_idx value
    file_path = os.path.join(output_dir, f"{idx}.jpg")
    # stream into a temporary sibling first, so an interrupted transfer never
    # leaves a truncated image under the final name
    partial_path = file_path + ".part"

    try:
        # request with a timeout; with stream=True the connection is only
        # released once the response is closed, so use it as a context manager
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Check for HTTP errors

            # write the image to the temporary file in chunks
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)

        # the whole body arrived: move it into place in a single step
        os.replace(partial_path, file_path)

        # space out requests to be nice
        time.sleep(0.2)

    except requests.exceptions.RequestException as e:
        # discard whatever was written before the failure
        if os.path.exists(partial_path):
            os.remove(partial_path)

        error_message = f"Failed to download {url}. Error: {e}\n"
        print(error_message)
        # add the failed url to the log file immediately (one url per line).
        # Formatted rather than concatenated because `url` may not be a str
        # (e.g. NaN from an empty `identifier` cell), and `url + "\n"` would
        # then raise TypeError inside this handler and abort the whole run.
        with open(failed_log, "a") as f:
            f.write(f"{url}\n")