## Load metadata

Note: After running this cell, you should have a file named `dev_set.csv` in the current working directory.

In [1]:
# Install dependencies: the dvc and dvc-gdrive packages.
# All output of the command is redirected to /dev/null, so installation
# errors will not be visible here.
!pip install dvc dvc-gdrive &> /dev/null

# Get metadata from the PADs dataset registry: fetch dev_set.csv for this dataset.
# Output is discarded here as well; if a later cell cannot find dev_set.csv,
# re-run this command without the redirection to see what went wrong.
!dvc get  https://github.com/PaperAnalyticalDeviceND/pad_dataset_registry datasets/FHI2020_Stratified_Sampling/dev_set.csv  &> /dev/null

# Dataset name; later cells use it as the name of the local dataset folder.
dataset_name = 'FHI2020_Stratified_Sampling'

# Path to the metadata CSV, relative to the current working directory
# (the file the dvc command above is expected to create).
metadata_path = 'dev_set.csv'


## Visualize the data

In [2]:
# Load the metadata CSV into a pandas DataFrame
import pandas as pd

data = pd.read_csv(filepath_or_buffer=metadata_path)

# A bare expression on the last line makes the notebook render the frame
# as a rich table
data

Unnamed: 0,id,sample_id,sample_name,quantity,camera_type_1,url,hashlib_md5,image_name
0,15214,53698,amoxicillin,100,Google Pixel 3a,https://pad.crc.nd.edu//var/www/html/images/pa...,5a2e10df1774f1941e0b5268da64ee6a,15214__53698__amoxicillin__100.png
1,15215,53691,amoxicillin,100,Google Pixel 3a,https://pad.crc.nd.edu//var/www/html/images/pa...,3ccc5f7a45a7eb0d0d4cd6ae84e409b0,15215__53691__amoxicillin__100.png
2,15216,53698,amoxicillin,100,Google Pixel 3a,https://pad.crc.nd.edu//var/www/html/images/pa...,654d7fd30b4a33db3c0f99cb5494de3c,15216__53698__amoxicillin__100.png
3,15218,53691,amoxicillin,100,Google Pixel 3a,https://pad.crc.nd.edu//var/www/html/images/pa...,d66df94b627ec91f53960fa1d27f8ee3,15218__53691__amoxicillin__100.png
4,15221,53694,amoxicillin,100,Google Pixel 3a,https://pad.crc.nd.edu//var/www/html/images/pa...,935d35bcc61257d40dcac5724c212954,15221__53694__amoxicillin__100.png
...,...,...,...,...,...,...,...,...
5657,25629,55503,ripe,20,samsung SM-A505F,https://pad.crc.nd.edu//var/www/html/images/pa...,26df8c35f27c421d1983d8ed79521071,25629__55503__ripe__20.png
5658,25631,55068,ripe,20,samsung SM-A505F,https://pad.crc.nd.edu//var/www/html/images/pa...,21b5e8f84c6e5643feca160f38282315,25631__55068__ripe__20.png
5659,25633,55491,ripe,20,samsung SM-A505F,https://pad.crc.nd.edu//var/www/html/images/pa...,222126c9dbc73935c8166b8c523afe91,25633__55491__ripe__20.png
5660,25641,55501,ripe,20,samsung SM-A505F,https://pad.crc.nd.edu//var/www/html/images/pa...,8db98a33f7edf97e512ad6ce2c76e928,25641__55501__ripe__20.png


## Download Dataset

Functions

In [3]:
import csv, os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def download_file(url, filename, images_path, timeout=30):
    """Download a file from a URL and save it to a local file.

    The body is streamed into a temporary ``<filename>.part`` file that is
    renamed to its final name only after it has been written completely, so
    an interrupted transfer never leaves a truncated file under the final
    name.

    Args:
        url: Address of the file to fetch.
        filename: Name of the file to create inside ``images_path``.
        images_path: Existing directory that receives the file.
        timeout: Seconds to wait for the server (to connect, and between
            received bytes) before giving up. Without a timeout a stalled
            server would block the calling thread indefinitely.

    Returns:
        True if the file was saved, False if the server answered with a
        status code other than 200 (nothing is written in that case).

    Raises:
        requests.RequestException: On network errors or timeouts.
        OSError: If the file cannot be written to ``images_path``.
    """
    path = os.path.join(images_path, filename)
    partial_path = path + '.part'

    # Using the response as a context manager releases the connection on
    # every exit path, including the early return for non-200 answers.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return False

        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            # Publish the file under its final name only once it is complete.
            os.replace(partial_path, path)
        finally:
            # After a successful rename the partial file no longer exists;
            # this only cleans up after an interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    return True

def download_files_from_csv_file(file_path, images_path, max_workers=10):
    """Download files in parallel based on URLs from a CSV file with a progress bar.

    Every row of the CSV must provide a ``url`` column (where to download
    from) and an ``image_name`` column (the file name to save under
    ``images_path``). A failing download does not stop the remaining ones;
    all failures are collected and summarized once the batch has finished.

    Args:
        file_path: Path to the CSV file listing the files to download.
        images_path: Existing directory that receives the downloaded files.
        max_workers: Number of downloads that run concurrently.

    Returns:
        None. A summary of the failed downloads, if any, is printed.
    """
    # Read all rows up front so the CSV file is closed before the
    # (potentially long) download phase starts.
    with open(file_path, newline='') as csvfile:
        rows = list(csv.DictReader(csvfile))

    failed = []  # (image_name, reason) for every download that did not succeed

    # The context managers guarantee that the worker threads are joined and
    # the progress bar is closed even if something below raises.
    with tqdm(total=len(rows), desc="Downloading files") as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            pending = []
            for row in rows:
                image_name = row['image_name']
                future = executor.submit(download_file, row['url'], image_name, images_path)
                # Advance the bar as soon as a download finishes, whatever
                # order the downloads complete in.
                future.add_done_callback(lambda _done: pbar.update())
                pending.append((image_name, future))

            for image_name, future in pending:
                try:
                    saved = future.result()
                except Exception as exc:
                    # Record the error and keep going instead of aborting
                    # the whole batch on the first failure.
                    failed.append((image_name, repr(exc)))
                else:
                    # Only an explicit False is treated as a failure, since
                    # download_file may also return None.
                    if saved is False:
                        failed.append((image_name, 'download was not saved'))

    if failed:
        shown = 20  # keep the summary short when many files fail
        print(f"{len(failed)} of {len(rows)} files could not be downloaded:")
        for image_name, reason in failed[:shown]:
            print(f"  {image_name}: {reason}")
        if len(failed) > shown:
            print(f"  ... and {len(failed) - shown} more")

In [4]:
# Local import: shutil is only needed by this cell
import shutil

# Folder to save the images inside the dataset folder
images_path = os.path.join(dataset_name, 'dev_images')

# Create the dataset folder and its image sub-folder in one call.
# exist_ok=True makes the cell safe to re-run (a plain `mkdir` reports an
# error when the folder already exists).
os.makedirs(images_path, exist_ok=True)

# Copy the metadata file into the dataset folder
shutil.copy(metadata_path, os.path.join(dataset_name, metadata_path))

In [None]:
# Start downloading every image listed in the metadata CSV into images_path.
# The CSV lists several thousand images, so this cell takes a while to finish.
download_files_from_csv_file(metadata_path, images_path)

Downloading files:   9%|▉         | 506/5662 [00:59<10:25,  8.24it/s]

### Save the dataset

You have the option to save the dataset either on your computer or in a folder in your Google Drive.


#### Save it in a folder in your Google Drive (recommended)

In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable
# inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')
# Destination folder inside the mounted drive
my_path = "/content/drive/MyDrive/"

# Recursively copy the whole dataset folder to the destination
!cp -r $dataset_name/ $my_path

#### Save it on your computer (slow)


Uncomment the lines in the cell below, then run it.

In [None]:
# from google.colab import files

# !zip -r $dataset_name.zip $dataset_name/ &> /dev/null
# files.download(f"{dataset_name}.zip")