# Prerequisites

Import the required packages (install `boto3` and `tqdm` first if they are not already available in the runtime).

In [1]:
import boto3
from pathlib import Path
from botocore import UNSIGNED
from botocore.client import Config
from tqdm.notebook import tqdm

# Download files from the S3 bucket

Get a list of all files in the bucket and download them into their respective folders within the Colab runtime.

In [2]:
def get_file_folders(s3_client, bucket_name, prefix=""):
    """List every object key under ``prefix`` in an S3 bucket.

    Pages through ``list_objects_v2`` until a response no longer carries a
    ``NextContinuationToken``.

    Args:
        s3_client: Client exposing ``list_objects_v2`` (e.g. a boto3 S3 client).
        bucket_name: Name of the bucket to list.
        prefix: Only keys starting with this prefix are listed; the default
            empty string lists the whole bucket.

    Returns:
        A ``(file_names, folders)`` tuple of lists. Keys ending in ``"/"``
        are reported as folders, every other key as a file. Both lists are
        empty when nothing matches.
    """
    file_names = []
    folders = []
    request_kwargs = {"Bucket": bucket_name, "Prefix": prefix}

    while True:
        response = s3_client.list_objects_v2(**request_kwargs)

        # "Contents" is missing from the response when no keys match, so
        # fall back to an empty page instead of iterating over None.
        for entry in response.get("Contents") or []:
            key = entry["Key"]
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        # The token is only present while more pages remain.
        next_token = response.get("NextContinuationToken")
        if not next_token:
            break
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders

In [3]:
def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Mirror the given S3 keys beneath ``local_path``.

    Every entry of ``folders`` is created as a directory first; each key in
    ``file_names`` is then fetched with ``s3_client.download_file`` into the
    same relative location. Both passes are wrapped in ``tqdm``.
    """
    root = Path(local_path)

    # First pass: make sure every listed folder exists locally.
    for folder_key in tqdm(folders):
        (root / folder_key).mkdir(parents=True, exist_ok=True)

    # Second pass: fetch each object, creating its parent directory on demand.
    for key in tqdm(file_names):
        destination = root / key
        destination.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, key, str(destination))

In [4]:
# signature_version=UNSIGNED: requests are sent without signing, so no
# credentials are set up in this cell.
client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
file_names, folders = get_file_folders(
    s3_client=client,
    bucket_name='eyes-on-the-ground',
)

In [6]:
# Mirror every listed key beneath the relative "data" directory.
download_files(
    s3_client=client,
    bucket_name="eyes-on-the-ground",
    local_path="data",
    file_names=file_names,
    folders=folders,
)

0it [00:00, ?it/s]

  0%|          | 0/34733 [00:00<?, ?it/s]

# Archive the images to download from Colab

## Create zip archive of images

In [None]:
from zipfile import ZipFile
import os

def get_all_file_paths(directory):
    """Return the path of every file found beneath ``directory``.

    The tree is traversed with ``os.walk``; each result is the walked folder
    joined with the file name, in the order the walk yields them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]

def create_zip_file(directory, zip_path):
    """Zip every file beneath ``directory`` into a new archive at ``zip_path``.

    Each file is stored under its path relative to ``directory``. For a flat
    directory that is just the file name; for nested directories the
    sub-folder is kept, so two files sharing a name in different sub-folders
    do not end up as duplicate archive entries.

    Args:
        directory: Root folder whose files are archived.
        zip_path: Destination of the archive, opened in ``'w'`` mode.
    """
    # calling function to get all file paths in the directory
    file_paths = get_all_file_paths(directory)

    # printing the list of all files to be zipped
    print('Following files will be zipped:')
    for file_name in file_paths:
        print(file_name)

    # Named `archive` rather than `zip` so the builtin zip() is not shadowed.
    with ZipFile(zip_path, 'w') as archive:
        for path in file_paths:
            # Relative to `directory` instead of splitting on '/': nesting is
            # preserved and the result does not depend on the path separator.
            archive.write(path, os.path.relpath(path, directory))

    print('All files zipped successfully!')


In [None]:
# One archive per image folder: zip_names[i] is built from folders[i].
zip_names = ['/content/train.zip', '/content/test.zip']

# NOTE(review): this rebinds `folders`, which an earlier cell fills with the
# S3 folder keys; re-running the download cell after this one would pass
# these paths instead.
# NOTE(review): the download cell writes under the relative "data" directory;
# confirm the images really are at /content/train and /content/test.
folders = ['/content/train', '/content/test']
for zip_name,folder in zip(zip_names, folders):
    create_zip_file(folder, zip_name)

## Download data to local storage

Save the zipped images and metadata to the downloads folder on the local machine.

In [None]:
from google.colab import files

# Two metadata files plus the train/test archives named in `zip_names`.
# NOTE(review): README.md and train.csv are expected directly under /content,
# while the download cell writes under "data" — confirm these paths exist.
data_files = [
    '/content/README.md',
    '/content/train.csv',
    '/content/train.zip',
    '/content/test.zip'
    ]

# files.download is called once per path, in list order.
for pth in data_files:
    files.download(pth)