#  Download full SAFE archive as .zip and upload to kaggle

 __To use the features in this notebook you need to visit https://dataspace.copernicus.eu and create an account with Copernicus, the official governing body of Sentinel Missions for the European Space Agency (ESA). This takes about 5 minutes to do.__

In [1]:
import os
import sys
import shutil
import pandas as pd
import requests
import json
import datetime
import time
from tqdm import tqdm
import subprocess
from pathlib import Path
from dotenv import load_dotenv

# Batch setup
Steps:
- make sure your login credentials for CDSE are stored in the `.env` file in this format, with quotation marks: <br><br>
CDSE_email = 'youremail' <br>
CDSE_password = 'yourpassword'<br><br>
- change satellite, if S2A or S2B, depending on batch <br><br>
- change startDate and endDate to reflect the time period for your batch
<br><br>
- change output_dir so that the folder name reflects the region & time period for your batch<br><br>__!!Keep it strictly in PO_SX_YY format!!__ (e.g. `PO_2A_19`) <br> longer or shorter strings will break the code <br><br>
- leave query_satellite and query_tile unchanged
- if you find more tiles than written in the table please update as pictured.
- occasionally you will find duplicate records/instances taken milliseconds to seconds apart (mostly with one image predominantly blank). This is usually easy to notice because one file will be very small compared to the other, but please keep both until you can unzip them and verify that no ocean with possible annotations is visible in the scene. Please still record all SAFE archives downloaded.


### 🛰️ Sentinel-2 Data Summary

| 🛰️ Satellite Type | 📅 From       | 📅 To         | 📦 Number of SAFE Files | 💾 Estimated Size     |
|-------------------|--------------|--------------|--------------------------|------------------------|
| 1️⃣ - S2🅰️_MSIL1C         | 2015-07-04   | 2016-10-17   | 30                       | Maximum 27 GB 💽       |
| 2️⃣ - S2🅰️_MSIL1C         | 2017-02-20   | 2017-10-09   | 17                       | Maximum 17 GB 💽       |
| 3️⃣ - S2🅰️_MSIL1C         | 2018-03-27   | 2018-11-13   | 27                       | Maximum 24 GB 💽       |
| 4️⃣ - <s>S2🅰️_MSIL1C</s> ✅        | <s>2019-02-13</s>   | <s>2019-10-12</s>   | <s>17</s>  **19**                       | Maximum 15 GB 💽     |
| 5️⃣ - S2🅰️_MSIL1C         | 2020-03-16   | 2020-11-05   | 18                       | Maximum 16 GB 💽       |
| 6️⃣ - S2🅰️_MSIL1C         | 2021-03-01   | 2021-08-19   | 18                       | Maximum 16 GB 💽       |
| 7️⃣ - S2🅱️_MSIL1C         | 2017-07-05   | 2018-11-18   | 37                       | Maximum 32 GB 💽       |
| 8️⃣ - S2🅱️_MSIL1C         | 2019-02-08   | 2019-10-17   | 14                       | Maximum 13 GB 💽       |
| 9️⃣ - S2🅱️_MSIL1C         | 2020-02-20   | 2020-11-20   | 21                       | Maximum 19 GB 💽       |
| 🔟 - S2🅱️_MSIL1C         | 2021-03-29   | 2021-06-18   | 10                       | Maximum 8 GB 💽        |

In [None]:
# 1 Required satellite category
query_satellite = 'SENTINEL-2'

# 2 Strings used to select specific products by name:
#   - query_product: product-name prefix (S2A vs S2B)
#   - query_tile:    tile code of the area of interest
query_product = 'S2A_MSIL1C_'  # change to S2B_MSIL1C_ for Sentinel-2B batches
query_tile = 'T33TUL'          # stays the same

# 3 Enter a start and end date (YYYY-MM-DD).
# The query built by get_https_request uses "lt <end date>T00:00:00", so
# products acquired on the end date itself are not returned.
query_startDate = '2019-01-01'   # change as per table above
query_endDate = '2019-12-31'     # change as per table above

# 4 Load your credentials from .env
# The .env file must contain these two entries (values in quotation marks):
#   CDSE_email = 'youremail'
#   CDSE_password = 'yourpassword'
load_dotenv()
username = os.getenv("CDSE_email")
password = os.getenv("CDSE_password")
# Names used by the download cell further down.
CDSE_email = username
CDSE_password = password
if not CDSE_email or not CDSE_password:
    # Only warn: credentials are needed solely when a file actually has to be
    # downloaded, so re-runs over an already complete folder still work.
    print("WARNING: CDSE_email / CDSE_password not found in the environment or .env file; "
          "downloads will fail until they are set.")

# 5 Set output folder:
output_dir = './SAFE/PO_2A_19'  # edit folder name within SAFE/ as appropriate to add batch folders
# i.e. keep format like:
# ./SAFE/PO_2A_17 for Sentinel 2A until 2017
# ./SAFE/PO_2B_18 for Sentinel 2B until 2018 etc.
# ===============================================
# ! DO NOT CHANGE THE LENGTH OF THE FOLDER NAME!
# ===============================================
# This is important for the download script to work properly


## Run as is:

In [3]:
def get_access_token(username: str, password: str) -> str:
    """Request an access token from the CDSE identity service.

    Args:
        username: CDSE account e-mail.
        password: CDSE account password.

    Returns:
        The value of the ``access_token`` field of the token response.

    Raises:
        Exception: if the request cannot be sent or the server answers with
            an HTTP error status.
    """
    data = {
        "client_id": "cdse-public",
        "username": username,
        "password": password,
        "grant_type": "password",
    }
    try:
        r = requests.post(
            "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token",
            data=data,
        )
    except requests.exceptions.RequestException as e:
        # No response object exists in this case, so report the transport
        # error itself instead of trying to read a response body.
        raise Exception(
            f"Access token creation failed. The request could not be sent: {e}"
        ) from e

    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # The error body is not guaranteed to be JSON; fall back to raw text
        # so the original failure is not masked by a decoding error.
        try:
            detail = r.json()
        except ValueError:
            detail = r.text
        raise Exception(
            f"Access token creation failed. Reponse from the server was: {detail}"
        ) from e

    print("Access token created successfully!")
    return r.json()["access_token"]

In [4]:

def get_https_request(satellite, product, tile, start_date, end_date):
    """Build the OData catalogue query URL for the given search criteria.

    The filter selects products of collection ``satellite`` whose name starts
    with ``product`` and contains ``tile``, with a content start date strictly
    between midnight of ``start_date`` and midnight of ``end_date``.
    The URL is printed and returned.
    """
    filter_clauses = [
        f"Collection/Name eq '{satellite}'",
        f"startswith(Name,'{product}')",
        f"contains(Name,'{tile}')",
        f"ContentDate/Start gt {start_date}T00:00:00.000Z",
        f"ContentDate/Start lt {end_date}T00:00:00.000Z",
    ]
    endpoint = "https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter="
    https_request = endpoint + " and ".join(filter_clauses)
    print("Query URL:", https_request)
    return https_request


def download_data(token, id, name, length, output):
    """Stream one product archive from the CDSE download service to disk.

    Args:
        token: bearer access token.
        id: product Id used in the download URL.
        name: file name, used for log messages and the progress bar label.
        length: expected size in bytes for the progress bar; may be None.
        output: path of the file to write.

    Failures (connection errors, HTTP error statuses, write errors) are
    caught and printed; nothing is raised to the caller.
    """
    url = f"https://download.dataspace.copernicus.eu/odata/v1/Products({id})/$value"
    headers = {"Authorization": f"Bearer {token}"}

    def _log(message):
        # Same "[ HH:MM:SS ] message" layout as before.
        print('[', datetime.datetime.strftime(datetime.datetime.now(), '%H:%M:%S'), '] ' + message)

    try:
        # Context managers release the session and the streamed response even
        # when the download fails half-way.
        with requests.Session() as session:
            session.headers.update(headers)
            with session.get(url, headers=headers, stream=True) as response:
                # Check the status before opening the output file, so an HTTP
                # error body is never saved under the archive's name.
                response.raise_for_status()
                _log('Downloading: ' + name)
                # The payload is written regardless of whether a length is
                # known; tqdm accepts total=None and then shows a plain counter.
                with open(output, "wb") as file, \
                        tqdm(total=length, unit="B", unit_scale=True, desc=name) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))
        _log('Download complete: ' + name)
    except Exception as e:
        _log('Download failed: ' + name)
        print(f"An exception occured: {e}")


def get_file_name(name):
    """Return ``name`` with the product suffix replaced by ``.zip``.

    The suffix to replace is chosen from the module-level ``query_satellite``.
    For a satellite that is not listed, an empty string is returned.
    """
    suffix_by_satellite = {
        'SENTINEL-1': ".SAFE",
        'SENTINEL-2': ".SAFE",
        'SENTINEL-3': ".SEN3",
        'SENTINEL-5P': ".nc",
        'SENTINEL-6': ".SEN6",
    }
    suffix = suffix_by_satellite.get(query_satellite)
    if suffix is None:
        return ''
    return name.replace(suffix, ".zip")


# Download non-duplicate tiles matched to litter row data. 
Make sure the path in ```litterrows = pd.read_csv('../files/s2_product_unique.csv')``` matches your folder structure or,
if using Colab, change it to './s2_product_unique.csv' and add the file to the content folder (the current working directory).

In [None]:
# Build the catalogue query URL from the batch settings.
request_url = get_https_request(
    satellite=query_satellite,
    product=query_product,
    tile=query_tile,
    start_date=query_startDate,
    end_date=query_endDate,
)
def get_all_results(url):
    """Collect the ``value`` entries of every page of a catalogue query.

    Follows ``@odata.nextLink`` until no further page is advertised, pausing
    one second between pages.

    Args:
        url: URL of the first page; a falsy value yields an empty list.

    Returns:
        List with the concatenated ``value`` entries of all pages read.

    Raises:
        requests.exceptions.HTTPError: if a page is answered with an HTTP
            error status.
    """
    all_results = []
    while url:
        response = requests.get(url)
        # Fail loudly on an HTTP error instead of treating the error body as
        # an "unexpected structure" and silently returning partial results.
        response.raise_for_status()
        page = response.json()
        if 'value' not in page:
            print('Unexpected API response structure.')
            break
        all_results.extend(page['value'])
        url = page.get('@odata.nextLink')  # Move to next page if exists
        if url:
            time.sleep(1)  # small delay between pages
    return all_results

def product_key(product_name):
    """Build the comparison key used to match catalogue products to litter-row records.

    The key is made of fixed character positions of the product name:
    [0:11] product type prefix, [11:26] sensing date/time, [39:45] tile part.
    NOTE: for a name such as
    'S2A_MSIL1C_20190312T100021_N0500_R122_T33TUL_...' the last slice yields
    '33TUL_' rather than 'T33TUL'; this is kept as-is because both sides of
    the comparison are sliced the same way.
    """
    return product_name[0:11] + '_' + product_name[11:26] + '_' + product_name[39:45]


results = get_all_results(request_url)

if not results:
    print('No data found')
    sys.exit()

df = pd.DataFrame.from_dict(results)
data_id_list = df.Id
data_name_list = df.Name
date_content_length = df.ContentLength

# Create the batch folder once, before any download starts.
os.makedirs(output_dir, exist_ok=True)

# Products with recorded litter rows. Read once here instead of once per
# catalogue entry; adjust to your path if necessary.
litterrows = pd.read_csv('../files/s2_product_unique.csv')
samples_set = {product_key(name) for name in litterrows['s2_product']}

for data_id, product_name, data_length in zip(data_id_list, data_name_list, date_content_length):
    print(product_name)
    data_name = get_file_name(product_name)
    output_file = os.path.join(output_dir, data_name)
    key = product_key(data_name.replace('.zip', ''))

    # Skip files that are already fully downloaded (size matches the
    # catalogue's ContentLength) and products without recorded litter rows.
    # A partial download has a different size and is therefore fetched again.
    if os.path.exists(output_file) and os.path.getsize(output_file) == data_length:
        print(output_file + ' this file already exists')
    elif key not in samples_set:
        print(output_file + ' has no recorded litter rows')
    else:
        # A fresh token is requested for every file, as before
        # (presumably because tokens are short-lived - confirm).
        access_token = get_access_token(CDSE_email, CDSE_password)
        download_data(access_token, data_id, data_name, data_length, output_file)

Query URL: https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter=Collection/Name eq 'SENTINEL-2' and startswith(Name,'S2A_MSIL1C_') and contains(Name,'T33TUL') and ContentDate/Start gt 2019-01-01T00:00:00.000Z and ContentDate/Start lt 2019-12-31T00:00:00.000Z
S2A_MSIL1C_20190312T100021_N0500_R122_T33TUL_20221129T200337.SAFE
./SAFE/PO_2A_19/S2A_MSIL1C_20190312T100021_N0500_R122_T33TUL_20221129T200337.zip this file already exists
S2A_MSIL1C_20190325T101021_N0500_R022_T33TUL_20221117T131900.SAFE
./SAFE/PO_2A_19/S2A_MSIL1C_20190325T101021_N0500_R022_T33TUL_20221117T131900.zip this file already exists
S2A_MSIL1C_20190315T101021_N0500_R022_T33TUL_20221116T052920.SAFE
./SAFE/PO_2A_19/S2A_MSIL1C_20190315T101021_N0500_R022_T33TUL_20221116T052920.zip this file already exists
S2A_MSIL1C_20190322T100031_N0500_R122_T33TUL_20221119T025646.SAFE
./SAFE/PO_2A_19/S2A_MSIL1C_20190322T100031_N0500_R122_T33TUL_20221119T025646.zip this file already exists
S2A_MSIL1C_20190111T100401_N0500_R122_

# Upload your batch to a new kaggle dataset

## edit here:

In [6]:
## The project root must be the directory that contains the .kaggle/ and
## SAFE/ folders (normally the /notebooks directory this notebook runs from).
## It defaults to the current working directory, which the download step
## already relies on through its relative './SAFE/...' output path.
## To use a different location, set the PROJECT_ROOT environment variable
## instead of hard-coding a machine-specific absolute path here.

project_root = os.environ.get("PROJECT_ROOT", os.getcwd())
dataset_main = Path(project_root) / "dataset_main"
kaggle_json_path = Path(project_root) / ".kaggle/kaggle.json"
kaggle_config_dir = kaggle_json_path.parent

In [None]:
# Folders whose .zip archives are copied into the Kaggle dataset folder.
# The name must match the output_dir used by the download step
# (./SAFE/PO_2A_19 in this notebook).
batch_folders = [
    Path(project_root) / "SAFE/PO_2A_19",  # change folder name!
#    Path(project_root) / "SAFE/CALAB_2A"  # add multiple if multiple batches
]
dataset_title = "Litter Rows Italy"
dataset_id = "sarahajbane/litter-windrows-batch-4" # change only the number to appropriate batch #
license_name = "CC-BY-SA-4.0"

# A batch folder that does not exist would make the copy step do nothing
# without any message, so point it out here.
for batch_folder in batch_folders:
    if not batch_folder.is_dir():
        print(f"WARNING: batch folder not found: {batch_folder}")

# Point the Kaggle CLI at the project-local credentials file.
os.environ['KAGGLE_CONFIG_DIR'] = str(kaggle_config_dir)
if not kaggle_json_path.exists():
    raise FileNotFoundError(
        f"Kaggle credentials not found at {kaggle_json_path}. "
        "Place your kaggle.json in the .kaggle/ folder of the project root."
    )
# Restrict the credentials file to owner read/write.
os.chmod(kaggle_json_path, 0o600)

dataset_main.mkdir(parents=True, exist_ok=True)

## run as is:

In [11]:
# === COPY BATCH FOLDERS INTO dataset_main ===
# Each batch folder gets a same-named sub-folder inside dataset_main; archives
# already present there are left untouched.
for src_folder in batch_folders:
    batch_name = src_folder.name
    dest_folder = dataset_main / batch_name
    dest_folder.mkdir(parents=True, exist_ok=True)

    for archive in src_folder.glob("*.zip"):
        target = dest_folder / archive.name
        if target.exists():
            print(f"Skipped (already exists): {archive.name} in {batch_name}")
            continue
        shutil.copy2(archive, target)
        print(f"Copied: {archive.name} → {batch_name}")

# Create the Kaggle metadata template on first use, then fill in the title,
# id and license configured for this batch.
metadata_path = dataset_main / "dataset-metadata.json"
if not metadata_path.exists():
    # `kaggle datasets init` takes the target folder via -p.
    subprocess.run(["kaggle", "datasets", "init", "-p", str(dataset_main)], check=True)

with open(metadata_path, "r", encoding="utf-8") as f:
    metadata = json.load(f)

metadata["title"] = dataset_title
metadata["id"] = dataset_id
metadata["licenses"] = [{
    "name": license_name,
    "title": "Creative Commons Attribution Share-Alike 4.0",
    "path": "https://creativecommons.org/licenses/by-sa/4.0/"
}]

with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=4)


## edit here:

In [None]:
# === CREATE OR VERSION KAGGLE DATASET ===
def upload_or_version_kaggle_dataset(folder_path, message="Batch 04 zip folders"): # enter batch number
    """Create the Kaggle dataset from ``folder_path``, or add a new version.

    `kaggle datasets create` is tried first. If it exits with an error, its
    output is printed and `kaggle datasets version` is run with ``message``
    as the version note.

    Args:
        folder_path: folder holding dataset-metadata.json and the data.
        message: version note used when a new version is created.
    """
    try:
        result = subprocess.run([
            "kaggle", "datasets", "create",
            "-p", str(folder_path),
            "--dir-mode", "zip"
        ], capture_output=True, text=True, check=True)
        print("Dataset created successfully!")
        print(result.stdout)
        return
    except subprocess.CalledProcessError as e:
        # A failing `create` does not by itself prove the dataset exists, so
        # show everything the CLI reported before trying a new version.
        print("Dataset creation failed (it may already exist). Creating a new version...")
        print(e.stdout)
        print(e.stderr)

    version_result = subprocess.run([
        "kaggle", "datasets", "version",
        "-p", str(folder_path),
        "-m", message,
        "--dir-mode", "zip"
    ], capture_output=True, text=True)
    print(version_result.stdout)
    print(version_result.stderr)
    if version_result.returncode != 0:
        # Make a failed fallback visible instead of ending silently.
        print("Kaggle versioning failed:")
        print(f"exit code {version_result.returncode}")

# === RUN ===
upload_or_version_kaggle_dataset(dataset_main)

In [None]:
# Write a short report file into every batch folder.
for folder_path in batch_folders:
    file_path = os.path.join(folder_path, 'processing_report.txt')
    # Count only the .zip archives (the files the copy step picks up), so the
    # report file itself and other stray files do not inflate the number.
    tile_count = sum(
        1 for entry in Path(folder_path).glob('*.zip')
        if entry.is_file()
    )

    with open(file_path, 'w') as file:
        file.write(f'Upload finished, {tile_count} tiles processed and uploaded to {dataset_id}')
    print(f'Created report in {file_path}')

print("All tiles uploaded successfully!")

Created report in /Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/notebooks/SAFE/CORSI_2A/processing_report.txt
All tiled uploaded successfully!


# Finished! 

 Once the upload has completed, you can check it by following the link to the Kaggle dataset. When you have confirmed the upload, please run the final code block, update the notebook and push your changes to GitHub, or let us know on Slack that you have finished the upload.

# If your upload gets interrupted or you want to add more image batches to the existing dataset:

In [34]:
# NOTE(review): breakpoint() drops the kernel into the interactive debugger and
# waits for input. It presumably serves as a manual stop so that "Run All" does
# not continue into the re-versioning cell below - confirm, and consider
# removing it before sharing the notebook.
breakpoint()

__Edit version message only:__

In [None]:
version_message = "Added PO_2A_19 zip folder"  # edit message appropriately

def version_kaggle_dataset(folder_path, message="Updated dataset"):
    """Run `kaggle datasets version` for ``folder_path`` with ``message`` as the note.

    The command's standard output is always printed; when the command exits
    with a non-zero code, its standard error is printed as well.
    """
    command = [
        "kaggle", "datasets", "version",
        "-p", str(folder_path),
        "-m", message,
        "--dir-mode", "zip",
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    print(completed.stdout)
    if completed.returncode == 0:
        return
    print("Kaggle versioning failed:")
    print(completed.stderr)

# === RUN ===
version_kaggle_dataset(dataset_main, version_message)


In [None]:
# NOTE(review): breakpoint() drops the kernel into the interactive debugger and
# waits for input. It presumably serves as a manual stop so that "Run All" does
# not reach the file-deletion cell below - confirm, and consider removing it
# before sharing the notebook.
breakpoint()


In [None]:

def delete_all_files_in_directory(directory_path):
    """Delete every regular file directly inside ``directory_path``.

    Sub-directories and their contents are left untouched. If
    ``directory_path`` is not an existing directory, a message is printed and
    nothing is deleted.

    Args:
        directory_path: folder whose files are removed (str or Path).
    """
    if not os.path.isdir(directory_path):
        # Avoids an unexplained FileNotFoundError from os.listdir when the
        # folder name is wrong or the folder was already cleaned up.
        print(f"Nothing to delete, not an existing directory: {directory_path}")
        return
    for file_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file_name)
        if os.path.isfile(file_path):
            os.remove(file_path)
            print(f"Deleted: {file_path}")

# This deletes every regular file directly inside the chosen sub-folder of
# dataset_main (sub-directories are left untouched).
# Run this only if you have finished your upload and no longer want the files stored locally.
# To delete the archives from the SAFE folder as well, point directory_path at
# the corresponding batch folder under SAFE/ instead.
# NOTE(review): "CORSI_2B" does not match the PO_2A_19 batch used earlier in
# this notebook - confirm the folder name before running.
# Proceed with caution

directory_path = Path(dataset_main) / "CORSI_2B"
delete_all_files_in_directory(directory_path)
