# Accessing Multisprectral Satellite Imagery from Copernicus Data Space Ecosystem CDCE via API

In [None]:
pip install -r ../../requirements.txt

The Sentinel Hub API is a RESTful API interface that provides access to various satellite imagery archives. It allows you to access raw satellite data, rendered images, statistical analysis, and other features. 

To use the features in this notebook you need to visit https://dataspace.copernicus.eu and create an account with Copernicus, the official governing body of Sentinel Missions for the European Space Agency (ESA).

In [1]:
import os
import sys
import kaggle
import json


from sentinelhub import (SHConfig,
    DataCollection,
    SentinelHubCatalog,
    SentinelHubRequest,
    SentinelHubStatistical,
    BBox,
    bbox_to_dimensions,
    CRS,
    MimeType,
    Geometry,
)
from utils import plot_image
from dotenv import load_dotenv
import requests_oauthlib as requests
import matplotlib.pyplot as plt
import numpy as np
import datetime 
import pandas as pd
from PIL import Image



  from .autonotebook import tqdm as notebook_tqdm


# FIRST TIME Credentials

Run the following cells the first time you run this notebook after retrieving your credentials for CDSE from their website. The `client_id` & `client_secret` can be obtained in your [Dashboard](https://shapps.dataspace.copernicus.eu/dashboard/#/). In the User Settings you can create a new OAuth Client to generate these credentials. For more detailed instructions, visit the relevant [documentation page](https://documentation.dataspace.copernicus.eu/APIs/SentinelHub/Overview/Authentication.html).

Now that you have your `client_id` & `client_secret`, save them in the .env file in the same directory as this script, 
by simply inserting the following lines:

CLIENT_ID= "your_client_id"
CLIENT_SECRET= "your_client_secret"

For privacy and security please make sure that ".env" is included in your .gitignore file
Once this is all done, you can proceed from here:

In [None]:
load_dotenv() 

config = SHConfig(use_defaults=True)
config.sh_client_id = os.getenv("CLIENT_ID")
config.sh_client_secret = os.getenv("CLIENT_SECRET")
config.sh_token_url = os.getenv("TOKEN_URL")
config.sh_base_url = os.getenv("BASE_URL")
config.save("cdse")

Check that the complete config has also been stored on your local machine for future use,
this ensures everything is set correctly and that you can use your cedentials with other 
virtual machines for as long as your OAuth token is valid. 

In [None]:
SHConfig.get_config_location()

'/Users/sara_mac/.config/sentinelhub/config.toml'

This workflow works with credentials and configurations specific to the free CDSE platform, not Sentinel Hub, which requires a subscription after a 30 day free trial. 
Instructions on how to configure your Sentinel Hub Python package instead can be found [here](https://sentinelhub-py.readthedocs.io/en/latest/configure.html). Using these instructions you can create a profile specific to using the Sentinel Hub package for accessing Copernicus Data Space Ecosystem data collections, if you wish to do so. 

# RETRIEVE PREVIOUS Credentials
skip directly to this is you have already set up the credentials in previous cells before.

In [3]:
config = SHConfig("cdse")

In [4]:
# check that the credentials are set correctly 
if not config.sh_client_id or not config.sh_client_secret:
    print("Please provide your Sentinel Hub credentials in the .env file.")
    exit(1)

# check that the credentials are what you expect (i.e. output is the same & not None)
# NOTE: you can also set the credentials directly in the code, 
# but this is not recommended for security reasons.
# do not print the secret credentials 'id' or 'secret' in a public notebook for security reasons either
print(config.sh_token_url)
print(config.sh_base_url)

https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token
https://sh.dataspace.copernicus.eu


# Fixed bounding boxes selected for 3 Areas of Interest

Do not change the values for the selected AOIs, but add others if required.
Make sure the aoi_size is less than 2500 pixels as the maximum image length/width to request. If larger areas are needed a mosaics approach needs to be implemented and can be added at a later stage.

In [5]:
# Po River 
resolution = 10
AOI1 = 'Po River Plume'
aoi_prp =[12.45, 44.825, 12.7, 45.055]  

aoi1_bbox = BBox(bbox=aoi_prp, crs=CRS.WGS84)
aoi1_size = bbox_to_dimensions(aoi1_bbox, resolution=resolution)
print(f"Image shape for {AOI1} at {resolution} m resolution: {aoi1_size} pixels")


Image shape for Po River Plume at 10 m resolution: (2049, 2496) pixels


In [6]:
# Corsica
resolution = 10
AOI2 = 'North East Corsica'
aoi2_nec =[9.6, 42.95, 9.9, 43.155]  

aoi2_bbox = BBox(bbox=aoi2_nec, crs=CRS.WGS84)
aoi2_size = bbox_to_dimensions(aoi2_bbox, resolution=resolution)
print(f"Image shape for {AOI2} at {resolution} m resolution: {aoi2_size} pixels")

Image shape for North East Corsica at 10 m resolution: (2423, 2298) pixels


In [7]:
# Calabria
resolution = 10
AOI3 = 'South East Calabria'
aoi3_sec =[16.5, 38.35, 16.755, 38.555]  

aoi3_bbox = BBox(bbox=aoi3_sec, crs=CRS.WGS84)
aoi3_size = bbox_to_dimensions(aoi3_bbox, resolution=resolution)
print(f"Image shape for {AOI3} at {resolution}m resolution: {aoi3_size} pixels")

Image shape for South East Calabria at 10m resolution: (2185, 2314) pixels


# Download workflow 
- Match full Sentinel Catalog for S2_L1C to AOI & time period (adjust as needed, but within litterrow time period, i.e. between 01/07/2015 - 31/08/2021)
- Limit results to images with identified litter windrows
- define evalscript with bands and images to download
- Iterate through images from filtered results (i.e. with litter rows) and save them.


## Limit Catalog to products with identified & annotated litter rows

In [None]:
catalog = SentinelHubCatalog(config=config)

In [None]:
# Retrieve images from the Sentinel Hub Catalog for the specified AOI
# and time interval in representative batches

aoi_bbox = BBox(bbox=aoi_prp, crs=CRS.WGS84) # switch for aoi_nec or aoi_sec as needed
time_interval =  "2019-07-01", "2019-07-31" # adjust dates as needed with max period:
# "2015-07-01", "2021-08-31" -- total images: 3026 for prp, 754 for nec, 378 for sec

search_iterator = catalog.search(
    DataCollection.SENTINEL2_L1C,
    bbox=aoi_bbox,
    time=time_interval,
    fields={"include": ["id", "properties.datetime"], "exclude": []},
)

results = list(search_iterator)
print("Total number of results:", len(results))


Total number of results: 48


Check if any of the id's match images with positive pixels listed in LM_centroids.xlxs

In [None]:
# Function to match LW source data to the image ID 
# Iterate through the results and check for matches by removing S2A_MSIL1C_ 
# from the 'id' and keeping only the datestr and codeT which are
# listed in the LM_centroids.xlsx file based on S2L1C naming convention: 
# S2A_MSIL1C__YYYYMMDDTXXXXXX....

def check_matching_ids(results, lm_centroids_path):
    
    lm_centroids = pd.read_excel(lm_centroids_path)
    if 'Str_time' not in lm_centroids.columns:
        raise ValueError("The column 'Str_time' is not found in the provided Excel file.")
    
    str_time_set = set(lm_centroids['Str_time'])
    
    matching_ids = []
    for result in results:
        trimmed_id = result['id'][11:26]
        if trimmed_id in str_time_set:
            matching_ids.append(result['id'])
    
    return matching_ids


lm_centroids_path = "../LM_centroids.xlsx" 
matching_ids = check_matching_ids(results, lm_centroids_path)
print("Matching IDs:", matching_ids)

Matching IDs: ['S2A_MSIL1C_20190730T100031_N0500_R122_T32TQQ_20230710T202111.SAFE', 'S2A_MSIL1C_20190730T100031_N0500_R122_T33TUK_20230710T202111.SAFE', 'S2A_MSIL1C_20190730T100031_N0500_R122_T32TQR_20230710T202111.SAFE', 'S2A_MSIL1C_20190730T100031_N0500_R122_T33TUL_20230710T202111.SAFE', 'S2B_MSIL1C_20190725T100039_N0500_R122_T32TQQ_20230619T024542.SAFE', 'S2B_MSIL1C_20190725T100039_N0500_R122_T33TUK_20230619T024542.SAFE', 'S2B_MSIL1C_20190725T100039_N0500_R122_T32TQR_20230619T024542.SAFE', 'S2B_MSIL1C_20190725T100039_N0500_R122_T33TUL_20230619T024542.SAFE', 'S2A_MSIL1C_20190723T101031_N0500_R022_T32TQQ_20230718T015529.SAFE', 'S2A_MSIL1C_20190723T101031_N0500_R022_T33TUK_20230718T015529.SAFE', 'S2A_MSIL1C_20190723T101031_N0500_R022_T32TQR_20230718T015529.SAFE', 'S2A_MSIL1C_20190723T101031_N0500_R022_T33TUL_20230718T015529.SAFE', 'S2A_MSIL1C_20190720T100031_N0500_R122_T32TQQ_20230715T235221.SAFE', 'S2A_MSIL1C_20190720T100031_N0500_R122_T33TUK_20230715T235221.SAFE', 'S2A_MSIL1C_2019072

In [None]:
print(f"{len(matching_ids)} matching images found between {time_interval[0]} and {time_interval[1]} for {AOI1}.") 
# exchange for AOI2, AOI3, as req.

24 matching images found between 2019-07-01 and 2019-07-31 for Po River Plume.


Filter images needed:

In [None]:
new_results = [
    {key: record[key] for key in record if key in ["id", "properties"]}
    for record in results if record["id"] in matching_ids
]

## Define Eval script of bands for download and how to combine the returned outputs 
(i.e. RGB image with invisible bands, FDI, NVDI, or custom false colour images)
This evalscript includes only four bands for workflow trial, but can be extended to include all bands available, as well as calculated indexes such as NVDI etc. More documentation on the components and how to write/edit eval scripts can be found in the [doumentation](https://documentation.dataspace.copernicus.eu/APIs/SentinelHub/Evalscript/V3.html).

Based on Booth et al. (2023) (who used Marida + PLP) the following 4 bands were selected:<br>
bands: ["B04", "B06", "B08", "B11"] , i.e. // red, red edge + NIR + SWIR
<br>
The authors reported the Map Mapper-Opt "model trained with only 4 bands (the ones contributing to calculating FDI and NDVI) demonstrated good (better) performance than using all 13 Sentinel-2 bands. However, it is possible that other band combinations could improve model performance. Removing some of the lower resolution bands, as well as bands where wavelengths do not correlate with plastic materials, may reduce noise in the data set."

Complete list of bands that can be selected from L1C are:
["B01", "B03", "B03", "B04", "B05", "B06", "B07", "B08", "B08a", "B09", "B10, "B11, "B12"]


Within the eval script the function evaluatePixel(sample) return value determines how data from the various multi spectral index (MSI) bands are handled. In this case it simple returns a list with the seperate values for each of the four selected bands. To combine red, green & blue(RGB) (B04, B03, B02) for a real 'true color' composites, these would be combined within the funstions return section of the code. 
The same goes for the calculations/combinations used to get the Normalized Difference Vegetation Index (NDVI) and/or the Floating Debris Index (FDI). Code snippets to get these indexes & combinations will be included shortly.

In [None]:
evalscript_true_color = """
//VERSION=3

function setup() {
    return {
        input: [{
            bands: ["B04", "B06", "B08", "B11"] 
        }],
        output: {
            bands: 4
        }
    };
}

function evaluatePixel(sample) {
    return [sample.B04, sample.B06, sample.B08, sample.B11]; // R + RE + NIR + SWIR in seperate files
}
"""

# to change outputs saved to real true colour composites, false composites or indexes, such as NVDI & FDI 
# the function evaluatePixel(sample) return value needs to be adjusted. Pre-set evals scripts will be added for each option shortly 
# Generate requests for each matching ID
requests_true_color = []
for matching_id in matching_ids:
    request = SentinelHubRequest(
        evalscript=evalscript_true_color,
        input_data=[
            SentinelHubRequest.input_data(
                data_collection=DataCollection.SENTINEL2_L1C,
                identifier=matching_id,
            time_interval=time_interval,
            other_args={"dataFilter": {"mosaickingOrder": "leastCC"}},
            )
        ],
        responses=[SentinelHubRequest.output_response("default", MimeType.TIFF)],
        bbox=aoi_bbox, #e xchange for aoi2_bbox for Corsica , aoi3_bbox for Calabria as req.
        size=aoi1_size, # exchange for aoi2_size for Corsica , aoi3_size for Calabria as req.
        config=config,
    )
    requests_true_color.append(request)

print(f"Generated {len(requests_true_color)} requests for true color images.")

Generated 24 requests for true color images.


## Iterate through the Sentinel Products identified and save them to kaggle hub [in progress...]
### use this for batch downloads, i.e. to expand the dataset spatially & temporally - more months/other AOIs

This option now saves the retrieves Sentinal bands and eval script outputs directly to a central kaggle Dataset, which can be used as the input for other notebooks in the project (satellite correction and various models people are testing/evaluating), as well as for future use by the public

In [4]:
# Make a folder for the current batch of images you are downloading
# NOTE: this will overwrite any existing folder with the same name

# Create the directory if it doesn't exist
os.makedirs(project_root + '/kaggle_dataset/Po_River_July_2019/', exist_ok=True)

In [None]:

# Download the images
for i, request in enumerate(requests_true_color):
    response = request.get_data(save_data=True)
    print(f"Downloaded image {i + 1}/{len(requests_true_color)}: {response[0]['filename']}")
# Upload the images to Kaggle
kaggle.api.dataset_create_version(
    '/root/kaggle_dataset/Po_River_March_2020', # replace with your own folder name
    version_notes='Initial version',
    convert_to_csv=False,
    delete_old_versions=False,
)

# Create a metadata file for your dataset
metadata = {
    "title": "Po_River_July_2019", # enter the title adding AOI and timeframe in name
    "id": "sarahajbane/litter_rows",  # replace if you want to use your own and have 
    # been added as a editing collaborator to the dataset on Kaggle, message me if you want to be added.
    "licenses": [{"name": "CC0-4.0"}]  # Creative Commons license with attribution to original authors of the litter_row dataset
}
with open('/root/kaggle_dataset/dataset-metadata.json', 'w') as file:
    json.dump(metadata, file)

In [None]:


current_batch = []
# Loop through each result to fetch the corresponding image

output_dir = "~/kaggle_datasets/Po_River_July_2019" # adjust this for kaggle

for result in new_results:
    timestamp = result["properties"]["datetime"]
    result_id = result["id"] 
    print(f"Fetching image for timestamp: {timestamp}")

    # Request for a single image corresponding to the timestamp
    request_selected_bands = SentinelHubRequest(
        evalscript=evalscript_true_color,
        input_data=[
            SentinelHubRequest.input_data(
                data_collection=DataCollection.SENTINEL2_L1C.define_from(
                    name="s2l1c", service_url="https://sh.dataspace.copernicus.eu"
                ),
                time_interval=(timestamp, timestamp),  # Use specific timestamp for each result
                other_args={"dataFilter": {"mosaickingOrder": "leastCC"}},
            )
        ],
        responses=[SentinelHubRequest.output_response("default", MimeType.TIFF)],
        bbox=aoi_bbox, #exchange for aoi2_bbox for Corsica , aoi3_bbox for Calabria as req.
        size=aoi1_size, #exchange for aoi2_size for Corsica , aoi3_size for Calabria as req.
        config=config,
    )

    # Get the data (the image) and append to the list
    image = request_selected_bands.get_data()
    
    # Since get_data() returns a list of images, take the first one
    current_batch.append(image[0])  # Append the first image (or modify as needed)
    
    # Save the image to the "data" folder
    image_array = image[0]
    image_path = os.path.join(output_dir, f"{result_id}.tiff")  # Use ID for naming
    Image.fromarray(image_array).save(image_path)

    print(f"Fetched and stored image for {timestamp}")

## Iterate through the Sentinel Products identified and save them locally [depracated, soon obsolete]
Use this option if you want to download images to your own machine / drive to look at and play around with, 
but all products used for future steps in the project, i.e. for use in satellite corrections, model test/train, or validation 
need to be added to kaggle so we are working on the **same data** with same bands, transformations, bounding boxes etc. 
stored in one central **public** location.

In [None]:
true_color_imgs = []
# Loop through each result to fetch the corresponding image

output_dir = "../data" # adjust this for kaggle

for result in new_results:
    timestamp = result["properties"]["datetime"]
    result_id = result["id"] 
    print(f"Fetching image for timestamp: {timestamp}")

    # Request for a single image corresponding to the timestamp
    request_true_color = SentinelHubRequest(
        evalscript=evalscript_true_color,
        input_data=[
            SentinelHubRequest.input_data(
                data_collection=DataCollection.SENTINEL2_L1C.define_from(
                    name="s2l1c", service_url="https://sh.dataspace.copernicus.eu"
                ),
                time_interval=(timestamp, timestamp),  # Use specific timestamp for each result
                other_args={"dataFilter": {"mosaickingOrder": "leastCC"}},
            )
        ],
        responses=[SentinelHubRequest.output_response("default", MimeType.TIFF)],
        bbox=aoi_bbox, #exchange for aoi2_bbox for Corsica , aoi3_bbox for Calabria as req.
        size=aoi1_size, #exchange for aoi2_size for Corsica , aoi3_size for Calabria as req.
        config=config,
    )

    # Get the data (the image) and append to the list
    image = request_true_color.get_data()
    
    # Since get_data() returns a list of images, take the first one
    true_color_imgs.append(image[0])  # Append the first image (or modify as needed)
    
    # Save the image to the "data" folder
    image_array = image[0]
    image_path = os.path.join(output_dir, f"{result_id}.tiff")  # Use ID for naming
    Image.fromarray(image_array).save(image_path)

    print(f"Fetched and stored image for {timestamp}")

# Alternative if images are already saved locally 
 &  you don't want to redownload them 
 <br>
 <br>
 !! **PLEASE** !! make sure the bands selected (esp. if not all 13) and eval script return bands are accurate if you did not use this notebook to download L1C products

In [49]:
# ALT: if you already downloaded images to your local machine, you can move them to the untracked kaggle dataset folder 
# then uncomment the last line to upload them to Kaggle, i.e. copy or move your files to this directory

!cp -r /Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/data/Po_River_July_2019/ /Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/kaggle_dataset/Po_River_July_2019/
# !kaggle datasets create -p ~/kaggle_datasets/Po_River_July_2019/ --dir-mode zip


In [50]:
# if you are workign on a local machine, add your working directory folder to the sys path
project_root = "/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/" 
# if you are working on colab use the content (default) folder instead:
# project_root = "./content"
if project_root not in sys.path:
    sys.path.append(project_root)

In [51]:
print(project_root)

/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/


In [52]:
KAGGLE_CONFIG_DIR = os.path.expanduser(project_root + ".kaggle")

In [53]:
print(KAGGLE_CONFIG_DIR)

/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/.kaggle


In [98]:
data_dir = project_root + "kaggle_dataset/Po_River_July_2019/"
if data_dir not in sys.path:
    sys.path.append(data_dir)

In [56]:
# Ensure the kaggle.json file is in the correct location
kaggle_json_path = os.path.expanduser(project_root + "/.kaggle/kaggle.json")
if not os.path.exists(kaggle_json_path):
	raise FileNotFoundError(
		f"The Kaggle API credentials file is missing. Please place your kaggle.json file at {kaggle_json_path}."
	)


Initiate a dataset in the directory where you saved the bands/images, this will create a metadata file, which we have to amend in the next step

In [57]:
!kaggle datasets init -p {data_dir}

Data package template written to: /Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/kaggle_dataset/Po_River_July_2019/dataset-metadata.json


In [87]:
# Update the metadata file to reflect what you are uploading and adding to the dataset collection
# Replace metadata from the default that wasa created in the last cell. 
# This step is not optional, the default will not be accepted by Kaggle.

# Amend as needed:
metadata = {
    "title": "litter_rows: Sentinel2 L1C bands", # enter the title adding AOI and timeframe in name
    "id": "www.kaggle.com/datasets/sarahajbane/litter_rows",  # replace if you want to use your own and have
      "resources": [
    {
      "name": "Po_River_July_2019_4B",
      #"path": "https://www.kaggle.com/datasets/sarahajbane/litter_rows",
      "description": "Sentinel2 L1C bands from CDSE - First for Po River July 2019 - FDI and NDVI bands only",
      "type": "image",
      "format": "tiff",
    }
      ],
    "licenses": [
        {
      "name": "CC-BY-SA-4.0",
      "title": "Creative Commons Attribution Share-Alike 4.0",
      "path": "https://creativecommons.org/licenses/by-sa/4.0/"
    } # Creative Commons license with proper attribution 
    ] # to original authors of the litter_row dataset
    }
with open(data_dir + 'dataset-metadata.json', 'w') as file:
    json.dump(metadata, file)

In [90]:
kaggle_path = "https://www.kaggle.com/datasets/sarahajbane/litter_rows"

In [93]:
print(type(data_dir))

<class 'str'>


In [92]:
!kaggle datasets create -p {data_dir} --dir-mode zip 


Traceback (most recent call last):
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/bin/kaggle"[0m, line [35m8[0m, in [35m<module>[0m
    sys.exit([31mmain[0m[1;31m()[0m)
             [31m~~~~[0m[1;31m^^[0m
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/lib/python3.13/site-packages/kaggle/cli.py"[0m, line [35m68[0m, in [35mmain[0m
    out = args.func(**command_args)
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/lib/python3.13/site-packages/kaggle/api/kaggle_api_extended.py"[0m, line [35m2073[0m, in [35mdataset_create_new_cli[0m
    result = self.dataset_create_new(folder, public, quiet, convert_to_csv,
                                     dir_mode)
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/lib/python3.13/site-packages/kaggle/api/kaggle

In [65]:
!kaggle datasets version -p {data_dir} -m "Update data"

/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/kaggle_dataset/Po_River_July_2019/https://www.kaggle.com/datasets/sarahajbane/litter-windrows does not exist


In [None]:
!kaggle datasets version -p /path/to/dataset -m "Add new satellite bands: PRP 07/2019 - 4 bands"

In [97]:
!kaggle datasets version -p {data_dir} -m "Update data"

Traceback (most recent call last):
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/bin/kaggle"[0m, line [35m8[0m, in [35m<module>[0m
    sys.exit([31mmain[0m[1;31m()[0m)
             [31m~~~~[0m[1;31m^^[0m
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/lib/python3.13/site-packages/kaggle/cli.py"[0m, line [35m68[0m, in [35mmain[0m
    out = args.func(**command_args)
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/lib/python3.13/site-packages/kaggle/api/kaggle_api_extended.py"[0m, line [35m1920[0m, in [35mdataset_create_version_cli[0m
    result = self.dataset_create_version(
        folder,
    ...<3 lines>...
        delete_old_versions=delete_old_versions,
        dir_mode=dir_mode)
  File [35m"/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/omdena/lib/python3

# 

# Last task outstanding before complete:
- Create a function to save images/bands directly to kagglehub storage regardless where the code is ran, if possible, i.e. https://www.kaggle.com/datasets/sarahajbane/litter-windrows 
- Create proper true colour image from RGB bands and add transformations for NDVI and FDI from appropriate bands, creating appropriate outputs for MapMapper workflow and other model inputs output


# Notes kaggle API
 to save outputs directly to kaggle hub

This section ensures all relevant Sentinel Products (bands/images) relating to the litter_rows will be saved to a public kagglehub [dataset]("https://www.kaggle.com/datasets/sarahajbane/litter-windrows") to have one stable centralised data storage location and streamline an input directory path for further use in the Satellite Detection algorithm project.  
(i.e. for satellite corrections, training, testing & validation of current and additional model versions), 
as well as an open source resource for future models or other open-source use cases for the public, similar to MARIDA dataset.

Step 1: Install the Kaggle API if not already installed. 
If you are using a virtual environment, you may need to install the Kaggle API with --user outside of it first for it to work
with a regular pip install command (i.e. pip install kaggle) within the venv afterwards

!pip install --user kaggle

make sure you have the /.kaggle/kaggle.json file in the correct location, i.e. your home directory or the current working directory if 
using colab or jupyter notebook

- Linux: $XDG_CONFIG_HOME/kaggle/kaggle.json (defaults to ~/.config/kaggle/kaggle.json). 
- The path ~/.kaggle/kaggle.json which was used by older versions of the tool is also still supported.




# Sagar



In [100]:
# Set the project root
project_root = "/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/"
if project_root not in sys.path:
    sys.path.append(project_root)
print(project_root)

# Set Kaggle config directory
KAGGLE_CONFIG_DIR = os.path.expanduser(project_root + ".kaggle")
print(KAGGLE_CONFIG_DIR)

# Path to data directory to Po_River
data_dir = project_root + "kaggle_dataset/Po_River_July_2019/"
if data_dir not in sys.path:
    sys.path.append(data_dir)

# Check for kaggle.json credentials
kaggle_json_path = os.path.expanduser(project_root + "/.kaggle/kaggle.json")
if not os.path.exists(kaggle_json_path):
    raise FileNotFoundError(f"The Kaggle API credentials file is missing. Please place your kaggle.json file at {kaggle_json_path}.")

# Function to create or update a Kaggle dataset from a local folder
def create_kaggle_dataset_from_folder(
    folder_path,
    title,
    dataset_id,
    description="Sentinel-2 L1C subset",
    license_name="CC-BY-SA-4.0"
):
    folder_path = Path(folder_path)
    assert folder_path.exists(), "Folder does not exist!"

    metadata_path = folder_path / "dataset-metadata.json"
    image_files = [f.name for f in folder_path.glob("*.tif*")]

    resources = [
        {
            "name": Path(img).stem,
            "path": img,
            "description": f"Image: {img}",
            "type": "image",
            "format": "tiff"
        } for img in image_files
    ]

    metadata = {
        "title": title,
        "id": dataset_id,
        "licenses": [{
            "name": license_name,
            "title": "Creative Commons Attribution Share-Alike 4.0",
            "path": "https://creativecommons.org/licenses/by-sa/4.0/"
        }],
        "resources": resources
    }

    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    # Initialize if necessary
    if not (folder_path / "dataset-metadata.json").exists():
        subprocess.run(["kaggle", "datasets", "init", "-p", str(folder_path)])

    # Create or version the dataset
    if not any((folder_path / f).exists() for f in ["dataset-metadata.json", "dataset-metadata.yml"]):
        print("No metadata found, initializing dataset.")
        subprocess.run(["kaggle", "datasets", "init", "-p", str(folder_path)])

    try:
        subprocess.run([
            "kaggle", "datasets", "create",
            "-p", str(folder_path),
            "--dir-mode", "zip"
        ], check=True)
    except subprocess.CalledProcessError:
        subprocess.run([
            "kaggle", "datasets", "version",
            "-p", str(folder_path),
            "-m", "Update data",
            "--dir-mode", "zip"
        ])

# Example call to the above function
create_kaggle_dataset_from_folder(
    folder_path="/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/kaggle_dataset/Po_River_July_2019",
    title="Po River July 2019: 4-band Sentinel2",
    dataset_id="sarahajbane/litter_windrows"
)

/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/
/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/.kaggle


NameError: name 'Path' is not defined