## Sentinel-2 L1C / L2A product pairing and band download

In [2]:
import sys
import os
import pystac_client
from dotenv import load_dotenv

import requests
import pandas as pd
from datetime import datetime, timedelta
import os
from loguru import logger
import time

In [3]:
def create_cdse_query_url(
    collection_name="SENTINEL-2",
    product_type="MSIL2A",
    polygon=None,
    start_interval=None,
    end_interval=None,
    max_cloud_cover=100,
    max_items=1000,
    additional_filters=None,
    orderby="ContentDate/Start"  # Add orderby parameter with default value
):
    """
    Build an OData query URL for the Copernicus Data Space Ecosystem catalogue.

    The individual filter clauses are combined with ``and`` into a single
    ``$filter`` expression; ``$orderby`` (optional) and ``$top`` are appended
    afterwards. The URL is returned as plain text (it is not percent-encoded).

    Parameters:
    -----------
    collection_name : str
        Collection to search (e.g., 'SENTINEL-2', 'SENTINEL-1')
    product_type : str
        Substring that the product name must contain
        (e.g., 'MSIL2A', 'MSIL1C', 'GRD'); a falsy value disables the clause
    polygon : str
        WKT polygon used for the spatial intersection filter
    start_interval : str
        Exclusive lower bound on ContentDate/Start, ISO format with Z
        (e.g., '2023-01-01T00:00:00.000Z')
    end_interval : str
        Exclusive upper bound on ContentDate/Start, ISO format with Z
        (e.g., '2023-01-31T23:59:59.999Z')
    max_cloud_cover : int
        Maximum cloud cover percentage; the clause is only emitted for
        'SENTINEL-2' and only when the value is below 100
    max_items : int
        Value of the ``$top`` parameter
    additional_filters : list
        Extra, ready-made filter clauses appended after the built-in ones
    orderby : str or None
        Value of ``$orderby`` (e.g., 'ContentDate/Start',
        'ContentDate/Start desc'); None skips ordering

    Returns:
    --------
    str
        Complete URL for the OData API query
    """
    cloud_clause = (
        f"Attributes/OData.CSC.DoubleAttribute/any(att:att/Name eq 'cloudCover' and "
        f"att/OData.CSC.DoubleAttribute/Value le {max_cloud_cover})"
    )

    # (enabled, clause) pairs, listed in the order they appear in the filter
    candidates = [
        (True, f"Collection/Name eq '{collection_name}'"),
        (polygon, f"OData.CSC.Intersects(area=geography'SRID=4326;{polygon}')"),
        (product_type, f"contains(Name,'{product_type}')"),
        (start_interval, f"ContentDate/Start gt {start_interval}"),
        (end_interval, f"ContentDate/Start lt {end_interval}"),
        # Cloud cover only makes sense for the optical Sentinel-2 collection
        (collection_name == 'SENTINEL-2' and max_cloud_cover < 100, cloud_clause),
    ]
    clauses = [clause for enabled, clause in candidates if enabled]
    clauses.extend(additional_filters or [])

    # Query-string parameters, joined with "&" at the end
    query_params = [f"$filter={' and '.join(clauses)}"]
    if orderby:
        query_params.append(f"$orderby={orderby}")
    query_params.append(f"$top={max_items}")

    return "https://catalogue.dataspace.copernicus.eu/odata/v1/Products?" + "&".join(query_params)

In [4]:
# # Set up loguru logger
# log_filename = f"sentinel_query_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

# # Remove the default sink and add custom ones
# logger.remove()
# # Add a sink for the file with the format you want
# logger.add(log_filename, format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}")
# # Add a sink for stdout with a simpler format
# logger.add(lambda msg: print(msg, end=""), colorize=True, format="{message}")

# # Define your bounding box and date range
# bbox = [146.5, -22.0, 149.5, -20.0]
# start_date = datetime(2020, 1, 1)
# end_date = datetime(2025, 1, 1)
# max_items = 1000
# max_cloud_cover = 100

# # Log query parameters
# logger.info(f"Query parameters:")
# logger.info(f"Bounding box: {bbox}")
# logger.info(f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
# logger.info(f"Max items per request: {max_items}")
# logger.info(f"Max cloud cover: {max_cloud_cover}%")
# # Generate the polygon string from bbox [minx, miny, maxx, maxy]
# polygon = f"POLYGON (({bbox[0]} {bbox[1]}, {bbox[0]} {bbox[3]}, {bbox[2]} {bbox[3]}, {bbox[2]} {bbox[1]}, {bbox[0]} {bbox[1]}))"

# # Initialize empty lists to store all results
# all_l1c_results = []
# all_l2a_results = []

# # Loop through the date range with a step of 10 days
# current_date = start_date
# while current_date < end_date:
#     # Calculate the end of the current 10-day interval
#     next_date = min(current_date + timedelta(days=10), end_date)

#     # Format the dates as required for the OData query (ISO format with Z for UTC)
#     start_interval = f"{current_date.strftime('%Y-%m-%dT00:00:00.000Z')}"
#     end_interval = f"{next_date.strftime('%Y-%m-%dT23:59:59.999Z')}"

#     date_interval = f"{current_date.strftime('%Y-%m-%d')}/{next_date.strftime('%Y-%m-%d')}"

#     try:

#         l2a_query_url = create_cdse_query_url(
#             product_type="MSIL2A",
#             polygon=polygon,
#             start_interval=start_interval,
#             end_interval=end_interval,
#             max_cloud_cover=max_cloud_cover,
#             max_items=max_items,
#             orderby="ContentDate/Start"
#         )
#         # Search for Sentinel-2 L2A products for this interval
#         l2a_json = requests.get(l2a_query_url).json()

#         # Add interval as metadata to each item
#         l2a_results = l2a_json.get('value', [])
#         for item in l2a_results:
#             item['query_interval'] = date_interval


#         l1c_query_url = create_cdse_query_url(
#             product_type="MSIL1C",
#             polygon=polygon,
#             start_interval=start_interval,
#             end_interval=end_interval,
#             max_cloud_cover=max_cloud_cover,
#             max_items=max_items,
#             orderby="ContentDate/Start"
#         )
#         # Search for Sentinel-2 L1C products for this interval
#         l1c_json = requests.get(l1c_query_url).json()

#         # Add interval as metadata to each item
#         l1c_results = l1c_json.get('value', [])
#         for item in l1c_results:
#             item['query_interval'] = date_interval

#         # Count L1C products
#         l1c_count = len(l1c_results)
#         l2a_count = len(l2a_results)

#         if l1c_count == l2a_count:
#             # Append to the overall results list
#             all_l1c_results.extend(l1c_results)
#             all_l2a_results.extend(l2a_results)
#         else:
#             logger.warning(f"Mismatch in counts for {date_interval}: L1C={l1c_count}, L2A={l2a_count}")

#         # Print results for this interval
#         logger.info(f"L1C Items for {date_interval}: {l1c_count}")
#         logger.info(f"L2A Items for {date_interval}: {l2a_count}")
#         logger.info("####")

#     except Exception as e:
#         logger.error(f"Error processing interval {date_interval}: {str(e)}")

#     # Move to the next n-day interval
#     current_date = next_date

# # Create DataFrames from the collected results
# df_l1c = pd.DataFrame(all_l1c_results)
# df_l2a = pd.DataFrame(all_l2a_results)

# # Log final counts
# logger.success(f"Query completed. Total L1C items: {len(df_l1c)}, Total L2A items: {len(df_l2a)}")
# logger.info(f"Log saved to {log_filename}")

# # Save DataFrames to CSV
# csv_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# l1c_csv = f"sentinel_l1c_data_{csv_timestamp}.csv"
# l2a_csv = f"sentinel_l2a_data_{csv_timestamp}.csv"

In [5]:
# Load the stored L1C and L2A product listings from CSV rather than querying
# the catalogue again (presumably exported by the query cell that is now
# commented out above -- TODO confirm provenance of these files).
df_l1c = pd.read_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/sentinel_l1c_data_20250411_175445.csv")
df_l2a = pd.read_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/sentinel_l2a_data_20250411_175445.csv")

In [6]:
len(df_l1c), len(df_l2a)

(11311, 11311)

In [7]:
# Keep only the columns used downstream. `.copy()` turns each selection into an
# independent frame, so adding the `id_key` column afterwards does not trigger
# pandas' SettingWithCopyWarning.
df_l2a = df_l2a[["Name","S3Path","Footprint","GeoFootprint"]].copy()
df_l1c = df_l1c[["Name","S3Path","Footprint","GeoFootprint"]].copy()

# Check product id matching
def remove_last_segment_rsplit(sentinel_id):
    """Return `sentinel_id` without its last "_"-separated segment.

    A value that contains no underscore is returned unchanged.
    """
    head, separator, tail = sentinel_id.rpartition('_')
    # Without a separator, rpartition puts the whole string in `tail`
    return head if separator else tail

# Pairing key shared by both processing levels: the product name without its
# last "_"-separated segment, with the L2A level tag rewritten to the L1C one.
df_l2a['id_key'] = (
    df_l2a['Name']
    .apply(remove_last_segment_rsplit)
    .str.replace('MSIL2A_', 'MSIL1C_')
)
df_l1c['id_key'] = df_l1c['Name'].apply(remove_last_segment_rsplit)

# Order both frames by the pairing key and renumber the rows
df_l1c_sorted = df_l1c.sort_values(by='id_key').reset_index(drop=True)
df_l2a_sorted = df_l2a.sort_values(by='id_key').reset_index(drop=True)

# Summary of total vs. distinct pairing keys per level
unique_l1c_count = len(df_l1c_sorted['id_key'].unique())
unique_l2a_count = len(df_l2a_sorted['id_key'].unique())
print(f"Number of L1C products: {len(df_l1c_sorted)}")
print(f"Number of L2A products: {len(df_l2a_sorted)}")
print(f"Number of unique L1C products: {unique_l1c_count}")
print(f"Number of unique L2A products: {unique_l2a_count}")
print(f"Difference unique nb of products (L2A - L1C): {unique_l2a_count - unique_l1c_count}")

# Drop rows whose "Name" repeats (first occurrence wins), working from the
# sorted frames
df_l2a_unique = df_l2a_sorted.drop_duplicates(subset="Name", keep="first")
df_l1c_unique = df_l1c_sorted.drop_duplicates(subset="Name", keep="first")

# The working frames are the de-duplicated ones with a fresh 0..n-1 index
df_l2a = df_l2a_unique.reset_index(drop=True)
df_l1c = df_l1c_unique.reset_index(drop=True)

Number of L1C products: 11311
Number of L2A products: 11311
Number of unique L1C products: 8674
Number of unique L2A products: 8674
Difference unique nb of products (L2A - L1C): 0


In [8]:
df_l1c.tail(5)

Unnamed: 0,Name,S3Path,Footprint,GeoFootprint,id_key
8782,S2C_MSIL1C_20241213T002731_N9905_R016_T55KES_2...,/eodata/Sentinel-2/MSI/L1C/2024/12/13/S2C_MSIL...,geography'SRID=4326;POLYGON ((147.531131965078...,"{'type': 'Polygon', 'coordinates': [[[147.5311...",S2C_MSIL1C_20241213T002731_N9905_R016_T55KES
8783,S2C_MSIL1C_20241213T002731_N9905_R016_T55KET_2...,/eodata/Sentinel-2/MSI/L1C/2024/12/13/S2C_MSIL...,geography'SRID=4326;POLYGON ((146.999808181059...,"{'type': 'Polygon', 'coordinates': [[[146.9998...",S2C_MSIL1C_20241213T002731_N9905_R016_T55KET
8784,S2C_MSIL1C_20241213T002731_N9905_R016_T55KET_2...,/eodata/Sentinel-2/MSI/L1C/2024/12/13/S2C_MSIL...,geography'SRID=4326;POLYGON ((147.061646842367...,"{'type': 'Polygon', 'coordinates': [[[147.0616...",S2C_MSIL1C_20241213T002731_N9905_R016_T55KET
8785,S2C_MSIL1C_20241217T001141_N9905_R073_T55KGR_2...,/eodata/Sentinel-2/MSI/L1C/2024/12/17/S2C_MSIL...,geography'SRID=4326;POLYGON ((149.178651078187...,"{'type': 'Polygon', 'coordinates': [[[149.1786...",S2C_MSIL1C_20241217T001141_N9905_R073_T55KGR
8786,S2C_MSIL1C_20241217T001141_N9905_R073_T55KGS_2...,/eodata/Sentinel-2/MSI/L1C/2024/12/17/S2C_MSIL...,geography'SRID=4326;POLYGON ((149.411624290498...,"{'type': 'Polygon', 'coordinates': [[[149.4116...",S2C_MSIL1C_20241217T001141_N9905_R073_T55KGS


In [9]:
df_l2a.tail(5)

Unnamed: 0,Name,S3Path,Footprint,GeoFootprint,id_key
8782,S2C_MSIL2A_20241213T002731_N9905_R016_T55KES_2...,/eodata/Sentinel-2/MSI/L2A/2024/12/13/S2C_MSIL...,geography'SRID=4326;POLYGON ((147.531264070563...,"{'type': 'Polygon', 'coordinates': [[[147.5312...",S2C_MSIL1C_20241213T002731_N9905_R016_T55KES
8783,S2C_MSIL2A_20241213T002731_N9905_R016_T55KET_2...,/eodata/Sentinel-2/MSI/L2A/2024/12/13/S2C_MSIL...,geography'SRID=4326;POLYGON ((146.999808181059...,"{'type': 'Polygon', 'coordinates': [[[146.9998...",S2C_MSIL1C_20241213T002731_N9905_R016_T55KET
8784,S2C_MSIL2A_20241213T002731_N9905_R016_T55KET_2...,/eodata/Sentinel-2/MSI/L2A/2024/12/13/S2C_MSIL...,geography'SRID=4326;POLYGON ((147.061646842367...,"{'type': 'Polygon', 'coordinates': [[[147.0616...",S2C_MSIL1C_20241213T002731_N9905_R016_T55KET
8785,S2C_MSIL2A_20241217T001141_N9905_R073_T55KGR_2...,/eodata/Sentinel-2/MSI/L2A/2024/12/17/S2C_MSIL...,geography'SRID=4326;POLYGON ((149.178651078187...,"{'type': 'Polygon', 'coordinates': [[[149.1786...",S2C_MSIL1C_20241217T001141_N9905_R073_T55KGR
8786,S2C_MSIL2A_20241217T001141_N9905_R073_T55KGS_2...,/eodata/Sentinel-2/MSI/L2A/2024/12/17/S2C_MSIL...,geography'SRID=4326;POLYGON ((149.411624290498...,"{'type': 'Polygon', 'coordinates': [[[149.4116...",S2C_MSIL1C_20241217T001141_N9905_R073_T55KGS


In [10]:
# Draw ONE random set of row labels and apply it to both frames, so that the
# i-th sampled L1C row and the i-th sampled L2A row come from the same
# position. Sampling each frame separately with the same seed only selects the
# same positions when both frames have exactly the same length; with a shared
# label set a length mismatch fails loudly (KeyError) instead of silently
# producing misaligned pairs.
sampled_labels = df_l1c.sample(n=2500, random_state=42).index
df_l1c = df_l1c.loc[sampled_labels]
df_l2a = df_l2a.loc[sampled_labels]

In [11]:
# Renumber both samples 0..n-1, discarding the labels left over from sampling
df_l1c, df_l2a = (frame.reset_index(drop=True) for frame in (df_l1c, df_l2a))


In [12]:
df_l1c

Unnamed: 0,Name,S3Path,Footprint,GeoFootprint,id_key
0,S2A_MSIL1C_20220607T002721_N0400_R016_T55KER_2...,/eodata/Sentinel-2/MSI/L1C/2022/06/07/S2A_MSIL...,geography'SRID=4326;POLYGON ((147.32780425815 ...,"{'type': 'Polygon', 'coordinates': [[[147.3278...",S2A_MSIL1C_20220607T002721_N0400_R016_T55KER
1,S2A_MSIL1C_20231010T002701_N0509_R016_T55KDR_2...,/eodata/Sentinel-2/MSI/L1C/2023/10/10/S2A_MSIL...,geography'SRID=4326;POLYGON ((146.032866526591...,"{'type': 'Polygon', 'coordinates': [[[146.0328...",S2A_MSIL1C_20231010T002701_N0509_R016_T55KDR
2,S2A_MSIL1C_20221111T002101_N0400_R116_T55KFR_2...,/eodata/Sentinel-2/MSI/L1C/2022/11/11/S2A_MSIL...,geography'SRID=4326;POLYGON ((147.966746816606...,"{'type': 'Polygon', 'coordinates': [[[147.9667...",S2A_MSIL1C_20221111T002101_N0400_R116_T55KFR
3,S2B_MSIL1C_20230316T002059_N0510_R116_T55KGT_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2023/03/16/S2...,geography'SRID=4326;POLYGON ((148.909918459371...,"{'type': 'Polygon', 'coordinates': [[[148.9099...",S2B_MSIL1C_20230316T002059_N0510_R116_T55KGT
4,S2B_MSIL1C_20220226T001109_N0510_R073_T55KGS_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2022/02/26/S2...,geography'SRID=4326;POLYGON ((149.412819069152...,"{'type': 'Polygon', 'coordinates': [[[149.4128...",S2B_MSIL1C_20220226T001109_N0510_R073_T55KGS
...,...,...,...,...,...
2495,S2A_MSIL1C_20240204T002051_N0510_R116_T55KER_2...,/eodata/Sentinel-2/MSI/L1C/2024/02/04/S2A_MSIL...,geography'SRID=4326;POLYGON ((146.999806640161...,"{'type': 'Polygon', 'coordinates': [[[146.9998...",S2A_MSIL1C_20240204T002051_N0510_R116_T55KER
2496,S2A_MSIL1C_20231024T001111_N0509_R073_T55KGR_2...,/eodata/Sentinel-2/MSI/L1C/2023/10/24/S2A_MSIL...,geography'SRID=4326;POLYGON ((149.180879988832...,"{'type': 'Polygon', 'coordinates': [[[149.1808...",S2A_MSIL1C_20231024T001111_N0509_R073_T55KGR
2497,S2B_MSIL1C_20230207T002709_N0510_R016_T55KDT_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2023/02/07/S2...,geography'SRID=4326;POLYGON ((146.044318921720...,"{'type': 'Polygon', 'coordinates': [[[146.0443...",S2B_MSIL1C_20230207T002709_N0510_R016_T55KDT
2498,S2B_MSIL1C_20210604T002059_N0500_R116_T55KDS_2...,/eodata/Sentinel-2/MSI/L1C_N0500/2021/06/04/S2...,geography'SRID=4326;POLYGON ((146.885021787865...,"{'type': 'Polygon', 'coordinates': [[[146.8850...",S2B_MSIL1C_20210604T002059_N0500_R116_T55KDS


In [13]:
df_l2a

Unnamed: 0,Name,S3Path,Footprint,GeoFootprint,id_key
0,S2A_MSIL2A_20220607T002721_N0400_R016_T55KER_2...,/eodata/Sentinel-2/MSI/L2A/2022/06/07/S2A_MSIL...,geography'SRID=4326;POLYGON ((147.32780425815 ...,"{'type': 'Polygon', 'coordinates': [[[147.3278...",S2A_MSIL1C_20220607T002721_N0400_R016_T55KER
1,S2A_MSIL2A_20231010T002701_N0509_R016_T55KDR_2...,/eodata/Sentinel-2/MSI/L2A/2023/10/10/S2A_MSIL...,geography'SRID=4326;POLYGON ((146.032866526591...,"{'type': 'Polygon', 'coordinates': [[[146.0328...",S2A_MSIL1C_20231010T002701_N0509_R016_T55KDR
2,S2A_MSIL2A_20221111T002101_N0400_R116_T55KFR_2...,/eodata/Sentinel-2/MSI/L2A/2022/11/11/S2A_MSIL...,geography'SRID=4326;POLYGON ((147.966746816606...,"{'type': 'Polygon', 'coordinates': [[[147.9667...",S2A_MSIL1C_20221111T002101_N0400_R116_T55KFR
3,S2B_MSIL2A_20230316T002059_N0510_R116_T55KGT_2...,/eodata/Sentinel-2/MSI/L2A_N0500/2023/03/16/S2...,geography'SRID=4326;POLYGON ((148.909918459371...,"{'type': 'Polygon', 'coordinates': [[[148.9099...",S2B_MSIL1C_20230316T002059_N0510_R116_T55KGT
4,S2B_MSIL2A_20220226T001109_N0510_R073_T55KGS_2...,/eodata/Sentinel-2/MSI/L2A_N0500/2022/02/26/S2...,geography'SRID=4326;POLYGON ((149.412819069152...,"{'type': 'Polygon', 'coordinates': [[[149.4128...",S2B_MSIL1C_20220226T001109_N0510_R073_T55KGS
...,...,...,...,...,...
2495,S2A_MSIL2A_20240204T002051_N0510_R116_T55KER_2...,/eodata/Sentinel-2/MSI/L2A/2024/02/04/S2A_MSIL...,geography'SRID=4326;POLYGON ((146.999806640161...,"{'type': 'Polygon', 'coordinates': [[[146.9998...",S2A_MSIL1C_20240204T002051_N0510_R116_T55KER
2496,S2A_MSIL2A_20231024T001111_N0509_R073_T55KGR_2...,/eodata/Sentinel-2/MSI/L2A/2023/10/24/S2A_MSIL...,geography'SRID=4326;POLYGON ((149.180879988832...,"{'type': 'Polygon', 'coordinates': [[[149.1808...",S2A_MSIL1C_20231024T001111_N0509_R073_T55KGR
2497,S2B_MSIL2A_20230207T002709_N0510_R016_T55KDT_2...,/eodata/Sentinel-2/MSI/L2A_N0500/2023/02/07/S2...,geography'SRID=4326;POLYGON ((146.044318921720...,"{'type': 'Polygon', 'coordinates': [[[146.0443...",S2B_MSIL1C_20230207T002709_N0510_R016_T55KDT
2498,S2B_MSIL2A_20210604T002059_N0500_R116_T55KDS_2...,/eodata/Sentinel-2/MSI/L2A_N0500/2021/06/04/S2...,geography'SRID=4326;POLYGON ((146.885021787865...,"{'type': 'Polygon', 'coordinates': [[[146.8850...",S2B_MSIL1C_20210604T002059_N0500_R116_T55KDS


In [14]:
# Compare the pairing keys of both samples row by row and report each result
pair_count = min(len(df_l1c), len(df_l2a))
for row in range(pair_count):
    l1c_key = df_l1c['id_key'][row]
    l2a_key = df_l2a['id_key'][row]
    if l1c_key != l2a_key:
        print(f"Mismatch: {l1c_key} != {l2a_key}")
        continue
    print(f"Match: {l1c_key} == {l2a_key}")

Match: S2A_MSIL1C_20220607T002721_N0400_R016_T55KER == S2A_MSIL1C_20220607T002721_N0400_R016_T55KER
Match: S2A_MSIL1C_20231010T002701_N0509_R016_T55KDR == S2A_MSIL1C_20231010T002701_N0509_R016_T55KDR
Match: S2A_MSIL1C_20221111T002101_N0400_R116_T55KFR == S2A_MSIL1C_20221111T002101_N0400_R116_T55KFR
Match: S2B_MSIL1C_20230316T002059_N0510_R116_T55KGT == S2B_MSIL1C_20230316T002059_N0510_R116_T55KGT
Match: S2B_MSIL1C_20220226T001109_N0510_R073_T55KGS == S2B_MSIL1C_20220226T001109_N0510_R073_T55KGS
Match: S2B_MSIL1C_20230329T002709_N0509_R016_T55KDT == S2B_MSIL1C_20230329T002709_N0509_R016_T55KDT
Match: S2A_MSIL1C_20200222T001101_N0500_R073_T55KGR == S2A_MSIL1C_20200222T001101_N0500_R073_T55KGR
Match: S2B_MSIL1C_20210115T002059_N0500_R116_T55KGS == S2B_MSIL1C_20210115T002059_N0500_R116_T55KGS
Match: S2A_MSIL1C_20240224T002051_N0510_R116_T55KFR == S2A_MSIL1C_20240224T002051_N0510_R116_T55KFR
Match: S2B_MSIL1C_20231231T002059_N0510_R116_T55KDS == S2B_MSIL1C_20231231T002059_N0510_R116_T55KDS


In [63]:
# Persist the paired samples. `DataFrame.to_csv` returns None when it is given
# a path, so its result must NOT be assigned back to `df_l1c` / `df_l2a`:
# doing so would replace both dataframes with None and break every later cell
# that still reads them.
df_l1c.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/input_l1c.csv")
df_l2a.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/output_l2a.csv")

In [64]:
# import pandas as pd
# from sklearn.model_selection import train_test_split

# # Set random state for reproducibility
# random_state = 42

# # Split L1C data into train, validation, and test sets (70%/20%/10%)
# # First split into train and temp, then split temp into validation and test
# train_l1c, temp_l1c = train_test_split(df_l1c_sampled, test_size=0.3, random_state=random_state)
# val_l1c, test_l1c = train_test_split(temp_l1c, test_size=0.33, random_state=random_state)
# # This creates a 70-20-10 split because 0.3*0.33 = 0.099 (approximately 10%)

# # Split L2A data into train, validation, and test sets (70%/20%/10%)
# train_l2a, temp_l2a = train_test_split(df_l2a_sampled, test_size=0.3, random_state=random_state)
# val_l2a, test_l2a = train_test_split(temp_l2a, test_size=0.33, random_state=random_state)

# # Print the split sizes to verify (corrected to use sampled dataframes in percentage calculation)
# print(f"L1C - Training: {len(train_l1c)} ({len(train_l1c)/len(df_l1c_sampled):.1%})")
# print(f"L1C - Validation: {len(val_l1c)} ({len(val_l1c)/len(df_l1c_sampled):.1%})")
# print(f"L1C - Test: {len(test_l1c)} ({len(test_l1c)/len(df_l1c_sampled):.1%})")

# print(f"L2A - Training: {len(train_l2a)} ({len(train_l2a)/len(df_l2a_sampled):.1%})")
# print(f"L2A - Validation: {len(val_l2a)} ({len(val_l2a)/len(df_l2a_sampled):.1%})")
# print(f"L2A - Test: {len(test_l2a)} ({len(test_l2a)/len(df_l2a_sampled):.1%})")

# # Save to CSV files
# train_l1c.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/train_l1c.csv", index=False)
# val_l1c.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/val_l1c.csv", index=False)
# test_l1c.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/test_l1c.csv", index=False)

# train_l2a.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/train_l2a.csv", index=False)
# val_l2a.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/val_l2a.csv", index=False)
# test_l2a.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/test_l2a.csv", index=False)


In [65]:
# df_l1c_sampled.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/input_l1c.csv", index=False)
# df_l2a_sampled.to_csv("/mnt/disk/dataset/sentinel-ai-processor/V0/output_l2a.csv", index=False)

In [12]:
import folium
from folium.plugins import HeatMap
import json
import numpy as np
from shapely.geometry import shape

# Function to extract center of a GeoJSON polygon
def get_polygon_center(geo_json_str):
    """Return the centroid of a GeoJSON-like geometry string as [lat, lon].

    Single quotes in the input are swapped for double quotes before it is
    parsed as JSON. Any failure while parsing or building the geometry is
    printed and reported as None.
    """
    try:
        json_text = geo_json_str.replace("'", '"')
        geometry = shape(json.loads(json_text))
        center_point = geometry.centroid
        # folium wants [lat, lon], i.e. [y, x]
        return [center_point.y, center_point.x]
    except Exception as exc:
        print(f"Error processing GeoJSON: {exc}")
        return None

# Centroid ([lat, lon]) of every L2A footprint that could be parsed;
# footprints for which get_polygon_center returned None are dropped
centers = [
    center
    for center in map(get_polygon_center, df_l2a['GeoFootprint'])
    if center
]

# Centre the base map on the mean latitude / longitude of all centroids
center_lat = np.mean([lat for lat, _ in centers])
center_lon = np.mean([lon for _, lon in centers])
m = folium.Map(location=[center_lat, center_lon], zoom_start=8)

# Add the heatmap layer
HeatMap(centers).add_to(m)


# # Save the map
# m.save('footprint_centers_heatmap.html')

# print(f"Created heatmap with {len(centers)} points")
# print(f"Map center: Lat {center_lat:.6f}, Lon {center_lon:.6f}")
# print("Heatmap saved as 'footprint_centers_heatmap.html'")


<folium.plugins.heat_map.HeatMap at 0x702369fd7b30>

In [13]:
m

In [None]:
from dotenv import load_dotenv
notebook_dir = os.path.abspath('')
project_root = os.path.dirname(notebook_dir)
sys.path.append(project_root)

# Now import the module
from src.auth.auth import S3Connector
from src.utils.utils import extract_s3_path_from_url
from src.utils.stac_client import get_product, get_product_content

# Credentials come from environment variables (load_dotenv() is called first
# so that values from a .env file are available) instead of being hard-coded.
load_dotenv()
ACCESS_KEY_ID = os.environ.get("ACCESS_KEY_ID")
SECRET_ACCESS_KEY = os.environ.get("SECRET_ACCESS_KEY")
ENDPOINT_URL = 'https://eodata.dataspace.copernicus.eu'
ENDPOINT_STAC = "https://stac.dataspace.copernicus.eu/v1/"
BUCKETNAME = "eodata"
connector = S3Connector(
    endpoint_url=ENDPOINT_URL,
    access_key_id=ACCESS_KEY_ID,
    secret_access_key=SECRET_ACCESS_KEY,
    region_name='default'
)
# Get S3 client and resource from the connector instance
s3 = connector.get_s3_resource()
s3_client = connector.get_s3_client()
buckets = connector.list_buckets()
print("Available buckets:", buckets)
# Use the BUCKETNAME constant rather than repeating the bucket name literal
bucket = s3.Bucket(BUCKETNAME)

Available buckets: ['EODATA', 'DIAS']


In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
from typing import Optional

def parse_safe_manifest(content: str) -> Optional[pd.DataFrame]:
    """
    Parse the XML of a Sentinel SAFE manifest and list every referenced file.

    Args:
        content (str): XML text of the manifest. The value is handed directly
            to ``ET.fromstring``, so the raw bytes read from an S3 response
            body are accepted as well.

    Returns:
        pd.DataFrame: One row per non-empty ``href`` attribute found anywhere
            in the document, with the columns
            ``href`` (value as written in the manifest),
            ``file_type`` (extension of the file name without the dot, or
            'unknown' when the file name has no extension) and
            ``file_name`` (last path component of the href).
            None is returned if the XML cannot be parsed or any other error
            occurs; the error is logged.
    """
    try:
        root = ET.fromstring(content)

        # Every element carrying a non-empty href attribute, in document order
        hrefs = [
            href
            for href in (elem.get('href') for elem in root.findall(".//*[@href]"))
            if href
        ]

        file_names = [os.path.basename(href) for href in hrefs]
        # Take the extension from the file name only: hrefs are relative paths
        # such as "./rep_info/README", so splitting the whole href on "." would
        # return a path fragment for extension-less files.
        file_types = [
            os.path.splitext(name)[1].lstrip('.') or 'unknown'
            for name in file_names
        ]

        return pd.DataFrame({
            'href': hrefs,
            'file_type': file_types,
            'file_name': file_names
        })

    except ET.ParseError as e:
        logger.error(f"XML parsing error : {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Error processing manifest: {str(e)}")
        return None


def filter_band_files(df_files, bands=None, product_type=None, resolution=None):
    """
    Filter a dataframe for Sentinel-2 band files supporting both L1C and L2A formats.

    Args:
        df_files (pd.DataFrame): DataFrame with 'href' column containing file paths
        bands (list, optional): List of band names to filter for (e.g., ['B02', 'B03', 'B04']).
                               If None, defaults to RGB bands. Each name is inserted into a
                               regular expression as-is.
        product_type (str, optional): Product type ('L1C' or 'L2A', case-insensitive).
                               If None, both types are included.
        resolution (str or int, optional): Specific resolution to filter for L2A products
                               ('10m', '20m', '60m' or 10, 20, 60).
                               If None, includes all resolutions.

    Returns:
        pd.DataFrame: Copy of the rows whose 'href' matches one of the requested band
                      files, with a leading './' removed from 'href'. The result is empty
                      (not an error) when no pattern applies, e.g. for an empty band list
                      or an unrecognised product type.
    """
    # Define default bands to filter if not specified
    if bands is None:
        bands = ['B02', 'B03', 'B04']  # RGB bands by default

    # Normalise the resolution to its bare number: both 10 and '10m' become
    # '10', because the patterns below append the 'm' suffix themselves.
    if resolution is not None:
        resolution = str(resolution).strip()
        if resolution.lower().endswith('m'):
            resolution = resolution[:-1]

    product = product_type.upper() if product_type is not None else None

    # Build regex patterns to match both L1C and L2A formats
    band_patterns = []
    for band in bands:
        # L1C format: IMG_DATA/*_B02.jp2
        if product is None or product == 'L1C':
            band_patterns.append(r'IMG_DATA/.*_' + band + r'\.jp2')

        # L2A format: IMG_DATA/R20m/T55KGR_20200103T001101_B02_20m.jp2
        if product is None or product == 'L2A':
            # A specific resolution restricts the search; otherwise all three are used
            resolutions = [resolution] if resolution else ['10', '20', '60']
            band_patterns.extend(
                r'IMG_DATA/R' + res + r'm/.*_' + band + r'_' + res + r'm\.jp2'
                for res in resolutions
            )

    # Start from an all-False mask aligned with df_files so that an empty
    # pattern list selects nothing instead of indexing the frame with a scalar.
    mask = pd.Series(False, index=df_files.index)
    for pattern in band_patterns:
        mask = mask | df_files['href'].str.contains(pattern, regex=True, na=False)

    df_gr = df_files[mask].copy()  # Copy so the assignment below does not warn

    # Remove leading ./ from href paths
    df_gr['href'] = df_gr['href'].str.replace(r'^\./', '', regex=True)

    return df_gr

In [None]:
# Dataset layout on disk: <base_dir>/input and <base_dir>/output.
# Change dataset_version to target a different dataset version.
dataset_version = "V0"
base_dir = f"/mnt/disk/dataset/sentinel-ai-processor/{dataset_version}"

input_dir, output_dir = (os.path.join(base_dir, leaf) for leaf in ("input", "output"))

# Create both folders up front; existing folders are left untouched
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

In [None]:
def download_bands(s3_client, bucket, bucket_name, df, bands, product_type, resolution, output_dir, max_attempts=10, retry_delay=2, row_slice=slice(100, 110)):
    """
    Download Sentinel-2 band files from S3 based on dataframe information.

    For every selected product the manifest.safe file is fetched (with
    retries), parsed, filtered for the requested bands, and each matching
    file is downloaded into ``<output_dir>/<product id>/``.

    Args:
        s3_client: S3 client used to fetch the manifest via ``get_object``
        bucket: S3 bucket object used to download the band files via ``download_file``
        bucket_name (str): Name of the bucket the manifest is read from
        df (pd.DataFrame): DataFrame with 'S3Path' column containing S3 paths
        bands (list): List of bands to download
        product_type (str): Product type ('L1C' or 'L2A')
        resolution (int, optional): Resolution in meters. Required for L2A products.
        output_dir (str): Base directory to save files
        max_attempts (int): Maximum number of manifest download attempts per product
        retry_delay (int): Seconds to wait between manifest download attempts
        row_slice (slice): Rows of ``df`` to process. Defaults to rows 100-109,
            the subset this function previously hard-coded; pass ``slice(None)``
            to process every row.
    """
    for _, row in df[row_slice].iterrows():
        # Extract base S3 URL
        s3_base_url = extract_s3_path_from_url(row['S3Path']).replace("/eodata","")
        s3_manifest_url = f"{s3_base_url}/manifest.safe"

        # Try to download the manifest, honouring the caller's max_attempts
        content = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = s3_client.get_object(Bucket=bucket_name, Key=s3_manifest_url)
                status = response["ResponseMetadata"]['HTTPStatusCode']
                if status == 200:
                    content = response['Body'].read()
                    logger.info(f"Downloaded manifest from {s3_manifest_url}")
                    break
                logger.warning(f"Unexpected status: {status}")
            except Exception as e:
                logger.warning(f"Error downloading manifest: {str(e)}")

            # No point in waiting after the final attempt
            if attempt < max_attempts:
                time.sleep(retry_delay)

        if content is None:
            logger.error(f"Failed to download manifest after {max_attempts} attempts, skipping this product")
            continue

        df_tmp = parse_safe_manifest(content=content)
        if df_tmp is None:
            # parse_safe_manifest already logged the cause
            logger.error(f"Could not parse manifest {s3_manifest_url}, skipping this product")
            continue

        df_bands = filter_band_files(df_tmp, bands=bands, product_type=product_type, resolution=resolution)

        # Product ID for the folder structure: last component of the product
        # path without ".SAFE". S3 keys always use "/" regardless of the
        # local OS separator.
        path_safe = s3_base_url.rstrip('/').rsplit('/', 1)[-1].replace(".SAFE","")
        path_save = os.path.join(output_dir, path_safe)

        for gr in df_bands['href']:
            # Create full S3 URL for the band file
            band_s3_url = f"{s3_base_url}/{gr}"

            # Extract just the filename from the path
            filename = os.path.basename(gr)

            os.makedirs(path_save, exist_ok=True)

            print(f"Downloading{band_s3_url}")

            # Download the file
            bucket.download_file(band_s3_url, f"{path_save}/{filename}")
            logger.info(f"Download {filename} to {path_save}")

In [None]:
def download_manifest(s3_client, bucket_name, s3_path, max_attempts=5, retry_delay=2):
    """
    Download and parse a Sentinel-2 product manifest file from S3.

    Args:
        s3_client: Boto3 S3 client
        bucket_name (str): S3 bucket name
        s3_path (str): Base S3 path to the product
        max_attempts (int): Maximum number of download attempts
        retry_delay (int): Seconds to wait between retry attempts

    Returns:
        pd.DataFrame or None: The files listed in the manifest (as produced by
            ``parse_safe_manifest``), or None when the manifest could not be
            downloaded or parsed. The failure case returns None rather than a
            tuple so that the return type is the same on every path.
    """
    # Extract base S3 URL and create manifest URL
    s3_base_url = extract_s3_path_from_url(s3_path).replace("/eodata", "")
    s3_manifest_url = f"{s3_base_url}/manifest.safe"

    # Try to download manifest file with retry logic
    content = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = s3_client.get_object(Bucket=bucket_name, Key=s3_manifest_url)
            status = response["ResponseMetadata"]['HTTPStatusCode']
            if status == 200:
                content = response['Body'].read()
                logger.info(f"Downloaded manifest from {s3_manifest_url}")
                break
            logger.warning(f"Unexpected status: {status}")
        except Exception as e:
            logger.warning(f"Error downloading manifest: {str(e)}")

        # No point in waiting after the final attempt
        if attempt < max_attempts:
            time.sleep(retry_delay)

    if content is None:
        logger.error(f"Failed to download manifest after {max_attempts} attempts")
        return None

    # Parse the manifest into a dataframe (None if parsing fails)
    return parse_safe_manifest(content=content)



In [None]:
bands = ['B02', 'B03', 'B04', 'TCI']
bucket_name = "eodata"

all_l2a_metadata = []
metadata_columns = list(df_l2a.columns)

# For each of the first 100 L2A products: list the 60 m band files from its
# manifest and attach the product's own metadata to every band row.
for _, product in df_l2a.head(100).iterrows():
    manifest_files = download_manifest(
        s3_client=s3_client,
        bucket_name=bucket_name,
        s3_path=product['S3Path'],
        max_attempts=5,
        retry_delay=2,
    )
    # A failed download/parse does not yield a DataFrame; skip that product
    # instead of crashing the whole loop.
    if not isinstance(manifest_files, pd.DataFrame):
        logger.warning(f"No manifest available for {product['S3Path']}, skipping")
        continue

    band_files = filter_band_files(df_files=manifest_files, bands=bands, product_type="L2A", resolution=60)
    band_files[metadata_columns] = product[metadata_columns].values
    all_l2a_metadata.append(band_files)

# pd.concat raises on an empty list, so fall back to an empty frame
combined_df = pd.concat(all_l2a_metadata, ignore_index=True) if all_l2a_metadata else pd.DataFrame()
combined_df

[32m2025-04-13 08:22:03.562[0m | [1mINFO    [0m | [36m__main__[0m:[36mdownload_manifest[0m:[36m31[0m - [1mDownloaded manifest from /Sentinel-2/MSI/L2A/2024/04/02/S2B_MSIL2A_20240402T002709_N0510_R016_T55KDT_20240402T015338.SAFE/manifest.safe[0m
[32m2025-04-13 08:22:03.651[0m | [1mINFO    [0m | [36m__main__[0m:[36mdownload_manifest[0m:[36m31[0m - [1mDownloaded manifest from /Sentinel-2/MSI/L2A_N0500/2023/05/17/S2A_MSIL2A_20230517T001111_N0510_R073_T55KGR_20240810T110959.SAFE/manifest.safe[0m
[32m2025-04-13 08:22:03.735[0m | [1mINFO    [0m | [36m__main__[0m:[36mdownload_manifest[0m:[36m31[0m - [1mDownloaded manifest from /Sentinel-2/MSI/L2A/2022/08/18/S2B_MSIL2A_20220818T002059_N0400_R116_T55KFS_20220818T014455.SAFE/manifest.safe[0m
[32m2025-04-13 08:22:03.849[0m | [1mINFO    [0m | [36m__main__[0m:[36mdownload_manifest[0m:[36m31[0m - [1mDownloaded manifest from /Sentinel-2/MSI/L2A/2024/07/08/S2B_MSIL2A_20240708T002059_N0510_R116_T55KFT_20240