#  Download full SAFE archive as .zip for acolite processing

In [1]:
import os
import sys
import pandas as pd
import requests
import json
import datetime
from tqdm import tqdm
from dotenv import load_dotenv

In [16]:
# ---------------------------------------------------------------------------
# User configuration for the catalogue query and the download step.
# ---------------------------------------------------------------------------

# 1 Satellite collection to query (used as the catalogue collection name).
query_satellite = 'SENTINEL-2'

# 2 Product-name prefix (platform + processing level, e.g. L1C) and the tile
# code that must appear in the product name.
query_product = 'S2A_MSIL1C_'
query_tile = 'T33TUL'   # best tile for Po River Delta = 'T33TUL' -- other Po delta tiles (overlap) T33TUK, T32TQQ, T32TQR,
# other AOIs: CALABRIA: 'T33SXC' | NE CORSICA  = 'T32TNN'

# 3 Start and end date of the query window, as ISO dates (YYYY-MM-DD).
query_startDate = '2019-07-01'
query_endDate = '2019-07-31'

# 4 Optional GeoJSON polygon of the area of interest (currently disabled; the
# products are selected by tile code instead).
# map_geojson = './map.geojson'

# 5 Load the CDSE credentials from the .env file.
# load_dotenv() reads the .env file; os.getenv() returns None for a key that
# is not defined.
load_dotenv()
username=os.getenv("CDSE_email")
password=os.getenv("CDSE_password")
# The .env file is expected to define the two keys read above, CDSE_email and
# CDSE_password. The values are copied to the names below.
CDSE_email = username
CDSE_password = password

# 6 Output folder for the downloaded .zip archives.
# NOTE(review): the folder is named 'March2019' while the query window above
# covers July 2019 -- confirm this is the intended batch folder.
output_dir = '../data/SAFE/March2019' #edit as appropriate to add batch folders


# Disclaimer ! ! !
This code queries all records for the specified tile in the specified time period, regardless of whether windrows were annotated for them in the windrows catalogue. 
The query limits cloud cover to at most 20%, which should reduce the number of tiles without litter rows, but this is no guarantee.
The SAFE files should be checked against the LM_centroids/matched products before proceeding to ACOLITE correction.

In [25]:
def get_access_token(username: str, password: str) -> str:
    """Request an access token from the CDSE identity service.

    The credentials are posted to the OpenID Connect token endpoint using the
    ``password`` grant type and the ``cdse-public`` client id.

    Args:
        username: Account name (e-mail address) of the CDSE user.
        password: Password of the CDSE user.

    Returns:
        The ``access_token`` field of the JSON reply.

    Raises:
        Exception: If the request could not be sent, or the server answered
            with an HTTP error status. The original ``requests`` error is
            attached as ``__cause__``.
    """
    data = {
        "client_id": "cdse-public",
        "username": username,
        "password": password,
        "grant_type": "password",
    }
    try:
        r = requests.post(
            "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token",
            data=data,
        )
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        # A connection-level failure carries no response object, and an error
        # reply is not guaranteed to be JSON, so build the detail defensively
        # instead of calling r.json() unconditionally.
        failed_response = getattr(e, "response", None)
        if failed_response is None:
            detail = str(e)
        else:
            try:
                detail = failed_response.json()
            except ValueError:
                detail = failed_response.text
        raise Exception(
            f"Access token creation failed. Reponse from the server was: {detail}"
        ) from e
    print("Access token created successfully!")
    return r.json()["access_token"]

In [26]:


def get_https_request(satellite, product, tile, start_date, end_date, max_cloud_cover=20.0):
    """Build the OData catalogue query URL for the requested products.

    The filter selects products of collection ``satellite`` whose name starts
    with ``product`` and contains ``tile``, whose ``cloudCover`` attribute is
    at most ``max_cloud_cover``, and whose ``ContentDate/Start`` lies strictly
    between ``start_date`` and ``end_date`` at 00:00:00 UTC. Products that
    start on ``end_date`` itself are therefore excluded. No geographic
    (polygon) filter is applied.

    Args:
        satellite: Collection name, e.g. ``'SENTINEL-2'``.
        product: Prefix the product name must start with.
        tile: Text the product name must contain (the tile code).
        start_date: Lower bound of the window as ``YYYY-MM-DD``.
        end_date: Upper bound of the window as ``YYYY-MM-DD``.
        max_cloud_cover: Largest accepted cloud cover value; written into the
            query with two decimals. Defaults to 20.0.

    Returns:
        The query URL as a string. The URL is also printed.
    """
    base_prefix = "https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter="
    collection = (
        f"Collection/Name eq '{satellite}'"
        f" and startswith(Name,'{product}')"
        f" and contains(Name,'{tile}')"
    )
    cloud_cover = (
        "Attributes/OData.CSC.DoubleAttribute/any(att:att/Name eq 'cloudCover'"
        f" and att/OData.CSC.DoubleAttribute/Value le {max_cloud_cover:.2f})"
    )
    content_date = (
        f"ContentDate/Start gt {start_date}T00:00:00.000Z and "
        f"ContentDate/Start lt {end_date}T00:00:00.000Z"
    )
    https_request = base_prefix + " and ".join([collection, cloud_cover, content_date])
    print("Query URL:", https_request)
    return https_request


def download_data(token, id, name, length, output):
    """Stream one product from the CDSE download service into a local file.

    Failures are reported on stdout instead of being raised, so a caller that
    downloads several products in a loop continues with the next one.

    Args:
        token: Access token sent as ``Authorization: Bearer <token>``.
        id: Product identifier inserted into the download URL. (The name
            shadows the builtin; it is kept so existing callers keep working.)
        name: Label used in the log lines and as the progress-bar description.
        length: Expected size in bytes, used as the progress-bar total. When
            ``None`` the download still runs, with a progress bar that has no
            total.
        output: Path of the file to write.
    """
    url = f"https://download.dataspace.copernicus.eu/odata/v1/Products({id})/$value"
    headers = {"Authorization": f"Bearer {token}"}

    def _now():
        # Wall-clock time used as the prefix of every log line.
        return datetime.datetime.now().strftime('%H:%M:%S')

    try:
        # Context managers close the session and the streamed response on
        # both the success path and the failure path.
        with requests.Session() as session:
            session.headers.update(headers)
            with session.get(url, stream=True) as response:
                # Check the status before opening the output file so that an
                # error reply is never written to disk as if it were the
                # product.
                response.raise_for_status()
                print('[', _now(), '] ' + 'Downloading: ' + name)
                with open(output, "wb") as file, tqdm(
                    total=length, unit="B", unit_scale=True, desc=name
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            # update progress bar
                            pbar.update(len(chunk))
        print('[', _now(), '] ' + 'Download complete: ' + name)
    except Exception as e:
        # Deliberately broad: this is the per-product boundary of a batch
        # download, so any failure is logged and the batch goes on.
        print('[', _now(), '] ' + 'Download failed: ' + name)
        print(f"An exception occured: {e}")


# Derive the .zip file name under which a product (e.g. a .SAFE archive) is saved
def get_file_name(name):
    """Return the ``.zip`` file name for a product name.

    The product extension that belongs to the globally configured
    ``query_satellite`` is replaced by ``.zip``. For a satellite that is not
    listed, an empty string is returned.
    """
    # Product-name extension used by each supported collection.
    extension_by_satellite = {
        'SENTINEL-1': ".SAFE",
        'SENTINEL-2': ".SAFE",
        'SENTINEL-3': ".SEN3",
        'SENTINEL-5P': ".nc",
        'SENTINEL-6': ".SEN6",
    }
    extension = extension_by_satellite.get(query_satellite)
    if extension is None:
        return ''
    return name.replace(extension, ".zip")


Download non-duplicate tiles matched to litter row data. Make sure the path in ```litterrows = pd.read_excel('../files/LM_centroids.xlsx')``` matches your folder structure, or 
change it to './LM_centroids.xlsx' if the file is in your content folder in Colab.

In [None]:

# Query the catalogue for products matching the configured satellite, product
# prefix, tile and date window.
request_url = get_https_request(
    query_satellite, query_product, query_tile, query_startDate, query_endDate
)
JSON = requests.get(request_url).json()
if 'detail' in JSON:
    # The reply carries a 'detail' entry instead of results: show its message
    # and stop.
    print(JSON['detail']['message'])
    sys.exit()
elif 'value' in JSON:
    df = pd.DataFrame.from_dict(JSON['value'])
    # print(df.columns)
    if len(df) == 0:
        print('No data found')
        sys.exit()

    data_id_list = df.Id
    data_name_list = df.Name
    date_content_length = df.ContentLength
else:
    print('Unknown query error')
    sys.exit()

# Create the data storage path once, before any download starts.
os.makedirs(output_dir, exist_ok=True)

# Load the litter-row catalogue once; it does not change between products.
# presumably 'Str_time' holds sensing timestamps formatted like the one in the
# product name (e.g. 20190701T100031) -- verify against the spreadsheet.
litterrows = pd.read_excel('../files/LM_centroids.xlsx')
samples_set = set(litterrows['Str_time'])

for product_name, data_id, data_length in zip(
    data_name_list, data_id_list, date_content_length
):
    print(product_name)
    data_name = get_file_name(product_name)
    output_file = os.path.join(output_dir, data_name)

    # The sensing timestamp is the third '_'-separated field of the file name
    # (S2A_MSIL1C_<timestamp>_...). Reading it from the file name rather than
    # from fixed character positions of the full path keeps the match working
    # when output_dir is changed.
    # NOTE(review): assumes the Sentinel-2 naming layout -- confirm before
    # using this cell with another collection.
    name_fields = data_name.split('_')
    sensing_time = name_fields[2] if len(name_fields) > 2 else ''

    # Skip files that are already fully downloaded and products without
    # recorded litter rows.
    if os.path.exists(output_file) and os.path.getsize(output_file) == data_length:
        print(output_file + 'File already exists')
    elif sensing_time not in samples_set:
        print(output_file + ' has no recorded litter rows')
    else:
        # A new token is requested for every download.
        access_token = get_access_token(CDSE_email, CDSE_password)
        download_data(access_token, data_id, data_name, data_length, output_file)

In [29]:
# Show the catalogue query URL that was built for this run.
print(request_url)

https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter=Collection/Name eq 'SENTINEL-2' and startswith(Name,'S2A_MSIL1C_') and contains(Name,'T33TUL') and Attributes/OData.CSC.DoubleAttribute/any(att:att/Name eq 'cloudCover' and att/OData.CSC.DoubleAttribute/Value le 20.00) and ContentDate/Start gt 2019-07-01T00:00:00.000Z and ContentDate/Start lt 2019-07-31T00:00:00.000Z
