# Download full SAFE archives as .zip and upload to Kaggle

 __To use the features in this notebook you need to visit https://dataspace.copernicus.eu and create an account with Copernicus, the official governing body of Sentinel Missions for the European Space Agency (ESA). This takes about 5 minutes to do.__

In [1]:
import os
import sys
import shutil
import pandas as pd
import requests
import json
import datetime
import time
from tqdm import tqdm
import subprocess
from pathlib import Path
from dotenv import load_dotenv

# Batch setup
Steps:
- Make sure your login credentials for CDSE (Copernicus Data Space Ecosystem) are stored in the `.env` file in this format, with quotation marks: <br><br>
CDSE_email = 'youremail' <br>
CDSE_password = 'yourpassword'<br><br>
- change `query_product` to `S2A_MSIL1C_` or `S2B_MSIL1C_`, depending on the satellite for your batch <br><br>
- change startDate and endDate to reflect the time period for your batch
<br><br>
- change `output_dir` so that the folder name reflects the region & time for your batch<br><br>__!!Keep it strictly in PO_SX_YY format!!__ <br> longer or shorter strings will break the code <br><br>
- leave query_satellite and query_tile unchanged
- if you find more tiles than written in the table please update as pictured.
- occasionally you will find duplicate records / instances taken milliseconds to seconds apart (mostly with one image predominantly blank). This is usually easily noticeable because one file will be very small compared to the other, but please keep both until you can unzip them and verify that there is no ocean visible in the scene with possible annotations. Please still record all SAFE archives downloaded.


### 🛰️ Sentinel-2 Data Summary

| 🛰️ Satellite Type | 📅 From       | 📅 To         | 📦 Number of SAFE Files | 💾 Estimated Size     |
|-------------------|--------------|--------------|--------------------------|------------------------|
| 1️⃣ - S2🅰️_MSIL1C         | 2015-07-04   | 2016-10-17   | 30                       | Maximum 27 GB 💽       |
| 2️⃣ - S2🅰️_MSIL1C         | 2017-02-20   | 2017-10-09   | 17                       | Maximum 17 GB 💽       |
| 3️⃣ - S2🅰️_MSIL1C         | 2018-03-27   | 2018-11-13   | 27                       | Maximum 24 GB 💽       |
| 4️⃣ - <s>S2🅰️_MSIL1C</s> ✅        | <s>2019-02-13</s>   | <s>2019-10-12</s>   | <s>17</s>  **19**                       | Maximum 15 GB 💽     |
| 5️⃣ - <s>S2🅰️_MSIL1C</s> ✅         | <s>2020-03-16</s>   | <s>2020-11-05</s>   | <s>18</s>                      | Maximum 16 GB 💽       |
| 6️⃣ - S2🅰️_MSIL1C         | 2021-03-01   | 2021-08-19   | 18                       | Maximum 16 GB 💽       |
| 7️⃣ - S2🅱️_MSIL1C         | 2017-07-05   | 2018-11-18   | 37                       | Maximum 32 GB 💽       |
| 8️⃣ - S2🅱️_MSIL1C         | 2019-02-08   | 2019-10-17   | 14                       | Maximum 13 GB 💽       |
| 9️⃣ - S2🅱️_MSIL1C         | 2020-02-20   | 2020-11-20   | 21                       | Maximum 19 GB 💽       |
| 🔟 - S2🅱️_MSIL1C         | 2021-03-29   | 2021-06-18   | 10                       | Maximum 8 GB 💽        |

In [None]:
# 1 Required satellite category
# (used as the OData `Collection/Name` filter and to pick the archive suffix in get_file_name)
query_satellite = 'SENTINEL-2'

# 2 Strings to be included in query for retrieval of specific product by name,
# i.e S2A vs S2B, and code for AOI tile name.
# The query keeps products whose name starts with query_product and contains query_tile.
query_product = 'S2A_MSIL1C_' # change to S2B_MSIL1C_
query_tile = 'T33TUL' # stays the same

# 3 Enter a start and end date (YYYY-MM-DD).
# Both bounds are applied as strict comparisons against midnight (00:00:00.000Z) of the given day.
query_startDate = '2015-08-14'   # change as per table above
query_endDate = '2020-12-29'     # change as per table above

# 4 Load your credentials from .env
# The .env file must define CDSE_email and CDSE_password as quoted strings.
# os.getenv returns None for a variable that is not set, so a missing entry only
# shows up later, when the access token request is rejected.
load_dotenv()
username=os.getenv("CDSE_email")
password=os.getenv("CDSE_password")
# Aliases under the names used by the download cell (get_access_token call);
# these are Python variables, not the .env entries themselves.
CDSE_email = username
CDSE_password = password

# 5 Set output file:
output_dir = './SAFE/PO_2A_17' #edit folder name within SAFE/ as appropriate to add batch folders
# i.e. keep format like:
# ./SAFE/PO_2A_17 for Sentinel 2A until 2017
# ./SAFE/PO_2B_18 for Sentinel 2B until 2018 etc.
# ===============================================
# ! DO NOT CHANGE THE LENGTH OF THE FOLDER NAME!
# ===============================================
# This is important for the download script to work properly
# NOTE(review): the download cell in this notebook slices the file's basename, not the
# folder path, so the length rule may only matter to tooling outside this notebook — confirm.


## Run as is:

In [7]:
def get_access_token(username: str, password: str) -> str:
    """Request an access token from the CDSE identity service.

    Sends a password-grant request for the public ``cdse-public`` client and
    returns the ``access_token`` field of the JSON reply.

    Args:
        username: CDSE account e-mail address.
        password: CDSE account password.

    Returns:
        The access token string to send as ``Authorization: Bearer <token>``.

    Raises:
        Exception: if the request cannot be sent, or the server answers with
            an HTTP error status. The message carries the server's reply
            (JSON if it can be decoded, raw text otherwise) or the transport
            error; the underlying exception is chained as ``__cause__``.
    """
    data = {
        "client_id": "cdse-public",
        "username": username,
        "password": password,
        "grant_type": "password",
    }
    try:
        r = requests.post(
            "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token",
            data=data,
        )
    except requests.RequestException as e:
        # No response object exists on this path, so report the transport error itself.
        raise Exception(
            f"Access token creation failed. No response from the server: {e}"
        ) from e

    try:
        r.raise_for_status()
    except requests.HTTPError as e:
        # Error replies are not guaranteed to be JSON (e.g. a gateway error page).
        try:
            detail = r.json()
        except ValueError:
            detail = r.text
        raise Exception(
            f"Access token creation failed. Reponse from the server was: {detail}"
        ) from e

    print("Access token created successfully!")
    return r.json()["access_token"]

In [8]:

def get_https_request(satellite, product, tile, start_date, end_date):
    """Build the catalogue OData query URL for one satellite/product/tile and date window.

    The filter selects products in collection ``satellite`` whose name starts
    with ``product`` and contains ``tile``, with a content start date strictly
    between midnight of ``start_date`` and midnight of ``end_date``
    (both given as ``YYYY-MM-DD`` strings). The URL is printed and returned.
    """
    endpoint = "https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter="
    name_filter = (
        f"Collection/Name eq '{satellite}'"
        f" and startswith(Name,'{product}')"
        f" and contains(Name,'{tile}')"
    )
    date_filter = (
        f"ContentDate/Start gt {start_date}T00:00:00.000Z"
        f" and ContentDate/Start lt {end_date}T00:00:00.000Z"
    )
    https_request = f"{endpoint}{name_filter} and {date_filter}"
    print("Query URL:", https_request)
    return https_request


def download_data(token, id, name, length, output):
    """Stream one product archive from the CDSE download service to ``output``.

    Args:
        token: access token, sent as a Bearer authorization header.
        id: catalogue product Id, inserted into the download URL.
        name: archive file name; used for log lines and the progress bar label.
        length: expected size in bytes for the progress bar total, or None
            when unknown (the archive is still written, the bar just has no total).
        output: destination path of the archive on disk.

    Returns:
        None. Failures are reported on stdout rather than raised, so a batch
        loop calling this function continues with the next product.

    Notes:
        The HTTP status is checked before ``output`` is opened, so a rejected
        request (expired token, missing product, ...) does not leave the
        server's error body on disk under the archive's name.
    """
    url = f"https://download.dataspace.copernicus.eu/odata/v1/Products({id})/$value"
    headers = {"Authorization": f"Bearer {token}"}

    def _stamp():
        # Wall-clock time used as prefix of every log line.
        return datetime.datetime.strftime(datetime.datetime.now(), '%H:%M:%S')

    try:
        print('[', _stamp(), '] '+'Downloading: '+name)
        # Context managers release the connection, file handle and progress bar
        # on success and on failure alike.
        with requests.Session() as session:
            session.headers.update(headers)
            with session.get(url, headers=headers, stream=True) as response:
                response.raise_for_status()
                with open(output, "wb") as file, tqdm(
                    total=length, unit="B", unit_scale=True, desc=name
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip empty chunks
                            file.write(chunk)
                            pbar.update(len(chunk))
        print('[', _stamp(), '] '+'Download complete: '+name)
    except Exception as e:
        print('[', _stamp(), '] '+'Download failed: '+name)
        print(f"An exception occured: {e}")


def get_file_name(name):
    """Turn a catalogue product name into the local ``.zip`` archive name.

    The product suffix to replace depends on the module-level
    ``query_satellite`` setting. For a satellite that is not listed, an empty
    string is returned.
    """
    suffix_by_satellite = {
        'SENTINEL-1': ".SAFE",
        'SENTINEL-2': ".SAFE",
        'SENTINEL-3': ".SEN3",
        'SENTINEL-5P': ".nc",
        'SENTINEL-6': ".SEN6",
    }
    product_suffix = suffix_by_satellite.get(query_satellite)
    if product_suffix is None:
        return ''
    return name.replace(product_suffix, ".zip")


# Download non-duplicate tiles matched to litter row data. 
Make sure the path in ```litterrows = pd.read_csv('../files/s2_product_unique.csv')``` matches your folder structure or,
if using Colab, change it to './s2_product_unique.csv' and add the file to the content folder (current working dir).

In [9]:
# Build the catalogue query URL from the batch settings defined in the configuration cell.
request_url = get_https_request(
    query_satellite, query_product, query_tile, query_startDate, query_endDate
)
def get_all_results(url):
    """Collect the ``value`` records of every page of a catalogue query.

    Starting at ``url``, each JSON reply's ``value`` list is appended to the
    result and the ``@odata.nextLink`` URL, when present, is followed after a
    one-second pause. A reply without a ``value`` key stops the pagination
    with a printed notice; whatever was gathered so far is returned.
    """
    collected = []
    next_page = url
    while next_page:
        payload = requests.get(next_page).json()
        if 'value' not in payload:
            print('Unexpected API response structure.')
            break
        collected += payload['value']
        # Follow the link to the next page, if the service provided one.
        next_page = payload.get('@odata.nextLink')
        if next_page:
            time.sleep(1)  # small delay between pages
    return collected

results = get_all_results(request_url)

if not results:
    print('No data found')
    sys.exit()

df = pd.DataFrame.from_dict(results)
data_id_list = df.Id
data_name_list = df.Name
date_content_length = df.ContentLength


def _product_key(product_name):
    """Reduce a product name to the key used to match it against the litter-row list.

    The key joins three fixed-position slices of the name: characters 0-10,
    characters 11-25 and characters 39-44. The same slicing is applied to the
    litter-row list and to the catalogue results, so only agreement between
    the two sides matters.
    NOTE: for a name such as
    'S2A_MSIL1C_20150827T095016_N0500_R079_T33SXC_20231010T060850' the last
    slice is '33SXC_' (tile code without its leading 'T', plus the following
    underscore); the indices are kept as they were so existing matches do not change.
    """
    product_type = product_name[0:11]
    date_str = product_name[11:26]
    tile = product_name[39:45]
    return product_type + '_' + date_str + '_' + tile


# Create the batch folder once, before any download starts.
os.makedirs(output_dir, exist_ok=True)

# Products with recorded litter rows. Read once: the list does not change while
# the loop runs, so there is no need to re-read the file for every product.
# adjust to your path if necessary:
litterrows = pd.read_csv('../files/s2_product_unique.csv')
samples_set = {_product_key(name) for name in litterrows['s2_product']}

for data_id, product_name, data_length in zip(
    data_id_list, data_name_list, date_content_length
):
    print(product_name)
    data_name = get_file_name(product_name)
    output_file = os.path.join(output_dir, data_name)
    key = _product_key(os.path.basename(output_file).replace('.zip', ''))

    # Check if the file has been downloaded before or it has no recorded windrows,
    # in either case, skip it and do not download it (again). If you have a partial or
    # corrupted download, you can delete the file and re-run the script.
    if os.path.exists(output_file) and os.path.getsize(output_file) == data_length:
        print(output_file + ' this file already exists')
    elif key not in samples_set:
        print(output_file + ' has no recorded litter rows')
    else:
        # A fresh token is requested for every download.
        access_token = get_access_token(CDSE_email, CDSE_password)
        download_data(access_token, data_id, data_name, data_length, output_file)

Query URL: https://catalogue.dataspace.copernicus.eu/odata/v1/Products?$filter=Collection/Name eq 'SENTINEL-2' and startswith(Name,'S2A_MSIL1C_') and contains(Name,'T33SXC') and ContentDate/Start gt 2015-08-14T00:00:00.000Z and ContentDate/Start lt 2020-12-29T00:00:00.000Z
S2A_MSIL1C_20150827T095016_N0500_R079_T33SXC_20231010T060850.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150827T095016_N0500_R079_T33SXC_20231010T060850.zip has no recorded litter rows
S2A_MSIL1C_20150824T094006_N0500_R036_T33SXC_20231010T050858.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150824T094006_N0500_R036_T33SXC_20231010T050858.zip has no recorded litter rows
S2A_MSIL1C_20150817T095016_N0500_R079_T33SXC_20231009T220107.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150817T095016_N0500_R079_T33SXC_20231009T220107.zip has no recorded litter rows
S2A_MSIL1C_20150814T094006_N0500_R036_T33SXC_20231010T034522.SAFE
Access token created successfully!
[ 13:23:06 ] Downloading: S2A_MSIL1C_20150814T094006_N0500_R036_T33SXC_20231010T034522.zip


S2A_MSIL1C_20150814T094006_N0500_R036_T33SXC_20231010T034522.zip: 100%|██████████| 685M/685M [02:01<00:00, 5.64MB/s] 


[ 13:25:08 ] Download complete: S2A_MSIL1C_20150814T094006_N0500_R036_T33SXC_20231010T034522.zip
S2A_MSIL1C_20150817T095016_N0500_R079_T33SXC_20231010T035733.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150817T095016_N0500_R079_T33SXC_20231010T035733.zip has no recorded litter rows
S2A_MSIL1C_20150824T094006_N0500_R036_T33SXC_20231010T072334.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150824T094006_N0500_R036_T33SXC_20231010T072334.zip has no recorded litter rows
S2A_MSIL1C_20150916T095016_N0500_R079_T33SXC_20231015T142340.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150916T095016_N0500_R079_T33SXC_20231015T142340.zip has no recorded litter rows
S2A_MSIL1C_20150923T094016_N0500_R036_T33SXC_20231028T021114.SAFE
Access token created successfully!
[ 13:25:10 ] Downloading: S2A_MSIL1C_20150923T094016_N0500_R036_T33SXC_20231028T021114.zip


S2A_MSIL1C_20150923T094016_N0500_R036_T33SXC_20231028T021114.zip: 100%|██████████| 635M/635M [02:37<00:00, 4.03MB/s] 


[ 13:27:47 ] Download complete: S2A_MSIL1C_20150923T094016_N0500_R036_T33SXC_20231028T021114.zip
S2A_MSIL1C_20150913T094016_N0500_R036_T33SXC_20231026T155405.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150913T094016_N0500_R036_T33SXC_20231026T155405.zip has no recorded litter rows
S2A_MSIL1C_20150903T094006_N0500_R036_T33SXC_20231024T104552.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20150903T094006_N0500_R036_T33SXC_20231024T104552.zip has no recorded litter rows
S2A_MSIL1C_20151112T094232_N0500_R036_T33SXC_20231009T031134.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20151112T094232_N0500_R036_T33SXC_20231009T031134.zip has no recorded litter rows
S2A_MSIL1C_20151003T094016_N0500_R036_T33SXC_20231009T152002.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20151003T094016_N0500_R036_T33SXC_20231009T152002.zip has no recorded litter rows
S2A_MSIL1C_20151125T095332_N0500_R079_T33SXC_20231020T055914.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20151125T095332_N0500_R079_T33SXC_20231020T055914.zip has no recorded litter rows
S2A_MSIL1C_20151006T095016_N

S2A_MSIL1C_20151115T095252_N0500_R079_T33SXC_20231021T074400.zip: 100%|██████████| 24.6M/24.6M [00:03<00:00, 6.55MB/s]


[ 13:27:53 ] Download complete: S2A_MSIL1C_20151115T095252_N0500_R079_T33SXC_20231021T074400.zip
S2A_MSIL1C_20151222T094412_N0500_R036_T33SXC_20231024T163647.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20151222T094412_N0500_R036_T33SXC_20231024T163647.zip has no recorded litter rows
S2A_MSIL1C_20151023T094042_N0500_R036_T33SXC_20231020T065655.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20151023T094042_N0500_R036_T33SXC_20231020T065655.zip has no recorded litter rows
S2A_MSIL1C_20160321T094012_N0500_R036_T33SXC_20231019T185057.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20160321T094012_N0500_R036_T33SXC_20231019T185057.zip has no recorded litter rows
S2A_MSIL1C_20160101T094412_N0500_R036_T33SXC_20231023T170817.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20160101T094412_N0500_R036_T33SXC_20231023T170817.zip has no recorded litter rows
S2A_MSIL1C_20160311T094002_N0500_R036_T33SXC_20231019T172957.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20160311T094002_N0500_R036_T33SXC_20231019T172957.zip has no recorded litter rows
S2A_MSIL1C_20160203T095222_N

S2A_MSIL1C_20161126T094332_N0500_R036_T33SXC_20231025T160709.zip: 100%|██████████| 665M/665M [01:37<00:00, 6.80MB/s] 


[ 13:29:33 ] Download complete: S2A_MSIL1C_20161126T094332_N0500_R036_T33SXC_20231025T160709.zip
S2A_MSIL1C_20161116T094252_N0500_R036_T33SXC_20231027T140107.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20161116T094252_N0500_R036_T33SXC_20231027T140107.zip has no recorded litter rows
S2A_MSIL1C_20161229T095402_N0500_R079_T33SXC_20231101T194508.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20161229T095402_N0500_R079_T33SXC_20231101T194508.zip has no recorded litter rows
S2A_MSIL1C_20161206T094402_N0500_R036_T33SXC_20230920T092229.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20161206T094402_N0500_R036_T33SXC_20230920T092229.zip has no recorded litter rows
S2A_MSIL1C_20161020T095022_N0500_R079_T33SXC_20231016T050322.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20161020T095022_N0500_R079_T33SXC_20231016T050322.zip has no recorded litter rows
S2A_MSIL1C_20161119T095312_N0500_R079_T33SXC_20230924T093135.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20161119T095312_N0500_R079_T33SXC_20230924T093135.zip has no recorded litter rows
S2A_MSIL1C_20161216T094412_N

S2A_MSIL1C_20181027T094101_N0500_R036_T33SXC_20230817T053449.zip: 100%|██████████| 737M/737M [03:52<00:00, 3.17MB/s] 


[ 13:33:28 ] Download complete: S2A_MSIL1C_20181027T094101_N0500_R036_T33SXC_20230817T053449.zip
S2A_MSIL1C_20181119T095301_N0500_R079_T33SXC_20230726T230224.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181119T095301_N0500_R079_T33SXC_20230726T230224.zip has no recorded litter rows
S2A_MSIL1C_20181216T094401_N0500_R036_T33SXC_20230725T040248.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181216T094401_N0500_R036_T33SXC_20230725T040248.zip has no recorded litter rows
S2A_MSIL1C_20181126T094321_N0500_R036_T33SXC_20230818T095539.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181126T094321_N0500_R036_T33SXC_20230818T095539.zip has no recorded litter rows
S2A_MSIL1C_20181129T095341_N0500_R079_T33SXC_20230726T095942.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181129T095341_N0500_R079_T33SXC_20230726T095942.zip has no recorded litter rows
S2A_MSIL1C_20181206T094351_N0500_R036_T33SXC_20230725T214016.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181206T094351_N0500_R036_T33SXC_20230725T214016.zip has no recorded litter rows
S2A_MSIL1C_20181219T095411_N

S2A_MSIL1C_20181007T094031_N0500_R036_T33SXC_20230727T233522.zip: 100%|██████████| 604M/604M [01:44<00:00, 5.80MB/s] 


[ 13:35:13 ] Download complete: S2A_MSIL1C_20181007T094031_N0500_R036_T33SXC_20230727T233522.zip
S2A_MSIL1C_20181020T095031_N0500_R079_T33SXC_20230728T031003.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181020T095031_N0500_R079_T33SXC_20230728T031003.zip has no recorded litter rows
S2A_MSIL1C_20181116T094251_N0500_R036_T33SXC_20230727T013941.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181116T094251_N0500_R036_T33SXC_20230727T013941.zip has no recorded litter rows
S2A_MSIL1C_20181106T094201_N0500_R036_T33SXC_20230727T210154.SAFE
Access token created successfully!
[ 13:35:15 ] Downloading: S2A_MSIL1C_20181106T094201_N0500_R036_T33SXC_20230727T210154.zip


S2A_MSIL1C_20181106T094201_N0500_R036_T33SXC_20230727T210154.zip: 100%|██████████| 649M/649M [01:45<00:00, 6.16MB/s] 


[ 13:37:00 ] Download complete: S2A_MSIL1C_20181106T094201_N0500_R036_T33SXC_20230727T210154.zip
S2A_MSIL1C_20181010T095031_N0500_R079_T33SXC_20230729T115020.SAFE
Access token created successfully!
[ 13:37:02 ] Downloading: S2A_MSIL1C_20181010T095031_N0500_R079_T33SXC_20230729T115020.zip


S2A_MSIL1C_20181010T095031_N0500_R079_T33SXC_20230729T115020.zip: 100%|██████████| 27.5M/27.5M [00:25<00:00, 1.08MB/s]


[ 13:37:28 ] Download complete: S2A_MSIL1C_20181010T095031_N0500_R079_T33SXC_20230729T115020.zip
S2A_MSIL1C_20181017T094011_N0500_R036_T33SXC_20230729T163213.SAFE
Access token created successfully!
[ 13:37:29 ] Downloading: S2A_MSIL1C_20181017T094011_N0500_R036_T33SXC_20230729T163213.zip


S2A_MSIL1C_20181017T094011_N0500_R036_T33SXC_20230729T163213.zip: 100%|██████████| 719M/719M [03:42<00:00, 3.23MB/s] 


[ 13:41:12 ] Download complete: S2A_MSIL1C_20181017T094011_N0500_R036_T33SXC_20230729T163213.zip
S2A_MSIL1C_20181030T095121_N0500_R079_T33SXC_20230729T213514.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181030T095121_N0500_R079_T33SXC_20230729T213514.zip has no recorded litter rows
S2A_MSIL1C_20181109T095221_N0500_R079_T33SXC_20230813T172922.SAFE
Access token created successfully!
[ 13:41:13 ] Downloading: S2A_MSIL1C_20181109T095221_N0500_R079_T33SXC_20230813T172922.zip


S2A_MSIL1C_20181109T095221_N0500_R079_T33SXC_20230813T172922.zip: 100%|██████████| 25.1M/25.1M [00:03<00:00, 6.63MB/s]


[ 13:41:17 ] Download complete: S2A_MSIL1C_20181109T095221_N0500_R079_T33SXC_20230813T172922.zip
S2A_MSIL1C_20181226T094411_N0500_R036_T33SXC_20230724T212943.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181226T094411_N0500_R036_T33SXC_20230724T212943.zip has no recorded litter rows
S2A_MSIL1C_20181229T095411_N0500_R079_T33SXC_20230814T111000.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20181229T095411_N0500_R079_T33SXC_20230814T111000.zip has no recorded litter rows
S2A_MSIL1C_20190329T095031_N0500_R079_T33SXC_20221115T061029.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20190329T095031_N0500_R079_T33SXC_20221115T061029.zip has no recorded litter rows
S2A_MSIL1C_20190326T094031_N0500_R036_T33SXC_20221115T094205.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20190326T094031_N0500_R036_T33SXC_20221115T094205.zip has no recorded litter rows
S2A_MSIL1C_20190319T095031_N0500_R079_T33SXC_20221114T092815.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20190319T095031_N0500_R079_T33SXC_20221114T092815.zip has no recorded litter rows
S2A_MSIL1C_20190316T094031_N

S2A_MSIL1C_20200110T094351_N0500_R036_T33SXC_20230603T093842.zip: 100%|██████████| 633M/633M [02:08<00:00, 4.92MB/s] 


[ 13:43:27 ] Download complete: S2A_MSIL1C_20200110T094351_N0500_R036_T33SXC_20230603T093842.zip
S2A_MSIL1C_20200303T095031_N0500_R079_T33SXC_20230608T023359.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200303T095031_N0500_R079_T33SXC_20230608T023359.zip has no recorded litter rows
S2A_MSIL1C_20200103T095401_N0500_R079_T33SXC_20230611T001337.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200103T095401_N0500_R079_T33SXC_20230611T001337.zip has no recorded litter rows
S2A_MSIL1C_20200310T094031_N0500_R036_T33SXC_20230619T230821.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200310T094031_N0500_R036_T33SXC_20230619T230821.zip has no recorded litter rows
S2A_MSIL1C_20200323T095031_N0500_R079_T33SXC_20230408T100540.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200323T095031_N0500_R079_T33SXC_20230408T100540.zip has no recorded litter rows
S2A_MSIL1C_20200229T094031_N0500_R036_T33SXC_20230629T045956.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200229T094031_N0500_R036_T33SXC_20230629T045956.zip has no recorded litter rows
S2A_MSIL1C_20200219T094031_N

S2A_MSIL1C_20200409T094031_N0500_R036_T33SXC_20230426T035222.zip: 100%|██████████| 663M/663M [01:37<00:00, 6.77MB/s] 


[ 13:45:07 ] Download complete: S2A_MSIL1C_20200409T094031_N0500_R036_T33SXC_20230426T035222.zip
S2A_MSIL1C_20200608T094041_N0500_R036_T33SXC_20230618T024628.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200608T094041_N0500_R036_T33SXC_20230618T024628.zip has no recorded litter rows
S2A_MSIL1C_20200512T095031_N0500_R079_T33SXC_20230507T182306.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200512T095031_N0500_R079_T33SXC_20230507T182306.zip has no recorded litter rows
S2A_MSIL1C_20200519T094041_N0500_R036_T33SXC_20230618T162419.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200519T094041_N0500_R036_T33SXC_20230618T162419.zip has no recorded litter rows
S2A_MSIL1C_20200502T095031_N0500_R079_T33SXC_20230429T045445.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200502T095031_N0500_R079_T33SXC_20230429T045445.zip has no recorded litter rows
S2A_MSIL1C_20200422T095031_N0500_R079_T33SXC_20230521T093754.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20200422T095031_N0500_R079_T33SXC_20230521T093754.zip has no recorded litter rows
S2A_MSIL1C_20200522T095041_N

S2A_MSIL1C_20201228T095421_N0500_R079_T33SXC_20230401T085523.zip: 100%|██████████| 23.6M/23.6M [00:03<00:00, 6.87MB/s]

[ 13:45:13 ] Download complete: S2A_MSIL1C_20201228T095421_N0500_R079_T33SXC_20230401T085523.zip
S2A_MSIL1C_20201208T095401_N0500_R079_T33SXC_20230403T085257.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20201208T095401_N0500_R079_T33SXC_20230403T085257.zip has no recorded litter rows
S2A_MSIL1C_20201125T094331_N0500_R036_T33SXC_20230629T110157.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20201125T094331_N0500_R036_T33SXC_20230629T110157.zip has no recorded litter rows
S2A_MSIL1C_20201205T094401_N0500_R036_T33SXC_20230303T122630.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20201205T094401_N0500_R036_T33SXC_20230303T122630.zip has no recorded litter rows
S2A_MSIL1C_20201215T094411_N0500_R036_T33SXC_20230304T194412.SAFE
./SAFE/CALAB_2A/S2A_MSIL1C_20201215T094411_N0500_R036_T33SXC_20230304T194412.zip has no recorded litter rows





# Upload your batch to a new kaggle dataset

## edit here:

In [42]:
## Amend your project root to point to /notebooks
## or your current directory where .kaggle/ and SAFE/ folders
## must also be located
## NOTE: this is an absolute, machine-specific path — replace it with your own.

project_root = "/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/notebooks"
# Folder whose sub-folders are uploaded to Kaggle.
dataset_main = Path(project_root) / "dataset_main"
# Kaggle API credentials file and the folder that contains it.
kaggle_json_path = Path(project_root) / ".kaggle/kaggle.json"
kaggle_config_dir = kaggle_json_path.parent

In [None]:
# Download folders of the batches being processed; the final report cell writes
# a processing_report.txt into each of them.
# NOTE(review): the upload cell reads from dataset_main/<batch>, not from these SAFE/
# folders, and no visible cell copies files between the two — confirm how the
# archives get into dataset_main.
batch_folders = [
    Path(project_root) / "SAFE/CALAB_2A"  # add multiple if multiple batches
]
dataset_title = "Litter Rows Batch 4" # change only the number to appropriate batch #
dataset_id = "sarahajbane/litter-windrows-batch-4" # change only the number to appropriate batch #
license_name = "CC-BY-SA-4.0"

# Point the Kaggle tooling at the project-local credentials folder
# (presumably read by the kaggle CLI via KAGGLE_CONFIG_DIR — see the Kaggle API docs).
os.environ['KAGGLE_CONFIG_DIR'] = str(kaggle_config_dir)
# Restrict kaggle.json to owner read/write; this fails if the file does not exist.
os.chmod(kaggle_json_path, 0o600)

# Make sure the upload staging folder exists.
dataset_main.mkdir(parents=True, exist_ok=True)

In [None]:
## LEAVE AS IS
# Add the project root to the import search path (once) and echo it for verification.
if project_root not in sys.path:
    sys.path.append(project_root)
print(project_root)

# Function to create or update a Kaggle dataset from a local folder
def create_kaggle_dataset_from_folder(
    folder_path,
    title,
    dataset_id,
    description="Sentinel-2 L1C subset",
    license_name="CC-BY-SA-4.0"
):
    """Write ``dataset-metadata.json`` into a folder and upload it with the kaggle CLI.

    Every file in ``folder_path`` matching ``*.zip*`` is listed as a resource.
    The metadata file is (re)written on every call, then
    ``kaggle datasets create`` is run; if that command exits with an error
    (for example because the dataset already exists),
    ``kaggle datasets version`` is run instead to publish a new version.

    Args:
        folder_path: folder holding the archives to upload (str or Path).
        title: dataset title shown on Kaggle.
        dataset_id: dataset identifier written to the metadata ``id`` field
            (e.g. ``"sarahajbane/litter-windrows-batch-4"``).
        description: dataset description written to the metadata.
        license_name: license identifier written to the metadata. The license
            title and URL in the metadata are fixed to CC BY-SA 4.0.

    Raises:
        AssertionError: if ``folder_path`` does not exist.

    Notes:
        A failing ``version`` fallback is reported on stdout and not raised.
    """
    folder_path = Path(folder_path)
    assert folder_path.exists(), "Folder does not exist!"

    metadata_path = folder_path / "dataset-metadata.json"
    # Sorted so the generated metadata is identical from run to run.
    safe_files = sorted(f.name for f in folder_path.glob("*.zip*"))

    resources = [
        {
            "name": Path(zipf).stem,
            "path": zipf,
            "description": f"Zipped .SAFE Sentinel-2: {zipf}",
            "type": "other",
            "format": "zip"
        } for zipf in safe_files
    ]

    metadata = {
        "title": title,
        "id": dataset_id,
        "description": description,
        "licenses": [{
            "name": license_name,
            "title": "Creative Commons Attribution Share-Alike 4.0",
            "path": "https://creativecommons.org/licenses/by-sa/4.0/"
        }],
        "resources": resources
    }

    # Writing the metadata here makes a separate `kaggle datasets init` unnecessary.
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    # Create the dataset, or add a new version if creation is rejected.
    try:
        subprocess.run([
            "kaggle", "datasets", "create",
            "-p", str(folder_path),
            "--dir-mode", "zip"
        ], check=True)
    except subprocess.CalledProcessError:
        result = subprocess.run([
            "kaggle", "datasets", "version",
            "-p", str(folder_path),
            "-m", "Update data",
            "--dir-mode", "zip"
        ])
        if result.returncode != 0:
            print(
                "Kaggle upload failed: 'kaggle datasets version' exited with "
                f"code {result.returncode}"
            )


In [None]:
# Run your function

# Folder inside dataset_main that holds the archives of the batch to upload.
folder_path = str(dataset_main / "CALAB_2A")  ### CHANGE FOLDERNAME HERE

# Pass the license from the settings cell explicitly, so editing `license_name`
# there takes effect instead of silently using the function's default.
create_kaggle_dataset_from_folder(
    folder_path=folder_path,
    title=dataset_title,
    dataset_id=dataset_id,
    license_name=license_name,
)

/Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/notebooks
Starting upload for file S2A_MSIL1C_20200409T094031_N0500_R036_T33SXC_20230426T035222.zip


100%|██████████| 633M/633M [04:19<00:00, 2.56MB/s] 


Upload successful: S2A_MSIL1C_20200409T094031_N0500_R036_T33SXC_20230426T035222.zip (633MB)
Starting upload for file S2A_MSIL1C_20150814T094006_N0500_R036_T33SXC_20231010T034522.zip


100%|██████████| 654M/654M [04:32<00:00, 2.52MB/s] 


Upload successful: S2A_MSIL1C_20150814T094006_N0500_R036_T33SXC_20231010T034522.zip (654MB)
Starting upload for file S2A_MSIL1C_20181027T094101_N0500_R036_T33SXC_20230817T053449.zip


100%|██████████| 703M/703M [04:53<00:00, 2.51MB/s] 


Upload successful: S2A_MSIL1C_20181027T094101_N0500_R036_T33SXC_20230817T053449.zip (703MB)
Starting upload for file S2A_MSIL1C_20201228T095421_N0500_R079_T33SXC_20230401T085523.zip


100%|██████████| 22.5M/22.5M [00:11<00:00, 2.13MB/s]


Upload successful: S2A_MSIL1C_20201228T095421_N0500_R079_T33SXC_20230401T085523.zip (22MB)
Starting upload for file S2A_MSIL1C_20200110T094351_N0500_R036_T33SXC_20230603T093842.zip


100%|██████████| 604M/604M [04:07<00:00, 2.56MB/s] 


Upload successful: S2A_MSIL1C_20200110T094351_N0500_R036_T33SXC_20230603T093842.zip (604MB)
Starting upload for file S2A_MSIL1C_20181007T094031_N0500_R036_T33SXC_20230727T233522.zip


100%|██████████| 576M/576M [03:56<00:00, 2.55MB/s] 


Upload successful: S2A_MSIL1C_20181007T094031_N0500_R036_T33SXC_20230727T233522.zip (576MB)
Starting upload for file S2A_MSIL1C_20150923T094016_N0500_R036_T33SXC_20231028T021114.zip


100%|██████████| 605M/605M [04:09<00:00, 2.55MB/s] 


Upload successful: S2A_MSIL1C_20150923T094016_N0500_R036_T33SXC_20231028T021114.zip (605MB)
Starting upload for file S2A_MSIL1C_20181010T095031_N0500_R079_T33SXC_20230729T115020.zip


100%|██████████| 26.2M/26.2M [00:12<00:00, 2.26MB/s]


Upload successful: S2A_MSIL1C_20181010T095031_N0500_R079_T33SXC_20230729T115020.zip (26MB)
Starting upload for file S2A_MSIL1C_20181017T094011_N0500_R036_T33SXC_20230729T163213.zip


100%|██████████| 686M/686M [04:41<00:00, 2.55MB/s] 


Upload successful: S2A_MSIL1C_20181017T094011_N0500_R036_T33SXC_20230729T163213.zip (686MB)
Starting upload for file S2A_MSIL1C_20161126T094332_N0500_R036_T33SXC_20231025T160709.zip


100%|██████████| 635M/635M [04:21<00:00, 2.55MB/s] 


Upload successful: S2A_MSIL1C_20161126T094332_N0500_R036_T33SXC_20231025T160709.zip (635MB)
Starting upload for file S2A_MSIL1C_20181109T095221_N0500_R079_T33SXC_20230813T172922.zip


100%|██████████| 24.0M/24.0M [00:11<00:00, 2.22MB/s]


Upload successful: S2A_MSIL1C_20181109T095221_N0500_R079_T33SXC_20230813T172922.zip (24MB)
Starting upload for file S2A_MSIL1C_20181106T094201_N0500_R036_T33SXC_20230727T210154.zip


100%|██████████| 619M/619M [04:15<00:00, 2.55MB/s] 


Upload successful: S2A_MSIL1C_20181106T094201_N0500_R036_T33SXC_20230727T210154.zip (619MB)
Starting upload for file S2A_MSIL1C_20151115T095252_N0500_R079_T33SXC_20231021T074400.zip


100%|██████████| 23.5M/23.5M [00:10<00:00, 2.24MB/s]


Upload successful: S2A_MSIL1C_20151115T095252_N0500_R079_T33SXC_20231021T074400.zip (23MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/sarahajbane/litter-windrows-batch-calA


 Once the upload has completed, confirm it by opening the Kaggle dataset link printed above. Then run the final code block, update the notebook and push your changes to GitHub, or let us know on Slack that you have finished the upload.

In [57]:
# Write a short report into every batch folder once its upload has been confirmed.
for folder_path in batch_folders:
    file_path = os.path.join(folder_path, 'processing_report.txt')
    # Count only the .zip archives: a report left by an earlier run of this cell,
    # or any other stray file in the folder, must not inflate the tile count.
    tile_count = sum(
        1
        for entry in os.listdir(folder_path)
        if entry.endswith('.zip') and os.path.isfile(os.path.join(folder_path, entry))
    )

    with open(file_path, 'w') as file:
        file.write(f'Upload finished, {tile_count} tiles processed and uploaded to {dataset_id}')
    print(f'Created report in {file_path}')

print("All tiles uploaded successfully!")

Created report in /Users/sara_mac/Desktop/projects/plastic_detection/Sentinel2PlasticDetectionProject/task2-data-collection/notebooks/SAFE/CALAB_2A/processing_report.txt
All tiles uploaded successfully!


# Finished! 

In [None]:
# Optional clean-up helper: removes local files once they are no longer needed
def delete_all_files_in_directory(directory_path):
    """Delete every file located directly inside ``directory_path``.

    Sub-directories and their contents are left alone. One "Deleted: <path>"
    line is printed per removed file.
    """
    for entry_name in os.listdir(directory_path):
        candidate = os.path.join(directory_path, entry_name)
        if not os.path.isfile(candidate):
            continue  # leave sub-directories in place
        os.remove(candidate)
        print(f"Deleted: {candidate}")

# This will delete all files in the specified dataset_main subfolder
# run this only if you have finished your upload and no longer want the files stored locally
# if you want to delete them from the SAFE folder as well, point directory_path
# at that batch folder instead (for example an entry of batch_folders)
# NOTE(review): the earlier wording referred to `src_folder`, which is not defined in the visible cells.
# NOTE(review): "CORSI_2B" differs from the "CALAB_2A" folder used in the upload cell —
# confirm the folder name before running.
# Proceed with caution

directory_path = Path(dataset_main) / "CORSI_2B"
delete_all_files_in_directory(directory_path)
