# Convert patchlets from `EOPatch`es to `.npz` files

The deep learning model is trained by lazily loading small `.npz` files, which optimises disk I/O and memory usage when the training data is too large to fit into memory.

This script converts the patchlets stored as `EOPatch`es into `.npz` files, splitting them into chunks. Each `.npz` file holds up to `chunksize` samples.

In [None]:
import os
from abc import abstractmethod
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime, timedelta
from typing import Any, Callable, List

import boto3
import dateutil
import fs
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
from eolearn.core import FeatureType, EOPatch, EOTask, EOWorkflow, SaveTask, OverwritePermission, EOExecutor, FeatureTypeSet
from fs.osfs import OSFS
from fs_s3fs import S3FS
from s2cloudless import S2PixelCloudDetector
from sentinelhub import CRS, BBox
from tqdm import tqdm_notebook as tqdm

In [None]:
def multiprocess(process_fun: Callable, arguments: List[Any], max_workers: int = 4) -> List[Any]:
    """
    Run ``process_fun`` over ``arguments`` in a process pool, showing a tqdm progress bar.

    Parameters
    ----------
    process_fun: A function that processes a single item. It is executed in worker
        processes, so it (and each argument) must be picklable.
    arguments: Arguments with which the function is called, one call per element.
        Must support ``len()`` because the length is used as the progress-bar total.
    max_workers: Max workers for the process pool executor.

    Returns
    -------
    A list with one result per argument, in the same order as ``arguments``.
    """
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order; tqdm only wraps the
        # iterator so progress advances as each result is consumed.
        outcomes = executor.map(process_fun, arguments)
        return list(tqdm(outcomes, total=len(arguments)))

### Define filesystem and eopatches location 

In [None]:
# Remote S3 filesystem the patchlets are read from; fill in the bucket name
# and the credentials before running the cells below.
filesystem = S3FS(
    "bucket-name",
    aws_access_key_id="",
    aws_secret_access_key="",
    region="eu-central-1",
)

In [None]:
# Folder (relative to the root of `filesystem`) whose entries are the patchlets to convert.
PATCHLETS_FOLDER = "data/Lithuania/patchlets/2019"

In [None]:
# Full path of every entry found in PATCHLETS_FOLDER. PyFilesystem paths always
# use "/" as separator regardless of the host OS, so they are joined with
# fs.path.join rather than os.path.join (which would emit "\" on Windows).
PATCHLET_PATHS = [
    fs.path.join(PATCHLETS_FOLDER, patchlet_name)
    for patchlet_name in filesystem.listdir(PATCHLETS_FOLDER)
]

In [None]:
def generate_numpy(pp):
    """
    Load one patchlet and return its training arrays, one entry per timestamp.

    Parameters
    ----------
    pp: Path of the patchlet ``EOPatch`` inside the module-level ``filesystem``.

    Returns
    -------
    A 6-tuple ``(X_data, X_boundary, X_extent, X_distance, timestamps, eop_names)``:
    the ``BANDS`` data feature, the timeless ``BOUNDARY``, ``EXTENT`` and ``DISTANCE``
    masks each repeated along a new leading axis once per timestamp, the patchlet
    timestamps, and ``pp`` repeated once per timestamp.
    If anything goes wrong while loading or reading the patchlet, the error is
    printed and a tuple of six ``None`` values is returned instead, so one bad
    patchlet does not abort the whole conversion.
    """
    try:
        eop = EOPatch.load(pp, filesystem=filesystem, lazy_loading=True)
        n_timestamps = len(eop.timestamp)

        def _repeat_per_timestamp(feature_name):
            # Timeless masks have no time axis; add one and tile it so every
            # timestamp gets its own copy of the mask.
            mask = eop.mask_timeless[feature_name]
            return np.repeat(mask[np.newaxis, ...], n_timestamps, axis=0)

        X_data = eop.data['BANDS']
        X_boundary = _repeat_per_timestamp('BOUNDARY')
        X_extent = _repeat_per_timestamp('EXTENT')
        X_distance = _repeat_per_timestamp('DISTANCE')
        timestamps = eop.timestamp
        eop_names = np.repeat([pp], n_timestamps, axis=0)
    except Exception as exc:
        # Deliberately best-effort, but do not swallow KeyboardInterrupt/SystemExit
        # (a bare ``except:`` would) and report why the patchlet was skipped.
        print(f"Could not create for {pp}: {exc!r}")
        return None, None, None, None, None, None
    return X_data, X_boundary, X_extent, X_distance, timestamps, eop_names

In [None]:
# Convert every patchlet in parallel; each entry of `results` is the 6-tuple
# returned by generate_numpy for the corresponding path.
results = multiprocess(
    process_fun=generate_numpy,
    arguments=PATCHLET_PATHS,
    max_workers=25,
)

In [None]:
# Stack each of the six output columns across all patchlets, skipping the
# None placeholders that generate_numpy returns for patchlets it could not read.
X, y_boundary, y_extent, y_distance, timestamps, eop_names = (
    np.concatenate([outputs[column] for outputs in results if outputs[column] is not None])
    for column in range(6)
)

In [None]:
# NOTE(review): this list is never filled in this cell; the half-written
# `chunk_info` bookkeeping that used to sit in the loop did not parse and
# was removed. The per-chunk metadata is rebuilt from the saved files later.
chunk_info_dfs = []

chunksize = 2000

# np.savez does not create missing directories, so make sure the local
# output folder exists before writing the first chunk.
os.makedirs('arrays1', exist_ok=True)

# Write consecutive slices of `chunksize` samples, one .npz file per slice
# (np.savez appends the ".npz" extension to the given name).
for idx, i in enumerate(range(0, len(X), chunksize)):
    filename = f'patchlets_field_delineation_{idx}'
    chunk = slice(i, i + chunksize)

    np.savez(f'arrays1/{filename}',
             X=X[chunk],
             y_boundary=y_boundary[chunk],
             y_extent=y_extent[chunk],
             y_distance=y_distance[chunk],
             timestamps=timestamps[chunk],
             eopatches=eop_names[chunk])

In [None]:
def copy_to_s3(src_folder, dest_folder, src_fs=None, dst_fs=None):
    """
    Copy every file whose name starts with ``patchlets`` from one filesystem folder to another.

    Parameters
    ----------
    src_folder: Folder on the source filesystem that is listed for files to copy.
    dest_folder: Folder on the destination filesystem that receives the files
        under the same names.
    src_fs: Source PyFilesystem object. When omitted, the module-level
        ``src_filesystem`` is used; that name is not defined in this notebook
        as shown, so pass the filesystem explicitly unless it is created elsewhere.
    dst_fs: Destination PyFilesystem object. Defaults to the module-level ``filesystem``.
    """
    # Fall back to the notebook-level filesystems only when none are supplied,
    # which keeps the original two-argument call working unchanged.
    src_fs = src_filesystem if src_fs is None else src_fs
    dst_fs = filesystem if dst_fs is None else dst_fs

    for arr in src_fs.listdir(src_folder):
        if not arr.startswith('patchlets'):
            continue
        # PyFilesystem paths always use "/", so join with fs.path, not os.path.
        fs.copy.copy_file(src_fs=src_fs,
                          src_path=fs.path.join(src_folder, arr),
                          dst_fs=dst_fs,
                          dst_path=fs.path.join(dest_folder, arr))

# NPZ patchlet info dataframe

In [None]:
dfs = []

NPZ_PATCHLET_FOLDER = '' # Location of the NPZ files

# Build one metadata table per .npz chunk: which patchlet/timestamp each row
# of the chunk came from.
# NOTE(review): only the first six directory entries are inspected because of
# the [:6] slice — confirm whether this limit is intentional.
for filename in filesystem.listdir(NPZ_PATCHLET_FOLDER)[:6]:
    if not filename.startswith('patchlet'):
        continue

    # Read what is needed inside the context managers so the remote handle
    # is closed right away instead of being leaked once per file.
    # NOTE: allow_pickle=True unpickles object arrays; only load trusted files.
    with filesystem.openbin(f'{NPZ_PATCHLET_FOLDER}/{filename}') as npz_file:
        with np.load(npz_file, allow_pickle=True) as npz:
            patchlet_names = [os.path.basename(x) for x in npz['eopatches']]
            chunk_timestamps = npz['timestamps']

    # The patchlet name is split on "_": the first part is stored as the
    # eopatch name and the second part as the chunk position.
    dfs.append(pd.DataFrame({'chunk': filename,
                             'eopatch': [name.split('_')[0] for name in patchlet_names],
                             'patchlet': patchlet_names,
                             'chunk_pos': [name.split('_')[1] for name in patchlet_names],
                             'timestamp': chunk_timestamps}))

df = pd.concat(dfs)
df.to_csv('data/Lithuania/patchlets_meta/patchlet_eopatch.csv')