## Importing Libraries

In [None]:
import fiona
import pandas as pd
import geopandas as gpd
import rasterio
import rasterio.features
import zarr
import xarray as xr
import os
import numpy as np
from tqdm import tqdm
from datetime import datetime, date
import urllib.request
import zipfile
import dask
import shutil

## Updating Dataset Attributes

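The function `attributes_update` updates the attributes of a dataset. It takes four parameters: `dataset`, `title`, `resolution`, and `zipurl`. It adds descriptive metadata to the `latitude` and `longitude` coordinates, records the CRS (EPSG:4326), the resolution, the geographic extent and the conversion history as global attributes, and returns the dataset.

```python
def attributes_update(dataset, title, resolution, zipurl):
    ...
```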

In [None]:
def attributes_update(dataset, title, resolution, zipurl):
    """Attach coordinate and global metadata to a gridded dataset.

    The dataset is modified in place and also returned.

    Parameters
    ----------
    dataset
        Object with ``latitude`` and ``longitude`` coordinates and an
        ``attrs`` mapping (an ``xarray.Dataset`` where this file calls it).
    title
        Name of the source geodatabase without extension; used in the
        ``History`` attribute.
    resolution
        Grid resolution stored as the ``resolution`` global attribute.
    zipurl
        URL the source archive was downloaded from; used in ``History``.

    Returns
    -------
    The same ``dataset`` object, with updated attributes.
    """

    def axis_attrs(values, axis_type, axis, name, units):
        # Compute the extent once instead of re-scanning the array per key.
        lowest = values.min()
        highest = values.max()
        span = highest - lowest
        count = values.shape[0]
        # N evenly spaced points are separated by N - 1 intervals; a single
        # point has no spacing, so its (zero) span is reported unchanged.
        step = span / (count - 1) if count > 1 else span
        return {
            '_CoordinateAxisType': axis_type,
            'axis': axis,
            'long_name': name,
            'max': highest,
            'min': lowest,
            'standard_name': name,
            'step': step,
            'units': units,
        }

    dataset.latitude.attrs.update(
        axis_attrs(dataset.latitude.values, 'Lat', 'Y', 'latitude', 'degrees_north')
    )
    dataset.longitude.attrs.update(
        axis_attrs(dataset.longitude.values, 'Lon', 'X', 'longitude', 'degrees_east')
    )

    # Set the CRS as an attribute
    dataset.attrs['proj:epsg'] = 4326
    dataset.attrs['resolution'] = resolution
    dataset.attrs.update({
        'geospatial_lat_min': dataset['latitude'].min().item(),
        'geospatial_lat_max': dataset['latitude'].max().item(),
        'geospatial_lon_min': dataset['longitude'].min().item(),
        'geospatial_lon_max': dataset['longitude'].max().item(),
    })
    # Record where the data comes from and when it was converted.
    dataset.attrs['History'] = f'Zarr dataset converted from {title}.gdb, downloaded from {zipurl}, on {date.today()}'

    # Add any other attributes you think necessary to include in the
    # metadata of your zarr dataset here.

    return dataset

## Function: `gdf2zarrconverter`

This function rasterizes one attribute of a vector layer and writes the result to a Zarr store. It accepts six parameters: `file_path`, `native_var`, `title`, `layer`, `arco_asset_tmp_path`, and `zipurl`.

The function first defines two helper functions, `cleaner` and `encode_categorical`. `cleaner` is used to clean the data by replacing certain values with 'None'. `encode_categorical` is used to encode categorical data into numerical values.

The function then opens the layer at `file_path` and reads its bounds. Using a fixed resolution of 0.01 degrees, it computes the raster width and height and creates an empty raster of that size.

Next, the function iterates over the features in the source data. For each feature, it cleans the data and appends it to a list. It also appends the feature's geometry to a separate list.

The data is then encoded using the `encode_categorical` function. The geometries and encoded data are used to rasterize the data.

An xarray dataset is created from the raster, and the latitude is sorted. If there is a category mapping, it is saved as an attribute of the dataset.

Finally, the dataset is updated with the `attributes_update` function, saved as a Zarr array at `zarr_var_path`, and the path is returned.

```python
def gdf2zarrconverter(file_path, native_var, title, layer, arco_asset_tmp_path, zipurl):
    ...
```

    

In [None]:
def gdf2zarrconverter(file_path, native_var, title, layer, arco_asset_tmp_path, zipurl):
    """Rasterize one attribute of a vector layer and save it as a Zarr dataset.

    Parameters
    ----------
    file_path
        Path of the vector data source opened with ``fiona``.
    native_var
        Name of the feature property to rasterize.
    title
        Dataset title; used in the output file name and the metadata.
    layer
        Name of the layer inside ``file_path``.
    arco_asset_tmp_path
        Directory in which the ``{title}_{native_var}.zarr`` store is written.
    zipurl
        URL of the source archive, recorded in the dataset metadata.

    Returns
    -------
    str
        Path of the Zarr store that was written.

    Raises
    ------
    ValueError
        If the layer contains no feature with a geometry.
    """
    # Raw strings that stand for "no value" in the source attributes.
    null_tokens = {'0', ' ', 'nan', ''}

    def cleaner(value):
        """Replace placeholder strings with 'None'; return other values as is."""
        if isinstance(value, str) and value in null_tokens:
            return 'None'
        return value

    def is_missing(value):
        """Return True for None and for float NaN."""
        return value is None or (isinstance(value, float) and np.isnan(value))

    def encode_categorical(values):
        """Return ``(numeric_array, category_mapping)`` for a list of values.

        A column is treated as categorical when ANY value is a string, so a
        null in the first feature no longer sends a text column down the
        numeric path. 'None' is always code 1; other categories get codes
        from 2 upwards in sorted order. Numeric columns yield an empty
        mapping, with nulls encoded as NaN.
        """
        if any(isinstance(value, str) for value in values):
            labels = ['None' if is_missing(value) else cleaner(str(value)) for value in values]
            category_mapping = {'None': 1}
            for code, category in enumerate(sorted(set(labels) - {'None'}), start=2):
                category_mapping[category] = code
            encoded = np.array([category_mapping[label] for label in labels], dtype=np.float32)
        else:
            encoded = np.array(
                [np.nan if is_missing(value) else value for value in values],
                dtype=np.float32,
            )
            category_mapping = {}
        return encoded, category_mapping

    resolution = 0.01
    data = []
    geometries = []
    # Only reading needs the file handle; it is released before rasterizing.
    with fiona.open(file_path, 'r', layer=layer) as src:
        lon_min, lat_min, lon_max, lat_max = src.bounds
        with tqdm(total=len(src), desc=f"Processing features of {layer} - {native_var}") as pbar:
            for feature in src:
                geometry = feature['geometry']
                # A feature without geometry cannot be burned into the raster.
                if geometry is not None:
                    data.append(cleaner(feature['properties'][native_var]))
                    geometries.append(geometry)
                pbar.update()

    if not geometries:
        raise ValueError(f"No features with a geometry found in layer {layer!r}")

    width = int(np.ceil((lon_max - lon_min) / resolution))
    height = int(np.ceil((lat_max - lat_min) / resolution))
    raster_transform = rasterio.transform.from_bounds(lon_min, lat_min, lon_max, lat_max, width, height)
    raster = np.zeros((height, width), dtype=np.float32)

    encoded_data, category_mapping = encode_categorical(data)

    # The bar advances as each (geometry, value) pair is handed to rasterio.
    rasterio.features.rasterize(
        tqdm(zip(geometries, encoded_data), total=len(geometries), desc="Rasterizing"),
        out=raster,
        transform=raster_transform,
        merge_alg=rasterio.enums.MergeAlg.replace,
        dtype=np.float32,
    )

    # make xarray dataset, arrange latitude from max to min since rasterio makes rasters from top left to bottom right
    # NOTE(review): these coordinates run edge to edge across the bounds, not
    # through pixel centres -- confirm this labelling is intended.
    dataset = xr.Dataset(coords={'latitude':  np.round(np.linspace(lat_max, lat_min, height, dtype=float), decimals=4),
                                 'longitude': np.round(np.linspace(lon_min, lon_max, width, dtype=float), decimals=4)})
    dataset[native_var] = (['latitude', 'longitude'], raster)
    dataset = dataset.sortby('latitude')

    if category_mapping:
        # save the mapping dictionary with the variable attributes
        dataset[native_var].attrs['categorical_encoding'] = category_mapping

    dataset = attributes_update(dataset, title, resolution, zipurl)
    zarr_var_path = f"{arco_asset_tmp_path}/{title}_{native_var}.zarr"
    dataset.to_zarr(zarr_var_path, mode='w', consolidated=True)
    return zarr_var_path

## Downloading and Extracting a Zip File

This Python code snippet downloads a zip file from a specified URL, extracts it, and locates a specific geodatabase inside the extracted files.


In [None]:
# Download the zip file
zipurl = 'https://s3.waw3-1.cloudferro.com/emodnet/emodnet_native/emodnet_geology/seabed_substrate/multiscale_folk_5/EMODnet_GEO_Seabed_Substrate_All_Res.zip'
geodatabase = 'EMODnet_Seabed_Substrate_1M.gdb'
zip_file = os.path.basename(zipurl)


class TqdmUpTo(tqdm):
    """Progress bar whose ``update_to`` can serve as a ``urlretrieve`` reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar to ``b * bsize``; ``tsize`` (if given) sets the total."""
        if tsize is not None:
            self.total = tsize
        # update() takes an increment, so subtract what is already counted.
        self.update(b * bsize - self.n)


with TqdmUpTo(unit='B', unit_scale=True, miniters=1, desc=zip_file) as t:
    urllib.request.urlretrieve(zipurl, filename=zip_file, reporthook=t.update_to)

# Extract the archive contents
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall('extracted_files')

# Locate the requested geodatabase directory; stop at the first match.
gdb_path = None
for root, dirs, _files in os.walk('extracted_files'):
    if geodatabase.endswith('.gdb') and geodatabase in dirs:
        gdb_path = os.path.join(root, geodatabase)
        break

# Fail here with a clear message rather than with a NameError later on.
if gdb_path is None:
    raise FileNotFoundError(f"{geodatabase} was not found inside {zip_file}")


## Convert to Zarr

This Python code snippet goes through each layer and converts each variable of each layer into Zarr format using `gdf2zarrconverter`. Converting one variable at a time minimizes memory consumption during the conversion process. The per-variable datasets are then rechunked and combined into a single dataset, which is written to `<layer>.zarr`.

In [None]:
temp_zarr_path = 'converted_zarr_files'
title = os.path.splitext(os.path.basename(geodatabase))[0]

# Get the layers from the geodatabase
layers = fiona.listlayers(gdb_path)

# Holds the combined data of the most recently processed layer
combined_dataset = xr.Dataset()

# Process each layer and each variable using gdf2zarrconverter
for layer in layers:
    # The temporary directory is removed after each saved layer, so make
    # sure it exists again before converting the next one.
    os.makedirs(temp_zarr_path, exist_ok=True)

    # Get the variables from the layer; close the source once they are read
    with fiona.open(gdb_path, layer=layer) as layer_src:
        variables = list(layer_src.meta['schema']['properties'].keys())

    zarr_vars_paths = []
    for variable in variables:
        try:
            print(f"Processing {layer} - {variable}")
            zarr_var_path = gdf2zarrconverter(gdb_path, variable, title, layer, temp_zarr_path, zipurl)
            zarr_vars_paths.append(zarr_var_path)
        except Exception as e:
            # Best effort: a variable that cannot be converted is skipped.
            print(f"Failed to process {layer} - {variable}: {e}")

    # Start from scratch for every layer so that variables of a previous
    # layer (whose temporary stores are already deleted) are not carried over.
    layer_dataset = None
    with dask.config.set(scheduler='single-threaded'):
        for path in zarr_vars_paths:
            try:
                dataset = xr.open_dataset(path, chunks={})  # Use Dask to lazily load the dataset
                dataset = dataset.chunk({'latitude': 'auto', 'longitude': 'auto'})
                if layer_dataset is None:
                    # Seeding with a real dataset (not an empty one) keeps its
                    # global attributes, since merge copies attrs from the
                    # first object.
                    layer_dataset = dataset
                else:
                    layer_dataset = xr.merge([layer_dataset, dataset], compat='override', join='outer')
            except Exception as e:
                print(f"Failed to combine zarr dataset {path}: {e}")

    if layer_dataset is None:
        print(f"final zarr dataset did not save {layer}: no variables were converted")
        continue

    combined_dataset = layer_dataset

    # add applicable categorical encodings
    combined_dataset.attrs['categorical_encoding'] = {
        var: combined_dataset[var].attrs['categorical_encoding']
        for var in combined_dataset.variables
        if 'categorical_encoding' in combined_dataset[var].attrs
    }

    with dask.config.set(scheduler='single-threaded'):
        try:
            final_dataset = combined_dataset.chunk({'latitude': 'auto', 'longitude': 'auto'})
            zarr_path = f"{layer}.zarr"
            final_dataset.to_zarr(zarr_path, mode='w')
            # The per-variable stores are no longer needed once the layer is saved.
            shutil.rmtree(temp_zarr_path)
        except Exception as e:
            print(f"final zarr dataset did not save {layer}: {e}")

# Print the combined dataset
print(combined_dataset)