# File Transformations

In [None]:
import pandas as pd
import time
import dask.dataframe as dd
import dask.array as dsa
import zarr
import xarray as xr
import numpy as np
import intake
from contextlib import contextmanager

In [None]:
# Path to the credentials JSON file. It is handed to the writers below as
# storage_options={'token': token} and is only needed to write data into your
# bucket if the bucket does not have public write access.
token = '/path/to/your/token.json' 

# Bucket name/public URL that contains the data you would like to convert.
# Keep the trailing '/': file names are appended with plain string concatenation.
root = 'gs://your/bucket/'

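Note: The `name_function` only determines the file name of each partition; it does not make the output files sort in partition order. Therefore, when using this method to split up CSV files into partitions of the same (or a different) file type, make sure the naming function produces names that sort correctly, for example by zero-padding the partition index (`f"part_{i:05d}.parquet"`).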

In this instance, since these files will be used to measure read speed, the order that the files are concatenated by Dask when they are called into the timing program does not matter. If this method is being used for machine learning or data analysis, it might be a good idea to preserve the partition order.

## Timing Setup

We will be using the same diagnostic timer as seen in the `transferSpeeds.ipynb` notebook. In this case, it will keep track of how long the file conversion process takes. Note that some files will have to be loaded locally in order to be converted, given the limitations of the Python libraries used to facilitate the conversions.

In [None]:
class DiagnosticTimer:
    """Collect elapsed-time measurements for labelled blocks of code.

    Each completed ``with timer.time(...)`` block appends one record to
    ``self.diagnostics``; ``dataframe()`` turns the records into a table.
    """

    def __init__(self):
        # One dict per timed block: the caller's labels plus a "runtime" entry.
        self.diagnostics = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of a ``with`` block and record the result.

        Parameters
        ----------
        **kwargs
            Arbitrary labels (e.g. ``conversionType='csv2parquet'``) stored
            alongside the measurement. A label named ``runtime`` is
            overwritten by the measured value.

        Notes
        -----
        The elapsed time in seconds is stored under the ``"runtime"`` key.
        Nothing is recorded if the body raises, so failed runs do not end up
        in the results.
        """
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed (or go negative) if the system clock is adjusted mid-run.
        tic = time.perf_counter()
        yield
        toc = time.perf_counter()
        kwargs["runtime"] = toc - tic
        self.diagnostics.append(kwargs)

    def dataframe(self):
        """Return all recorded measurements as a DataFrame, one row per block."""
        return pd.DataFrame(self.diagnostics)


# Shared timer instance used by every conversion cell below.
diag_timer = DiagnosticTimer()

In [None]:
# Names to give the CSV columns. If the file has no header row and no names are
# supplied, Dask/Pandas will use your first line of data as the column names.
names=['lon', 'lat', 'z']

## CSV to Partitioned Parquets

In [None]:
def name_function(x):
    """Return the Parquet file name for partition index ``x``.

    The index is inserted as-is (no zero padding), so the resulting names do
    not sort lexicographically in partition order (``..._10`` sorts before
    ``..._2``).
    """
    return f"ETOPO1_Ice_g_gmt4_{x}.parquet"

# Time the CSV read together with the partitioned Parquet write.
with diag_timer.time(conversionType='csv2partparqet'):
    # The source file has no header row, so column names are supplied explicitly.
    df = dd.read_csv(
        root + 'ETOPO1_Ice_g_gmt4.csv',
        names=names,
        header=None,
        assume_missing=True,
    )
    # File names inside the output directory come from name_function.
    df.to_parquet(
        root + 'parquetpartitions',
        storage_options={'token': token},
        name_function=name_function,
    )
del df

## CSV to One Parquet File

Note that this method requires both the CSV and the output Parquet file to be stored on a local disk. You cannot read and write directly from cloud storage using Pandas. The time this cell takes to execute is not representative of the total time it would take to copy the file from cloud storage onto a local disk, convert the file, and move it back onto cloud storage.

In [None]:
# Time a purely local pandas conversion of the whole CSV into one Parquet file.
with diag_timer.time(conversionType='csv2parquet_local'):
    # Replace both paths with your own local file paths. Ensure that an
    # appropriate Parquet engine is installed within your environment.
    df = pd.read_csv(
        '/local/file/path/ETOPO1_Ice_g_gmt4.csv',
        names=names,
        header=None,
    )
    df.to_parquet(
        '/local/file/path/ETOPO1_Ice_g_gmt4.parquet',
        engine='fastparquet',
    )
del df

## CSV to Partitioned CSVs

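The `header_first_partition_only=True` argument is very important in this instance; otherwise your header line will be written to each output file. To keep only your data, also make sure that no header options are included in the `dd.read_csv(...)` line, including `header=None` and `names=[...]`. Because the source file has no header row, Dask then treats the first data line as the header and writes it back exactly once, at the top of the first partition, so no extra line is added to the data.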

In [None]:
def name_function(i):
    """Build the output CSV file name for partition number ``i``."""
    return f"ETOPO1_Ice_g_gmt4_{i!s}.csv"

# Time the CSV read together with the partitioned CSV write.
with diag_timer.time(conversionType='csv2partcsv'):
    # No header/names options here on purpose: the first data line is read as
    # the header and written back only once (see header_first_partition_only).
    df = dd.read_csv(root + 'ETOPO1_Ice_g_gmt4.csv', assume_missing=True)
    df.to_csv(
        root + 'csvpartitions',
        header_first_partition_only=True,
        storage_options={'token': token},
        name_function=name_function,
    )
del df

## NetCDF to Zarr

For multi-variable gridded data, you will need to create `n` Zarr arrays for `n` variables, whereas a Zarr Group can incorporate all variables in a single parent directory in object storage using `xarray.Dataset.to_zarr(...)`.

In [None]:
# Lists all data variables contained in the data set, so that one can be
# chosen for conversion in the next cell.
(
    intake.open_netcdf(root + 'ETOPO1_Ice_g_gmt4.nc')
    .to_dask()
    .data_vars
)

In [None]:
# Choose the data variable to convert into a Zarr Array & Group. If you have
# multiple data variables, they need to be chunked separately and put into the
# same DataSet to be converted into a Zarr Group.
variable = 'Z1'

# variable2 = ['Your Variable Name']  Add as many variables as your data set contains

### Zarr Group

This approach uses Xarray to store the contents of the NetCDF file within a Zarr group. Note that there is no method of retrieving the NetCDF file directly from cloud storage. Writing consolidated metadata is recommended for maximum read speedup.

Selecting the correct chunk sizes is very important for efficient read speed, and I have found that preserving the internal chunk sizes of the original NetCDF4 data allows for the best data retrieval speeds when the data is accessed from Zarr formats.

In [None]:
# Time opening the NetCDF file, re-chunking one variable and writing a Zarr group.
with diag_timer.time(conversionType='netcdf2zgroup'):
    ds = intake.open_netcdf(root + 'ETOPO1_Ice_g_gmt4.nc').to_dask()
    da = ds[variable]
    # da2 = ds[variable2]  Can be increased to N variables

    # Chunk sizes recorded in the variable's encoding and its dimension names.
    # Both names are left defined after this cell (they are not deleted below).
    internal_chunks = da.encoding['chunksizes']
    coords = da.dims

    # Re-chunk so every dimension keeps the chunk size read from the encoding.
    da = da.chunk({dim: size for dim, size in zip(coords, internal_chunks)})

    # Will need to be altered if you have multiple variables
    ds = da.to_dataset()
    ds.to_zarr(
        root + 'ETOPO1_Ice_g_gmt4.zarr',
        consolidated=True,
        storage_options={'token': token},
    )
del ds, da

### Zarr Array

When converting from an xarray `Dataset` to an xarray `DataArray`, note that you can only select one variable at a time. If you are working with data that has multiple data variables, you must convert each variable to a separate array. So, for `n` data variables you must execute the cell below `n` times.

In [None]:
# Time opening the NetCDF file, re-chunking one variable and writing a Zarr array.
with diag_timer.time(conversionType='netcdf2zarray'):
    ds = intake.open_netcdf(root + 'ETOPO1_Ice_g_gmt4.nc').to_dask()
    da = ds[variable]  # Change the variable name as needed
    # Take the dimension names and chunk sizes from this variable itself, so the
    # cell is self-contained: it can be run without any other conversion cell
    # and re-run per variable without picking up chunks from a previous one.
    da = da.chunk(chunks=dict(zip(da.dims, da.encoding['chunksizes']))).data
    dsa.to_zarr(da, root + 'ETOPO1_Ice_g_gmt4.zarray', storage_options={'token': token})
del ds, da

## Present Timing Results

In [None]:
# One row per timed conversion: its conversionType label and the measured runtime.
df = diag_timer.dataframe()
df