# File Transformations

## For Use in Cloud Data Throughput Benchmarking

In [1]:
import pandas as pd
import time
import dask.dataframe as dd
import dask.array as dsa
import zarr
import xarray as xr
import numpy as np
import intake
from contextlib import contextmanager
import gcsfs

In [None]:
# Credentials token path. Required for writing data into your bucket when the
# bucket does not have public write access.
token = "/path/to/token.json"

Note: The `name_function` used below does not produce file names that sort in partition order. Therefore, when using this method to split up CSV files into partitions of the same (or a different) file type, make sure the naming function generates names that sort in partition order (for example, by zero-padding the partition index).

In this instance, since these files will be used to measure read speed, the order that the files are concatenated by Dask when they are called into the timing program does not matter. If this method is being used for machine learning or data analysis, it might be a good idea to preserve the partition order.

## Timing Setup

We will be using the same diagnostic timer as seen in the Transfer Speeds notebook. In this case, it will keep track of how long the file conversion process takes. Note that some files will have to be loaded locally in order to convert, given the limitations of the Python libraries used to facilitate the conversions.

In [2]:
class DiagnosticTimer:
    """Collect wall-clock runtimes of labelled code sections.

    Every ``with timer.time(...)`` section that completes appends one record
    to ``diagnostics``: the keyword arguments passed to ``time`` plus a
    ``"runtime"`` entry holding the elapsed time in seconds.
    """

    def __init__(self) -> None:
        # One dict per completed timed section, in completion order.
        self.diagnostics = []

    @contextmanager
    def time(self, **kwargs):
        """Time the body of a ``with`` block and record the measurement.

        Args:
            **kwargs: Arbitrary labels stored alongside the measurement,
                e.g. ``conversionType='csv2partcsv'``.

        A record is appended only when the body finishes without raising;
        an exception propagates unchanged and nothing is recorded for it.
        """
        # perf_counter is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way
        # time.time() can.
        tic = time.perf_counter()
        yield
        toc = time.perf_counter()
        kwargs["runtime"] = toc - tic
        self.diagnostics.append(kwargs)

    def dataframe(self) -> pd.DataFrame:
        """Return the recorded measurements as a DataFrame, one row each."""
        return pd.DataFrame(self.diagnostics)


# Module-level timer instance.
diag_timer = DiagnosticTimer()

In [3]:
# Column labels applied to the CSV data. If the file has no column names,
# Dask/Pandas would otherwise use the first line of data as the column names.
names = ["lon", "lat", "z"]

## CSV to Partitioned Parquets

In [4]:
def name_function(x):
    """Return the Parquet file name for partition index ``x``.

    The index is inserted as-is (no zero padding), so the resulting names do
    not sort lexicographically in partition order once there are ten or more
    partitions.
    """
    return f"ETOPO1_Ice_g_gmt4_{x}.parquet"

# Time reading the CSV from the bucket and writing it back out as partitioned
# Parquet files named by name_function.
with diag_timer.time(conversionType='csv2partparqet'):
    df = dd.read_csv(
        'gs://path/to/bucket/ETOPO1_Ice_g_gmt4.csv',
        header=None,
        names=names,
        assume_missing=True,
    )
    df.to_parquet(
        'gs://path/to/bucket/parquetpartitions',
        storage_options={'token': token},
        name_function=name_function,
    )
# Drop the reference once the conversion has been timed.
del df

## CSV to One Parquet File

Note that using this method requires that the CSV and output Parquet file are stored on a local disk. You cannot read and write directly from cloud storage using Pandas. Therefore, the time it takes to read and write this file will not be accurate.

In [5]:
# Replace both paths with your own local file paths, and make sure the
# requested Parquet engine is installed in your environment.
with diag_timer.time(conversionType='csv2parquet_local'):
    df = pd.read_csv(
        '/local/file/path/ETOPO1_Ice_g_gmt4.csv',
        names=names,
        header=None,
    )
    df.to_parquet(
        '/local/file/path/ETOPO1_Ice_g_gmt4.parquet',
        engine='fastparquet',
    )
# Drop the reference once the conversion has been timed.
del df

## CSV to Partitioned CSVs

In [6]:
def name_function(i):
    """Build the CSV file name for partition index ``i``."""
    return f"ETOPO1_Ice_g_gmt4_{i}.csv"

# Time reading the CSV from the bucket and writing it back out as partitioned
# CSV files named by name_function.
with diag_timer.time(conversionType='csv2partcsv'):
    df = dd.read_csv(
        'gs://path/to/bucket/ETOPO1_Ice_g_gmt4.csv',
        header=None,
        names=names,
        assume_missing=True,
    )
    df.to_csv(
        'gs://path/to/bucket/csvpartitions',
        storage_options={'token': token},
        name_function=name_function,
    )
# Drop the reference once the conversion has been timed.
del df

In order to preserve order between partitions, paths created with ``name_function`` should sort to partition order.


## NetCDF to Zarr

### Zarr Group

This approach uses Xarray to store the contents of the NetCDF file within a Zarr group. Note that there is no method of retrieving the NetCDF file directly from cloud storage. Writing consolidated metadata is recommended for maximum read speedup.

In [7]:
# Time opening the NetCDF file through intake and writing the resulting
# dataset to a Zarr store with consolidated metadata.
with diag_timer.time(conversionType='netcdf2zgroup'):
    ds = intake.open_netcdf(
        'gs://path/to/bucket/ETOPO1_Ice_g_gmt4.nc'
    ).to_dask()
    ds.to_zarr(
        store='gs://path/to/bucket/ETOPO1_Ice_g_gmt4.zarr',
        consolidated=True,
        storage_options={'token': token},
    )
# Drop the reference once the conversion has been timed.
del ds

### Zarr Array

In [8]:
# Time opening the NetCDF file through intake and writing its data out as a
# single Zarr array.
with diag_timer.time(conversionType='netcdf2zarray'):
    ds = intake.open_netcdf(
        'gs://path/to/bucket/ETOPO1_Ice_g_gmt4.nc'
    ).to_dask()
    # Dataset -> DataArray -> underlying array data -> Dask array.
    darray = ds.to_array()
    data = darray.data
    da = dsa.from_array(data)
    da.to_zarr(
        'gs://path/to/bucket/ETOPO1_Ice_g_gmt4.zarray',
        storage_options={'token': token},
    )
# Drop every intermediate created for this conversion.
del ds, darray, data, da

## Present Timing Results

In [9]:
# Gather every measurement recorded by diag_timer into a DataFrame.
df = diag_timer.dataframe()
# Bare trailing expression so the notebook displays the table.
df

Unnamed: 0,conversionType,runtime
0,csv2partparqet,66.089746
1,csv2parquet_local,125.892774
2,csv2partcsv,909.665932
3,netcdf2zgroup,82.143818
4,netcdf2zarray,24.640987
