In [0]:
%pip install xarray netcdf4

In [0]:
%sh pip list

In [0]:
%pip install psutil
%restart_python


In [0]:
import xarray as xr
import psutil
import os
import shutil
import time
import glob
import threading
import concurrent.futures

# Module-level stop flag, initially False.
# NOTE(review): presumably meant to be polled by the background monitor thread
# whose start-up is commented out in main() -- confirm before relying on it.
SHOULD_STOP = False
# Directory that copy_file() copies each file into; main() creates it if missing.
TARGET_DIR = '/local_disk0/era5'


def copy_file(file_path):
    """Copy one file into TARGET_DIR and check that the copy loads with xarray.

    Args:
        file_path: Path of the source file to copy.

    Returns:
        A ``(file_path, size, exception)`` tuple: the source path exactly as
        given, the size in bytes of the copied file, and either ``None`` when
        the copy opened and loaded cleanly or the exception raised while
        opening/loading it. When validation fails the copy is removed
        (best effort).

    Raises:
        OSError: If the copy itself or the size lookup fails; only errors
            raised while validating the copy are captured and returned.
    """
    # shutil.copy returns the path of the file it created. Do NOT rebuild the
    # destination with os.path.join(TARGET_DIR, file_path): when file_path is
    # absolute, join() discards TARGET_DIR and yields the *source* path, so the
    # validation -- and the cleanup os.remove below -- would act on the
    # original file instead of the copy.
    target_path = shutil.copy(file_path, TARGET_DIR)
    size = os.path.getsize(target_path)
    try:
        # The context manager closes the dataset even when load() raises.
        with xr.open_dataset(target_path) as ds:
            ds.load()
        return file_path, size, None
    except Exception as exception:
        # The copy could not be read back: drop it so a broken file is not
        # left behind. Removal is best effort; the validation error is what
        # gets reported to the caller.
        try:
            os.remove(target_path)
        except OSError:
            pass
        return file_path, size, exception


def main():
    """Copy every ERA5 daily-summary NetCDF file into TARGET_DIR, reporting progress.

    Lists ``*.nc`` files under the source volume, submits one ``copy_file``
    task per file to a thread pool, and prints the remaining file count, an
    estimated time left, and the cumulative throughput after each task
    completes. Sets the module-level ``SHOULD_STOP`` flag to True once all
    tasks have finished.

    Raises:
        Exception: Any exception raised inside ``copy_file`` outside of its own
            error capture is re-raised here by ``Future.result()``.
    """
    # Without this declaration the assignment at the end of the function would
    # only bind a local name and the module-level flag would never change.
    global SHOULD_STOP

    max_workers = 1
    os.makedirs(TARGET_DIR, exist_ok=True)

    start = time.time()
    era5_path_list = glob.glob('/Volumes/aer-processed/era5/daily_summary/*.nc')
    print(f'took {time.time()-start:.2f}s to read {len(era5_path_list)} files', flush=True)

    batch_start = time.time()
    total_bytes_copied = 0
    total_completed = 0
    # Pre-initialised so the progress line can be printed even if the rate
    # computation below is skipped (elapsed == 0).
    bandwidth = 0
    gbps = 0
    approx_time_left = 0

    print(f'max workers: {max_workers} -- ', end='', flush=True)

    #monitor_thread = threading.Thread(target=monitor_cpu, daemon=True)
    #monitor_thread.start()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        print('submitting copy commands', flush=True)
        futures = [executor.submit(copy_file, fp) for fp in era5_path_list]

        for f in concurrent.futures.as_completed(futures):
            file_path, file_size, exception = f.result()
            if exception:
                print(f'error when copying {file_path}: {exception}')
            total_completed += 1
            total_bytes_copied += file_size
            elapsed = time.time() - batch_start
            files_left = len(era5_path_list) - total_completed
            if elapsed > 0 and total_completed > 0:
                # MB/s uses 1024*1024 bytes; the "Gbps" figure uses 1024**3 bits.
                bandwidth = (total_bytes_copied / (1024 * 1024)) / elapsed
                gbps = (total_bytes_copied * 8 / (1024 * 1024 * 1024)) / elapsed
                avg_time_per_file = elapsed / total_completed
                approx_time_left = avg_time_per_file * files_left

            print(
                f'Files left: {files_left}, approx time left: {approx_time_left:.2f}s, bandwidth: {bandwidth:.2f} MB/s ({gbps:.2f} Gbps)', flush=True)

    SHOULD_STOP = True

# Start the copy job only when this code runs as the main module, not when it
# is imported from elsewhere.
if __name__ == '__main__':
    main()


In [0]:
# Look up the table 'experimental.my_schema.my_netcdf_blobs' via spark.table().
# NOTE(review): `spark` is not defined in the visible source -- presumably the
# session object injected by the notebook runtime; confirm.
df_read = spark.table('experimental.my_schema.my_netcdf_blobs')