In [1]:
from dask.distributed import Client, performance_report, get_task_stream
from dask.diagnostics import ProgressBar, ResourceProfiler

from sgkit.distance.api import pairwise_distance
import dask.array as da
import fsspec, zarr

# Notebook-level handle to the active Dask client; None until a cluster is started.
# get_dask_cluster() reads this global to decide whether an existing client
# must be shut down before a new one is created.
c = None

In [2]:
def get_dask_cluster(target="cpu"):
    """Start a fresh local Dask cluster and return a client connected to it.

    If the notebook-level client ``c`` is already set (truthy), it is shut
    down first and the function pauses before creating the replacement.
    This function does not rebind ``c``; the caller is expected to assign
    the returned client back to it.

    Parameters
    ----------
    target : str, default "cpu"
        ``"cpu"`` creates a default ``Client()``. Any other value creates a
        ``dask_cuda.LocalCUDACluster`` and connects a client to it.

    Returns
    -------
    Client
        Client connected to the newly created cluster.
    """
    # Imported locally because `time` is not imported at the top of the
    # notebook; without it the restart path below raises NameError.
    import time

    if c:
        c.shutdown()
        # NOTE(review): presumably gives the old scheduler/workers time to
        # release their ports before a new cluster starts - confirm the
        # 11 second value is still needed.
        time.sleep(11)
    if target == "cpu":
        print(f"Target: {target}")
        client = Client()
    else:
        # Imported lazily so CPU-only environments do not need dask_cuda.
        from dask_cuda import LocalCUDACluster
        cluster = LocalCUDACluster()
        client = Client(cluster)
    return client

## Create GPU Dask Cluster

In [3]:
# Start a GPU-backed cluster and keep its client in the notebook-level `c`,
# so a later get_dask_cluster() call can shut it down before starting a new one.
c = get_dask_cluster(target="gpu")
# Bare expression: displays the client summary as the cell output.
c

0,1
Client  Scheduler: tcp://127.0.0.1:43649  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 67.47 GB


## Load MalariaGEN Data

In [5]:
# Map the Ag1000G phase 2 AR1 zarr store (gs:// URL) and open it via its
# consolidated metadata.
store = fsspec.get_mapper(
    'gs://ag1000g-release/phase2.AR1/variation/main/zarr/all/ag1000g.phase2.ar1'
)
callset_snps = zarr.open_consolidated(store=store)
gt = callset_snps['2R/calldata/GT']

# Wrap the zarr array lazily, keep index 1 of the last axis, transpose, and
# rechunk so each chunk spans the full first axis and 100000 columns.
# NOTE(review): index 1 is presumably the second allele of each call - confirm.
gt_da = da.from_zarr(gt)
x = gt_da[:, :, 1].T.rechunk((-1, 100000))
x

Unnamed: 0,Array,Chunk
Bytes,28.28 GB,114.20 MB
Shape,"(1142, 24767689)","(1142, 100000)"
Count,5084 Tasks,248 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 28.28 GB 114.20 MB Shape (1142, 24767689) (1142, 100000) Count 5084 Tasks 248 Chunks Type int8 numpy.ndarray",24767689  1142,

Unnamed: 0,Array,Chunk
Bytes,28.28 GB,114.20 MB
Shape,"(1142, 24767689)","(1142, 100000)"
Count,5084 Tasks,248 Chunks
Type,int8,numpy.ndarray


In [6]:
def run_with_report(x, metric, target, report_name):
    """Compute a pairwise distance matrix while recording Dask diagnostics.

    Two HTML files are written to the current working directory:
    ``dask-report-{metric}-{report_name}.html`` (performance report) and
    ``task-stream-{metric}-{report_name}.html`` (task stream plot).

    Parameters
    ----------
    x : array
        Input passed straight through to ``pairwise_distance``.
    metric : str
        Distance metric name, forwarded to ``pairwise_distance`` and embedded
        in both output file names.
    target : str
        Forwarded to ``pairwise_distance`` as its ``target`` argument.
    report_name : str
        Suffix that distinguishes the report files of different runs.

    Returns
    -------
    None
        The computed result is discarded; only the timing and the reports
        are of interest.
    """
    report_file = f"dask-report-{metric}-{report_name}.html"
    stream_file = f"task-stream-{metric}-{report_name}.html"
    # plot="save" is required for get_task_stream to write `filename`; with
    # the default plot=False the filename is ignored and no file is produced.
    with performance_report(filename=report_file), get_task_stream(
        plot="save", filename=stream_file
    ):
        out = pairwise_distance(x, metric=metric, target=target)
        out.compute()

## Pairwise Distance: Euclidean Metric

In [8]:
%%time
# Euclidean run over the full rechunked array with target="gpu"; the
# performance report is written to dask-report-euclidean-full.html.
run_with_report(x, metric="euclidean", target="gpu", report_name="full")

CPU times: user 2min 48s, sys: 4.32 s, total: 2min 52s
Wall time: 5min 44s


## Pairwise Distance: Correlation Metric

In [9]:
%%time
# Correlation run over the full rechunked array with target="gpu"; the
# performance report is written to dask-report-correlation-full.html.
run_with_report(x, metric="correlation", target="gpu", report_name="full")

CPU times: user 10min 4s, sys: 10.5 s, total: 10min 15s
Wall time: 10min 52s
