# Intro

What is [Dask](https://dask.org/)?

# Environment

## Load 
```bash
conda env create -f env_environment.yml
```

## List of packages
```bash
conda create -n pydata_stg python=3.7 --channel conda-forge
conda activate pydata_stg
conda config --env --add channels conda-forge
conda install dask
conda install holidays
conda install ipykernel
pip install d6tstack # make a conda package
python -m ipykernel install --user --name pydata_stg
```

In [1]:
from dask import delayed, compute
import dask.dataframe as dd
import pandas as pd
import os
import numpy as np

# `dask.delayed`

See [this](https://stackoverflow.com/questions/58960985/how-to-separate-files-using-dask-groupby-on-a-column/58975835#58975835)

In [2]:
fldr_raw = "datasets/dummy/raw"
os.makedirs(fldr_raw, exist_ok=True)

In [8]:
def generate_df(n, N):
    df = pd.DataFrame({"id":np.random.randint(10,100,N),
                       "value":np.random.rand(N)})
    df["value"] = df["id"]*df["value"]
    fn = f'{fldr_raw}/file_{n:02d}.csv'
    df.to_csv(fn, index=False)

In [10]:
%%time
N = int(1e6)
for i in range(10):
    generate_df(i+1, N)

CPU times: user 40 s, sys: 393 ms, total: 40.4 s
Wall time: 40.4 s


In [17]:
from dask.distributed import Client

client = Client(n_workers=4)

In [52]:
@delayed
def generate_df(n, N):
    df = pd.DataFrame({"id":np.random.randint(10,100,N),
                       "value":np.random.rand(N)})
    df["value"] = df["id"]+df["value"]
    fn = f'{fldr_raw}/file_{n:02d}.csv'
    df.to_csv(fn, index=False)

In [53]:
%%time
N = int(1e6)
to_process = []
for i in range(10):
    to_process.append(generate_df(i+1, N))

CPU times: user 2.62 ms, sys: 0 ns, total: 2.62 ms
Wall time: 3.75 ms


In [54]:
%%time
out = compute(to_process)

CPU times: user 2.19 s, sys: 187 ms, total: 2.37 s
Wall time: 22.2 s


## `multiprocessing`

In [3]:
import multiprocessing as mp
from tqdm import tqdm

def parallelize(fun, vec, cores=mp.cpu_count(), pbar=True):
    with mp.Pool(cores) as p:
        if pbar is True:
            res = list(tqdm(p.imap(fun, vec),
                            total=len(vec)))
        else:
            res = p.map(fun, vec)
    return res

In [4]:
def generate_df_par(x):
    n, N = x
    df = pd.DataFrame({"id":np.random.randint(10,100,N),
                       "value":np.random.rand(N)})
    df["value"] = df["id"]*df["value"]
    fn = f'{fldr_raw}/file_{n:02d}.csv'
    df.to_csv(fn, index=False)

In [5]:
N = int(1e6)
vec = [(i+1, N) for i in range(10)]

In [6]:
%%time
out = parallelize(generate_df_par, vec)

100%|██████████| 10/10 [00:19<00:00,  1.98s/it]

CPU times: user 54.5 ms, sys: 5.88 ms, total: 60.4 ms
Wall time: 20 s





# `dask.dataframe`

In [21]:
client.restart()



0,1
Client  Scheduler: tcp://127.0.0.1:42889  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.52 GB


In [40]:
%%time
df = dd.read_csv(f'{fldr_raw}/*.csv')

CPU times: user 17.3 ms, sys: 3.14 ms, total: 20.4 ms
Wall time: 21.1 ms


In [31]:
df

Unnamed: 0_level_0,id,value
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1
,int64,float64
,...,...
...,...,...
,...,...
,...,...


In [32]:
df.head()

Unnamed: 0,id,value
0,74,67.97041
1,44,24.071932
2,23,9.312546
3,99,25.188768
4,59,10.212522


In [33]:
%%time
df = df.compute()

CPU times: user 687 ms, sys: 357 ms, total: 1.04 s
Wall time: 2.28 s


In [55]:
%%time
df = dd.read_csv(f'{fldr_raw}/*.csv')

CPU times: user 18.5 ms, sys: 3.65 ms, total: 22.1 ms
Wall time: 23.3 ms


In [56]:
type(df)

dask.dataframe.core.DataFrame

In [57]:
%%time
out = df.groupby("id")["value"].mean()

CPU times: user 13.8 ms, sys: 182 µs, total: 14 ms
Wall time: 13.4 ms


In [58]:
%%time
out = out.compute()

CPU times: user 554 ms, sys: 35.3 ms, total: 590 ms
Wall time: 2 s


In [59]:
out

id
10    10.499807
11    11.501002
12    12.501151
13    13.499577
14    14.499517
        ...    
95    95.501972
96    96.499668
97    97.500471
98    98.499381
99    99.499715
Name: value, Length: 90, dtype: float64