# 10+ Minutes to Dask

<a href="https://colab.research.google.com/github/shauryashaurya/learn-data-munging/blob/main/03.001%20-%2010%2B%20minutes%20to%20dask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
import dask.array as da
import dask.bag as db

# Dask Objects

## Dask DataFrames

Dask DataFrames coordinate many Pandas DataFrames, partitioned along an index.  
They support a subset of the Pandas API.  


In [2]:
# dask dataframe
# from pandas
# Index for the pandas frame: 1000 consecutive hourly timestamps starting at
# 2023-05-06 00:00. The lower-case "h" alias is used because the upper-case
# "H" spelling is deprecated in pandas >= 2.2 (it emits a FutureWarning).
idx = pd.date_range("2023-05-06", periods=1000, freq="h")

In [3]:
idx

DatetimeIndex(['2023-05-06 00:00:00', '2023-05-06 01:00:00',
               '2023-05-06 02:00:00', '2023-05-06 03:00:00',
               '2023-05-06 04:00:00', '2023-05-06 05:00:00',
               '2023-05-06 06:00:00', '2023-05-06 07:00:00',
               '2023-05-06 08:00:00', '2023-05-06 09:00:00',
               ...
               '2023-06-16 06:00:00', '2023-06-16 07:00:00',
               '2023-06-16 08:00:00', '2023-06-16 09:00:00',
               '2023-06-16 10:00:00', '2023-06-16 11:00:00',
               '2023-06-16 12:00:00', '2023-06-16 13:00:00',
               '2023-06-16 14:00:00', '2023-06-16 15:00:00'],
              dtype='datetime64[ns]', length=1000, freq='H')

In [4]:
# plain pandas frame indexed by idx:
#   "a" - running counter 0..999
#   "b" - the labels a, b, c, d repeated 250 times
pd_df = pd.DataFrame(
    {
        "a": np.arange(1000),
        "b": list("abcd") * 250,
    },
    index=idx,
)

In [5]:
pd_df

Unnamed: 0,a,b
2023-05-06 00:00:00,0,a
2023-05-06 01:00:00,1,b
2023-05-06 02:00:00,2,c
2023-05-06 03:00:00,3,d
2023-05-06 04:00:00,4,a
...,...,...
2023-06-16 11:00:00,995,d
2023-06-16 12:00:00,996,a
2023-06-16 13:00:00,997,b
2023-06-16 14:00:00,998,c


In [6]:
# turn the pandas frame into a Dask DataFrame made of 10 partitions
dask_df = dd.from_pandas(pd_df, npartitions=10)

In [7]:
dask_df

Unnamed: 0_level_0,a,b
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-05-06 00:00:00,int32,object
2023-05-10 04:00:00,...,...
...,...,...
2023-06-12 12:00:00,...,...
2023-06-16 15:00:00,...,...


In [8]:
# divisions: the index values that bound the partitions
# (10 partitions -> 11 boundary timestamps)
dask_df.divisions

(Timestamp('2023-05-06 00:00:00'),
 Timestamp('2023-05-10 04:00:00'),
 Timestamp('2023-05-14 08:00:00'),
 Timestamp('2023-05-18 12:00:00'),
 Timestamp('2023-05-22 16:00:00'),
 Timestamp('2023-05-26 20:00:00'),
 Timestamp('2023-05-31 00:00:00'),
 Timestamp('2023-06-04 04:00:00'),
 Timestamp('2023-06-08 08:00:00'),
 Timestamp('2023-06-12 12:00:00'),
 Timestamp('2023-06-16 15:00:00'))

In [9]:
# pick a single partition by position (here the second one);
# the result is still a Dask DataFrame, now with npartitions=1
dask_df.partitions[1]

Unnamed: 0_level_0,a,b
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-05-10 04:00:00,int32,object
2023-05-14 08:00:00,...,...


In [10]:
# data types of each of the columns
dask_df.dtypes

a     int32
b    object
dtype: object

We can do regular Pandas stuff with Dask Dataframes now...

In [11]:
# label-based selection on the datetime index:
# keep the rows for the first 100 timestamps in idx
dask_df2 = dask_df.loc[idx[:100]]

In [12]:
dask_df2

Unnamed: 0_level_0,a,b
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-05-06 00:00:00,int32,object
2023-05-10 03:00:00,...,...


In [13]:
# perform analysis on the subset:
# count the rows in each group of column "b" (lazy - this only builds the task graph)
dask_df2_grpby_count = dask_df2.groupby("b").count()

In [14]:
# Dask evaluates lazily:
# nothing happens until we call .compute()
dask_df2_grpby_count.compute()

Unnamed: 0_level_0,a
b,Unnamed: 1_level_1
a,25
b,25
c,25
d,25


## Dask Arrays

Dask arrays coordinate many NumPy arrays, arranged into chunks within a grid.  
Dask arrays support a subset of the NumPy API.

In [15]:
# 200 x 500 grid holding the integers 0..99999 in row-major order
np_array = np.arange(200 * 500).reshape((200, 500))

In [16]:
# wrap the NumPy array as a Dask array cut into 100 x 100 blocks
# (2 blocks down x 5 blocks across = 10 chunks)
dask_array = da.from_array(np_array, chunks = (100,100))

In [17]:
dask_array

Unnamed: 0,Array,Chunk
Bytes,390.62 kiB,39.06 kiB
Shape,"(200, 500)","(100, 100)"
Dask graph,10 chunks in 1 graph layer,10 chunks in 1 graph layer
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 390.62 kiB 39.06 kiB Shape (200, 500) (100, 100) Dask graph 10 chunks in 1 graph layer Data type int32 numpy.ndarray",500  200,

Unnamed: 0,Array,Chunk
Bytes,390.62 kiB,39.06 kiB
Shape,"(200, 500)","(100, 100)"
Dask graph,10 chunks in 1 graph layer,10 chunks in 1 graph layer
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [18]:
# block sizes along each axis: (sizes along rows, sizes along columns)
dask_array.chunks

((100, 100), (100, 100, 100, 100, 100))

In [19]:
# .blocks indexes by block position, not by element:
# block-row 1, block-column 3 -> a single 100 x 100 chunk
dask_array.blocks[1,3]

Unnamed: 0,Array,Chunk
Bytes,39.06 kiB,39.06 kiB
Shape,"(100, 100)","(100, 100)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 39.06 kiB 39.06 kiB Shape (100, 100) (100, 100) Dask graph 1 chunks in 2 graph layers Data type int32 numpy.ndarray",100  100,

Unnamed: 0,Array,Chunk
Bytes,39.06 kiB,39.06 kiB
Shape,"(100, 100)","(100, 100)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray


In [20]:
# let's play with a slightly more interesting example
# x is a 100 x 100 matrix of random numbers, stored as 10 x 10 chunks.
# A seeded RandomState is used so that "Restart & Run All" reproduces the
# same numbers, and therefore the same z1 / z2 results further down.
x = da.random.RandomState(42).random_sample((100, 100), chunks=(10, 10))

In [21]:
x

Unnamed: 0,Array,Chunk
Bytes,78.12 kiB,800 B
Shape,"(100, 100)","(10, 10)"
Dask graph,100 chunks in 1 graph layer,100 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 78.12 kiB 800 B Shape (100, 100) (10, 10) Dask graph 100 chunks in 1 graph layer Data type float64 numpy.ndarray",100  100,

Unnamed: 0,Array,Chunk
Bytes,78.12 kiB,800 B
Shape,"(100, 100)","(10, 10)"
Dask graph,100 chunks in 1 graph layer,100 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [22]:
# operations just like Numpy
# adding x to its transpose only extends the task graph; y is still lazy
y = x + x.T
y

Unnamed: 0,Array,Chunk
Bytes,78.12 kiB,800 B
Shape,"(100, 100)","(10, 10)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 78.12 kiB 800 B Shape (100, 100) (10, 10) Dask graph 100 chunks in 3 graph layers Data type float64 numpy.ndarray",100  100,

Unnamed: 0,Array,Chunk
Bytes,78.12 kiB,800 B
Shape,"(100, 100)","(10, 10)"
Dask graph,100 chunks in 3 graph layers,100 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [23]:
# y[::2, 50:] keeps every second row and the columns from 50 onwards (a 50 x 50 slice)
# z1: mean over axis 0 -> one value per remaining column
# z2: mean over axis 1 -> one value per remaining row
z1 = y[::2, 50:].mean(axis=0)
z2 = y[::2, 50:].mean(axis=1)

In [24]:
z1

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50,)","(10,)"
Dask graph,5 chunks in 7 graph layers,5 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 400 B 80 B Shape (50,) (10,) Dask graph 5 chunks in 7 graph layers Data type float64 numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50,)","(10,)"
Dask graph,5 chunks in 7 graph layers,5 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [25]:
# to actually compute z1, let's use .compute()
z1.compute()

array([0.9762545 , 1.03572593, 0.97806574, 1.13349766, 1.0540171 ,
       0.97040159, 0.92104922, 1.01397592, 0.9084497 , 1.03117358,
       1.03013133, 1.08811395, 1.01318829, 1.01110945, 1.00573487,
       1.18655799, 0.92192718, 1.02198486, 0.96483665, 1.00945573,
       0.88939484, 1.0271935 , 0.99454104, 0.98301834, 0.94208149,
       0.94500553, 1.0903829 , 0.99807706, 0.85378371, 1.0188624 ,
       1.0378555 , 0.88875267, 1.07715284, 1.07249123, 1.04990734,
       0.97158336, 1.01295687, 0.87130616, 1.08101291, 1.03526235,
       0.9724833 , 0.94657702, 0.98617538, 0.91101613, 1.09580199,
       0.98542156, 0.98772736, 1.00386451, 1.0024587 , 1.07317316])

In [26]:
z2

Unnamed: 0,Array,Chunk
Bytes,400 B,40 B
Shape,"(50,)","(5,)"
Dask graph,10 chunks in 7 graph layers,10 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 400 B 40 B Shape (50,) (5,) Dask graph 10 chunks in 7 graph layers Data type float64 numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,400 B,40 B
Shape,"(50,)","(5,)"
Dask graph,10 chunks in 7 graph layers,10 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [27]:
z2.compute()

array([0.97311589, 0.95285174, 1.00106068, 1.03975061, 0.97905205,
       1.02144948, 1.00934132, 0.89756189, 1.03855542, 1.07249502,
       0.97666846, 1.07064896, 1.02684477, 0.95244938, 1.02983345,
       1.03670556, 0.95124203, 0.95370923, 1.08869905, 0.97149896,
       1.07679346, 1.04285809, 1.06514418, 0.95767533, 1.05340805,
       1.02157472, 0.99839404, 1.01811581, 0.85515696, 1.01146911,
       0.94582088, 0.99675665, 1.06061824, 0.97271197, 1.04350138,
       0.98494545, 0.89892522, 0.93691227, 1.09090741, 0.93640208,
       1.02476913, 1.03497728, 1.01025587, 0.98708791, 1.07967409,
       1.00176401, 0.91369194, 1.05442214, 0.98398051, 0.9787243 ])

## Dask Bag

A Bag is an unordered collection of objects that allows repeats. Use it for semi-structured or unstructured data.  
It's fun but slower than dataframes and arrays.  
The [examples](https://examples.dask.org/bag.html) page is really interesting.

In [28]:
# a tiny bag: the integers 1..9 followed by 0, spread over two partitions
dask_bag = db.from_sequence([*range(1, 10), 0], npartitions=2)

In [29]:
dask_bag

dask.bag<from_sequence, npartitions=2>

In [30]:
dask_bag.take(2)

(1, 2)

In [31]:
# dask is lazy - take() pulls its values from a single partition
# instead of evaluating the whole bag
dask_bag.filter(lambda value: value > 3).take(2)

(4, 5)

In [32]:
# compute() evaluates every partition and returns ALL matching items
dask_bag.filter(lambda value: value > 3).compute()

[4, 5, 6, 7, 8, 9]

In [33]:
# square every item, then look at the first five results
dask_bag.map(lambda item: item * item).take(5)

(1, 4, 9, 16, 25)

In [34]:
dask_bag.count().compute()

10

In [35]:
# convert to a dask dataframe
# this is a trivial example: the bag holds plain integers,
# so the resulting frame has a single column (named 0)
dask_df_from_bag = dask_bag.to_dataframe()

In [36]:
dask_df_from_bag

Unnamed: 0_level_0,0
npartitions=2,Unnamed: 1_level_1
,int64
,...
,...


### Build bag with complex json and convert to dataframe
* Step 1: define a 'flatten' function
* Step 2: map 'flatten' to the bag
* Step 3: convert the flattened bag to dataframe using bag_instance.to_dataframe()

Using example from https://examples.dask.org/bag.html

#### Create Random Data

In [39]:
import json
import os

In [41]:
# create the output folder (including ./data) if it is missing;
# exist_ok=True keeps this cell safe to re-run
os.makedirs("./data/dask-bag-example-01", exist_ok = True)

In [47]:
# generate a bag of random "person" records (nested dicts) to use as sample data
b = dask.datasets.make_people()

In [49]:
# serialise every record to a JSON string and write the bag to disk;
# the * in the path is replaced per output file (0.json ... 9.json below)
b.map(json.dumps).to_textfiles("./data/dask-bag-example-01/*.json")

['D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/0.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/1.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/2.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/3.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/4.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/5.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/6.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/7.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/8.json',
 'D:/2/shaurya-lab/learn-data-munging/data/dask-bag-example-01/9.json']

#### Read JSON Data

In [61]:
# peek at the first record of the first file
# (pure Python, so the cell behaves the same on Windows and Linux,
#  unlike shell commands such as `more` / `head`)
with open("./data/dask-bag-example-01/0.json", encoding="utf-8") as f:
    print(f.readline())

In [62]:
# load all files matching the pattern as text and parse each item with json.loads;
# the result is again a lazy bag (see the repr below)
b = db.read_text('./data/dask-bag-example-01/*.json').map(json.loads)
b

dask.bag<loads, npartitions=10>

In [63]:
b.take(2)

({'age': 19,
  'name': ['Sebastian', 'Harding'],
  'occupation': 'Materials Manager',
  'telephone': '+13360560285',
  'address': {'address': '100 Gilroy Trace', 'city': 'Kiryas Joel'},
  'credit-card': {'number': '4618 3864 8470 9901',
   'expiration-date': '10/19'}},
 {'age': 28,
  'name': ['Guy', 'Middleton'],
  'occupation': 'Purchasing Assistant',
  'telephone': '+1-720-180-9954',
  'address': {'address': '529 Warren Lane', 'city': 'Maple Grove'},
  'credit-card': {'number': '2485 0396 8412 5196',
   'expiration-date': '04/24'}})