# Generate Data

In [3]:
import pandas as pd
import numpy as np
import string
import itertools
import multiprocessing as mp
import os

In [4]:
def parallelize(fun, vec, pool):
    """Apply ``fun`` to every element of ``vec`` using a pool of worker processes.

    Parameters
    ----------
    fun : callable
        Function taking one element of ``vec``; handed to ``Pool.map``.
    vec : iterable
        Items to process.
    pool : int
        Number of worker processes passed to ``mp.Pool``.

    Returns
    -------
    list
        The results of ``fun`` in the same order as ``vec``.
    """
    workers = mp.Pool(pool)
    # The context manager tears the pool down once the map has finished.
    with workers:
        results = workers.map(fun, vec)
    return results

def genData(x):
    """Write one synthetic partition to disk as both CSV and Parquet.

    Parameters
    ----------
    x : sequence
        ``x[0]`` is the zero-based partition index and ``x[1]`` is the key
        stored in every row of the partition.

    Notes
    -----
    Reads the module-level ``N`` (rows per partition) at call time, so ``N``
    must be defined before this function is invoked. The ``values`` column
    holds the ``N`` consecutive integers starting at ``x[0] * N``.
    Files are named ``partNNNNN`` using the one-based partition number.
    """
    idx, key = x[0], x[1]
    df = pd.DataFrame({"key": key,
                       "values": np.arange(idx*N, (idx+1)*N)})
    fldr1 = "datasets/atmcCSV/"
    fldr2 = "datasets/atmcPARQ/"
    # Make sure the output folders exist; exist_ok=True keeps this safe when
    # several worker processes try to create them at the same time.
    os.makedirs(fldr1, exist_ok=True)
    os.makedirs(fldr2, exist_ok=True)
    fn = "part{:05}".format(idx+1)
    df.to_csv(fldr1+fn+".csv", index=False)
    df.to_parquet(fldr2+fn+".parq")

In [3]:
# One [index, key] pair per ordered triple of distinct uppercase letters.
lst = [
    [idx, "".join(letters)]
    for idx, letters in enumerate(itertools.permutations(string.ascii_uppercase, 3))
]

# Number of rows genData writes into each partition.
N = 1000

# Pool size used when generating the partitions in parallel.
ncpu = os.cpu_count()

In [4]:
%%time
# Generate every partition in parallel: one genData call per [index, key] entry of lst,
# spread over ncpu worker processes. genData has no return statement, so `out` is a list of None.
out = parallelize(genData, lst, ncpu)

CPU times: user 47.4 ms, sys: 25.3 ms, total: 72.8 ms
Wall time: 51 s


# Process Data

In [50]:
import numpy as np
import pandas as pd
import time
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [51]:
# Create a LocalCluster with default settings and attach a distributed Client to it.
cluster = LocalCluster()
client = Client(cluster)

In [52]:
# Display the client summary (scheduler address, dashboard link, workers, cores, memory).
client

0,1
Client  Scheduler: tcp://127.0.0.1:40489  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.52 GB


In [8]:
%%time
# Build a Dask dataframe over all CSV partitions matched by the glob;
# the aggregation itself only runs when compute() is called later.
df = dd.read_csv("datasets/atmcCSV/*.csv")

CPU times: user 5.84 s, sys: 211 ms, total: 6.05 s
Wall time: 5.97 s


In [9]:
# Display the Dask dataframe's structure (columns, dtypes, number of partitions).
df

Unnamed: 0_level_0,key,values
npartitions=15600,Unnamed: 1_level_1,Unnamed: 2_level_1
,object,int64
,...,...
...,...,...
,...,...
,...,...


In [12]:
%%time
# Define the mean of "values" per "key"; the result is materialised by compute() in the next cell.
out = df.groupby("key")["values"].mean()

CPU times: user 276 ms, sys: 2.09 ms, total: 278 ms
Wall time: 276 ms


In [13]:
%%time
# Execute the groupby-mean on the cluster and replace the lazy object with its result.
out = out.compute()

CPU times: user 2min 34s, sys: 4.19 s, total: 2min 38s
Wall time: 4min 10s


In [14]:
# Restart the Dask client — presumably so the Parquet benchmark below starts from clean workers (confirm).
client.restart()

0,1
Client  Scheduler: tcp://127.0.0.1:44585  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.52 GB


In [19]:
%%time
# Build a Dask dataframe over all Parquet partitions matched by the glob (replaces the CSV-backed `df`).
df = dd.read_parquet("datasets/atmcPARQ/*.parq")

CPU times: user 6.83 s, sys: 633 ms, total: 7.46 s
Wall time: 7.25 s


In [20]:
# Display the Parquet-backed Dask dataframe's structure (columns, dtypes, number of partitions).
df

Unnamed: 0_level_0,key,values
npartitions=15600,Unnamed: 1_level_1,Unnamed: 2_level_1
,object,int64
,...,...
...,...,...
,...,...
,...,...


In [21]:
%%time
out = df.groupby("key")["values"].mean()

out = out.compute()

CPU times: user 1min 43s, sys: 5.51 s, total: 1min 49s
Wall time: 2min 57s


## Using multiprocessing

In [47]:
def fun1CSV(x):
    """Summarise one CSV partition as ``[key, mean of values]``.

    Parameters
    ----------
    x : str or path-like
        Path of a CSV file with a ``key`` column and a ``values`` column.

    Returns
    -------
    list
        ``[first unique key in the file, mean of the "values" column]``.
    """
    # Read "key" as text explicitly. With per-file dtype inference, a column
    # made only of a token such as "INF" is parsed as float infinity, which
    # silently turns the key into inf instead of the original string.
    df = pd.read_csv(x, dtype={"key": str})
    return [df["key"].unique()[0], df["values"].mean()]

def fun1PARQ(x):
    """Summarise one Parquet partition as ``[key, mean of values]``.

    Parameters
    ----------
    x : str or path-like
        Path of a Parquet file with a ``key`` column and a ``values`` column.

    Returns
    -------
    list
        ``[first unique key in the file, mean of the "values" column]``.
    """
    frame = pd.read_parquet(x)
    first_key = frame["key"].unique()[0]
    mean_value = frame["values"].mean()
    return [first_key, mean_value]

# Worker count for the multiprocessing runs below.
# NOTE(review): `os` is not imported in this section's import cell; this line relies on the
# import in the first cell of the notebook having been executed.
ncpu = os.cpu_count()

In [9]:
# Directories holding the CSV and Parquet partitions (the same paths genData writes to).
fldr1 = "datasets/atmcCSV/"
fldr2 = "datasets/atmcPARQ/"

In [44]:
%%time
files = [os.path.join(fldr1, x) for x in os.listdir(fldr1)]
out = parallelize(fun1CSV, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 119 ms, sys: 24.4 ms, total: 144 ms
Wall time: 20.4 s


In [43]:
# Preview the first rows of the per-key means.
out.head()

Unnamed: 0_level_0,values
key,Unnamed: 1_level_1
ABC,499.5
ABD,1499.5
ABE,2499.5
ABF,3499.5
ABG,4499.5


In [48]:
%%time
files = [os.path.join(fldr2, x) for x in os.listdir(fldr2)]
out = parallelize(fun1PARQ, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 103 ms, sys: 43.8 ms, total: 147 ms
Wall time: 25.1 s


Unnamed: 0_level_0,values
key,Unnamed: 1_level_1
ABC,499.5
ABD,1499.5
ABE,2499.5
ABF,3499.5
ABG,4499.5


In [53]:
# Re-create the CSV-backed Dask dataframe for the map_partitions experiment below.
df = dd.read_csv("datasets/atmcCSV/*.csv")

In [55]:
%%time
# Per-partition summary via Dask: each partition is reduced to a [first unique key, mean of values] list.
# NOTE(review): the function returns a plain list rather than a DataFrame/Series and no `meta`
# is passed — confirm that Dask's metadata inference and the later compute() handle this as intended.
out = df.map_partitions(lambda x:[x["key"].unique()[0], x["values"].mean()])

CPU times: user 42.6 ms, sys: 4.71 ms, total: 47.3 ms
Wall time: 53.2 ms


In [57]:
# Execute the map_partitions graph defined in the previous cell.
out =  out.compute()

In [None]:
# Replaces the lazy Dask dataframe `df` with the result of compute().
# NOTE(review): this cell has no execution count (In [None]), so it appears never to have
# been run — confirm it is still needed.
df = df.compute()