# Create Data

In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import os


# Output folders: one CSV and one Parquet directory per dataset
# (atmc10 / atmc50). genData10 and genData50 write their partitions here.
dataDirs = ["data/csv/atmc10", "data/parq/atmc10",
            "data/csv/atmc50", "data/parq/atmc50"]
for dataDir in dataDirs:
    # exist_ok=True keeps the cell re-runnable and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(dataDir, exist_ok=True)

def parallelize(fun, vec, pool):
    """Map ``fun`` over ``vec`` using a pool of worker processes.

    Parameters
    ----------
    fun : callable
        Applied to each element of ``vec``. It is shipped to the worker
        processes, so it has to be picklable.
    vec : iterable
        Items to process.
    pool : int or None
        Number of worker processes passed to ``multiprocessing.Pool``.

    Returns
    -------
    list
        One result per element of ``vec``, in input order.
    """
    with mp.Pool(processes=pool) as workers:
        results = workers.map(fun, vec)
    return results

def _genData(x, name):
    """Write one partition of dataset ``name`` as both CSV and Parquet.

    Parameters
    ----------
    x : sequence
        ``x[0]`` is the zero-based partition index; ``x[1]`` is the key
        stored in every row of the partition.
    name : str
        Dataset folder name (e.g. ``"atmc10"``). Files are written to
        ``data/csv/<name>`` and ``data/parq/<name>``, which must already exist.

    Notes
    -----
    Reads the module-level ``N`` (rows per partition) at call time.
    """
    idx, key = x[0], x[1]
    # Partition idx holds the N consecutive integers [idx*N, (idx+1)*N),
    # so "values" never overlaps between partitions.
    df = pd.DataFrame({"key": key,
                       "values": np.arange(idx*N, (idx+1)*N)})
    # File names are 1-based and zero-padded: part00001, part00002, ...
    fn = "part{:05}".format(idx+1)
    df.to_csv(os.path.join("data/csv/" + name, fn)+".csv", index=False)
    df.to_parquet(os.path.join("data/parq/" + name, fn)+".parq")


def genData10(x):
    """Write partition ``x = [index, key]`` of the atmc10 dataset (CSV + Parquet)."""
    _genData(x, "atmc10")


def genData50(x):
    """Write partition ``x = [index, key]`` of the atmc50 dataset (CSV + Parquet)."""
    _genData(x, "atmc50")

In [2]:
# Rows per partition file; genData10 / genData50 read this global when called.
N = 1000
# Number of worker processes for parallelize(): the CPU count reported by the OS.
ncpu = os.cpu_count()

In [3]:
%%time
# One [index, key] pair per partition: 10,000 partitions keyed "A00001".."A10000".
lst = [[part, "A{:05}".format(part+1)] for part in range(10000)]
# Write all partitions in parallel. genData10 returns nothing, so `out` is
# just a list of None.
out = parallelize(genData10, lst, ncpu)

CPU times: user 37.9 ms, sys: 27.4 ms, total: 65.3 ms
Wall time: 32.5 s


In [4]:
%%time
# One [index, key] pair per partition: 50,000 partitions keyed "A00001".."A50000".
lst = [[part, "A{:05}".format(part+1)] for part in range(50000)]
# Write all partitions in parallel. genData50 returns nothing, so `out` is
# just a list of None.
out = parallelize(genData50, lst, ncpu)

CPU times: user 176 ms, sys: 68.2 ms, total: 244 ms
Wall time: 2min 43s


# Combine the Partitions Into One File

In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [2]:
# Start a local Dask cluster with default settings and connect a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Bare last expression so the notebook displays the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:39423  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.52 GB


In [3]:
# Partition folders, by index:
#   0 -> CSV atmc10, 1 -> Parquet atmc10, 2 -> CSV atmc50, 3 -> Parquet atmc50
fldrs = [
    "data/csv/atmc10",
    "data/parq/atmc10",
    "data/csv/atmc50",
    "data/parq/atmc50",
]

## 10M rows (atmc10)

In [None]:
%%time
# Open every Parquet partition in the atmc10 folder as one Dask DataFrame.
# dask.dataframe provides read_parquet; there is no read_parq function.
df = dd.read_parquet(fldrs[1]+"/*")

In [None]:
%%time
# to pandas: compute() materialises the Dask DataFrame in memory.
# NOTE(review): `df` is rebound here, so the cell is not idempotent --
# running it twice would call compute() on the already-computed result.
df = df.compute()

In [None]:
%%time
# Save the combined 10M-row frame as a single Parquet file and a single CSV
# (the CSV omits the index column).
# NOTE(review): "SRTD" presumably means "sorted", i.e. rows in the order
# they were read -- confirm.
df.to_parquet("data/file10SRTD.parq")
df.to_csv("data/file10SRTD.csv", index=False)

In [None]:
%%time
# Shuffle every row (frac=1 samples the whole frame without replacement)
# and renumber the index. The fixed seed makes the shuffled output files
# reproducible from run to run.
df = (
    df.sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

In [None]:
# Save the shuffled 10M-row frame as a single Parquet file and a single CSV
# (the CSV omits the index column).
df.to_parquet("data/file10SHFFL.parq")
df.to_csv("data/file10SHFFL.csv", index=False)

## 50M rows (atmc50)

In [4]:
%%time
# Open every Parquet partition in the atmc50 folder as one Dask DataFrame.
df = dd.read_parquet(fldrs[3]+"/*")

CPU times: user 23.5 s, sys: 2.06 s, total: 25.6 s
Wall time: 25 s


In [5]:
%%time
# to pandas: compute() materialises the Dask DataFrame in memory.
# NOTE(review): `df` is rebound here, so the cell is not idempotent --
# running it twice would call compute() on the already-computed result.
df = df.compute()

CPU times: user 3min 47s, sys: 15.6 s, total: 4min 3s
Wall time: 5min 36s


In [None]:
%%time
# Save the combined 50M-row frame as a single Parquet file and a single CSV
# (the CSV omits the index column).
# NOTE(review): "SRTD" presumably means "sorted", i.e. rows in the order
# they were read -- confirm.
df.to_parquet("data/file50SRTD.parq")
df.to_csv("data/file50SRTD.csv", index=False)

In [None]:
%%time
# Shuffle every row (frac=1 samples the whole frame without replacement)
# and renumber the index. The fixed seed makes the shuffled output files
# reproducible from run to run.
df = (
    df.sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

In [None]:
# Save the shuffled 50M-row frame as a single Parquet file and a single CSV
# (the CSV omits the index column).
df.to_parquet("data/file50SHFFL.parq")
df.to_csv("data/file50SHFFL.csv", index=False)