# Create Data

In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import os


# Output folders for the per-part files: one CSV folder and one Parquet folder
# for each of the two generated datasets (atmc10 and atmc50).
dataDirs = ["data/csv/atmc10", "data/parq/atmc10",
            "data/csv/atmc50", "data/parq/atmc50"]
for dataDir in dataDirs:
    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(dataDir, exist_ok=True)

def parallelize(fun, vec, pool):
    """Apply ``fun`` to every item of ``vec`` using a process pool.

    Parameters
    ----------
    fun : callable
        Function handed to ``Pool.map``; called once per item of ``vec``.
    vec : iterable
        Items to process.
    pool
        Passed as the first argument of ``mp.Pool`` (the worker count).

    Returns
    -------
    The list produced by ``Pool.map(fun, vec)``.
    """
    # The pool is torn down when the ``with`` block exits, after map() is done.
    with mp.Pool(pool) as workers:
        return workers.map(fun, vec)

def _genData(x, dataset):
    """Write one part of ``dataset`` as both a CSV and a Parquet file.

    Parameters
    ----------
    x : sequence
        ``[index, key]`` pair. ``x[0]`` is the 0-based part index and ``x[1]``
        is the value stored in the ``key`` column of every row.
    dataset : str
        Sub-folder name under ``data/csv`` and ``data/parq`` (e.g. "atmc10").

    Notes
    -----
    Reads the module-level ``N`` (rows per part), which must be defined before
    this function is called. The ``values`` column holds the integers
    ``x[0]*N`` up to, but not including, ``(x[0]+1)*N``. Files are named
    ``partNNNNN`` using the 1-based, zero-padded part index.
    """
    start = x[0] * N
    df = pd.DataFrame({"key": x[1],
                       "values": np.arange(start, start + N)})
    fn = "part{:05}".format(x[0] + 1)
    df.to_csv(os.path.join("data/csv", dataset, fn + ".csv"), index=False)
    df.to_parquet(os.path.join("data/parq", dataset, fn + ".parq"))


def genData10(x):
    """Write part ``x`` of the atmc10 dataset (see ``_genData``)."""
    _genData(x, "atmc10")


def genData50(x):
    """Write part ``x`` of the atmc50 dataset (see ``_genData``)."""
    _genData(x, "atmc50")

In [2]:
# Size of the worker pool: the CPU count reported by the OS.
ncpu = os.cpu_count()
# Number of rows written to each generated part file.
N = 1000

In [3]:
%%time
# One job per part file: [0-based part index, key label "A00001".."A10000"].
lst = [[part, "A{:05}".format(part + 1)] for part in range(10 ** 4)]
# genData10 returns nothing, so `out` is only a list of None placeholders.
out = parallelize(genData10, lst, ncpu)

CPU times: user 44.7 ms, sys: 113 ms, total: 158 ms
Wall time: 8.76 s


In [4]:
%%time
# One job per part file: [0-based part index, key label "A00001".."A50000"].
lst = [[part, "A{:05}".format(part + 1)] for part in range(5 * 10 ** 4)]
# genData50 returns nothing, so `out` is only a list of None placeholders.
out = parallelize(genData50, lst, ncpu)

CPU times: user 160 ms, sys: 96.2 ms, total: 257 ms
Wall time: 40 s


# In One File

In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [2]:
# Start a local dask.distributed cluster with default settings and connect a
# client to it.
cluster = LocalCluster()
client = Client(cluster)
# Bare expression: shows the client/cluster summary as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:33203  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 16  Cores: 16  Memory: 63.32 GB


In [3]:
# Folders holding the per-part files, indexed as:
#   0: CSV  atmc10    1: Parquet atmc10
#   2: CSV  atmc50    3: Parquet atmc50
fldrs = [
    "data/csv/atmc10",
    "data/parq/atmc10",
    "data/csv/atmc50",
    "data/parq/atmc50",
]

## 10M

In [7]:
%%time
# Build a dask DataFrame over every file in the atmc10 Parquet folder
# (fldrs[1]); the data is pulled into memory later by .compute().
df = dd.read_parquet(fldrs[1]+"/*")

CPU times: user 3.4 s, sys: 308 ms, total: 3.71 s
Wall time: 3.72 s


In [8]:
%%time
# Materialise the dask DataFrame; `df` is rebound to the computed result
# (a pandas DataFrame), so the dask object is no longer reachable by name.
df = df.compute()

CPU times: user 1min 17s, sys: 41.1 s, total: 1min 58s
Wall time: 1min 2s


In [9]:
%%time
# Save the combined, not-yet-shuffled frame as a single Parquet file and a
# single CSV file ("SRTD" presumably stands for "sorted" -- confirm).
df.to_parquet("data/file10SRTD.parq")
df.to_csv("data/file10SRTD.csv", index=False)

CPU times: user 22.4 s, sys: 879 ms, total: 23.2 s
Wall time: 21 s


In [10]:
%%time
# Shuffle all rows (frac=1 draws every row, without replacement) and renumber
# the index from 0. The fixed random_state makes the shuffled output files
# reproducible across re-runs of the notebook.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

CPU times: user 9.64 s, sys: 0 ns, total: 9.64 s
Wall time: 7.67 s


In [11]:
%%time
# Save the shuffled frame as a single Parquet file and a single CSV file
# ("SHFFL" presumably stands for "shuffled" -- confirm).
df.to_parquet("data/file10SHFFL.parq")
df.to_csv("data/file10SHFFL.csv", index=False)

CPU times: user 32.6 s, sys: 0 ns, total: 32.6 s
Wall time: 30.8 s


## 50M

In [4]:
# Restart the dask client's cluster before loading the larger dataset
# (presumably to start from clean workers -- confirm intent).
# NOTE(review): execution counts are non-sequential from here on (In [4]
# follows In [11]); verify the notebook with Restart Kernel -> Run All.
client.restart()

0,1
Client  Scheduler: tcp://127.0.0.1:33203  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 16  Cores: 16  Memory: 63.32 GB


In [5]:
%%time
# Build a dask DataFrame over every file in the atmc50 Parquet folder
# (fldrs[3]); the data is pulled into memory later by .compute().
df = dd.read_parquet(fldrs[3]+"/*")

CPU times: user 21.8 s, sys: 3.57 s, total: 25.3 s
Wall time: 22.7 s


In [6]:
%%time
# Materialise the dask DataFrame; `df` is rebound to the computed result
# (a pandas DataFrame), so the dask object is no longer reachable by name.
df = df.compute()

CPU times: user 1min 42s, sys: 17.3 s, total: 2min
Wall time: 1min 53s


In [7]:
%%time
# Save the combined, not-yet-shuffled frame as a single Parquet file and a
# single CSV file ("SRTD" presumably stands for "sorted" -- confirm).
df.to_parquet("data/file50SRTD.parq")
df.to_csv("data/file50SRTD.csv", index=False)

CPU times: user 1min 55s, sys: 11.1 s, total: 2min 6s
Wall time: 1min 53s


In [8]:
%%time
# Shuffle all rows (frac=1 draws every row, without replacement) and renumber
# the index from 0. The fixed random_state makes the shuffled output files
# reproducible across re-runs of the notebook.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

CPU times: user 47.4 s, sys: 2.1 s, total: 49.5 s
Wall time: 46.8 s


In [9]:
%%time
# Save the shuffled frame as a single Parquet file and a single CSV file
# ("SHFFL" presumably stands for "shuffled" -- confirm).
df.to_parquet("data/file50SHFFL.parq")
df.to_csv("data/file50SHFFL.csv", index=False)

CPU times: user 3min 3s, sys: 7.46 s, total: 3min 11s
Wall time: 2min 59s


# Chunks

In [1]:
import pandas as pd
import os

In [2]:
# Output folders for the split files: one CSV folder and one Parquet folder
# for each of the two datasets (split10 and split50).
dataDirs = ["data/csv/split10", "data/parq/split10",
            "data/csv/split50", "data/parq/split50"]

for dataDir in dataDirs:
    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(dataDir, exist_ok=True)

In [3]:
%%time
# Split data/file10SRTD.csv into `chunks` contiguous parts and write each part
# as CSV (dataDirs[0]) and Parquet (dataDirs[1]).
df = pd.read_csv("data/file10SRTD.csv")
chunks = 8
N = len(df) // chunks  # rows per part; the last part also takes any remainder
rows_written = 0
for i in range(chunks):
    # Positional slices exclude their stop, so the stop is (i+1)*N; using
    # (i+1)*N-1 would silently drop the last row of every part.
    stop = (i + 1) * N if i < chunks - 1 else len(df)
    out = df.iloc[i * N:stop].reset_index(drop=True)
    fn1 = os.path.join(dataDirs[0], "part{:02}.csv".format(i+1))
    fn2 = os.path.join(dataDirs[1], "part{:02}.parq".format(i+1))
    out.to_csv(fn1, index=False)
    out.to_parquet(fn2)
    rows_written += len(out)
# Every input row must land in exactly one part.
assert rows_written == len(df)

CPU times: user 47.9 s, sys: 1.94 s, total: 49.9 s
Wall time: 24.8 s


In [4]:
%%time
# Split data/file50SRTD.csv into `chunks` contiguous parts and write each part
# as CSV (dataDirs[2]) and Parquet (dataDirs[3]).
df = pd.read_csv("data/file50SRTD.csv")
chunks = 8
N = len(df) // chunks  # rows per part; the last part also takes any remainder
rows_written = 0
for i in range(chunks):
    # Positional slices exclude their stop, so the stop is (i+1)*N; using
    # (i+1)*N-1 would silently drop the last row of every part.
    stop = (i + 1) * N if i < chunks - 1 else len(df)
    out = df.iloc[i * N:stop].reset_index(drop=True)
    fn1 = os.path.join(dataDirs[2], "part{:02}.csv".format(i+1))
    fn2 = os.path.join(dataDirs[3], "part{:02}.parq".format(i+1))
    out.to_csv(fn1, index=False)
    out.to_parquet(fn2)
    rows_written += len(out)
# Every input row must land in exactly one part.
assert rows_written == len(df)

CPU times: user 2min 25s, sys: 9.36 s, total: 2min 34s
Wall time: 2min 5s
