# MultiProcessing

In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import os
import gc

In [2]:
# Input directories, interleaved by storage format: even positions are the
# CSV datasets, odd positions the Parquet ones. The benchmark cells below
# select a directory by position, so the order matters.
fldrs = [
    "data/csv/atmc10",   # [0]
    "data/parq/atmc10",  # [1]
    "data/csv/atmc50",   # [2]
    "data/parq/atmc50",  # [3]
]

In [3]:
def parallelize(fun, vec, pool):
    """Apply ``fun`` to every element of ``vec`` using a process pool.

    Parameters
    ----------
    fun : callable
        One-argument function executed in the worker processes.
    vec : iterable
        Items to process; each one is passed to ``fun`` on its own.
    pool : int or None
        Number of worker processes handed to ``mp.Pool``.

    Returns
    -------
    list
        The results of ``fun`` in the same order as the items of ``vec``.
    """
    # ``map`` blocks until every task has finished, so returning from inside
    # the ``with`` block is safe: the pool is torn down on exit either way.
    with mp.Pool(processes=pool) as workers:
        return workers.map(fun, vec)


def funCSV(x):
    """Summarise a single CSV file as ``[key, mean of values]``.

    Parameters
    ----------
    x : str or path-like
        Location of a CSV file containing ``key`` and ``values`` columns;
        passed unchanged to ``pd.read_csv``.

    Returns
    -------
    list
        Two items: the first distinct entry of the ``key`` column and the
        mean of the ``values`` column.
    """
    frame = pd.read_csv(x)
    # Only the first distinct key is kept; presumably each file holds a
    # single key -- confirm against the code that writes these files.
    first_key = frame["key"].unique()[0]
    mean_value = frame["values"].mean()
    return [first_key, mean_value]


def funPARQ(x):
    """Summarise a single Parquet file as ``[key, mean of values]``.

    Parameters
    ----------
    x : str or path-like
        Location of a Parquet file containing ``key`` and ``values``
        columns; passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    list
        Two items: the first distinct entry of the ``key`` column and the
        mean of the ``values`` column.
    """
    frame = pd.read_parquet(x)
    # Only the first distinct key is kept; presumably each file holds a
    # single key -- confirm against the code that writes these files.
    first_key = frame["key"].unique()[0]
    mean_value = frame["values"].mean()
    return [first_key, mean_value]

# Pool size passed to parallelize() by the benchmark cells below: the CPU
# count reported by the operating system.
ncpu = os.cpu_count()

## csv

In [4]:
%%time
fldr = fldrs[0]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funCSV, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 74.6 ms, sys: 25.3 ms, total: 99.9 ms
Wall time: 12.6 s


In [4]:
%%time
fldr = fldrs[2]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funCSV, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] = out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 316 ms, sys: 87.7 ms, total: 404 ms
Wall time: 1min 1s


## parquet

In [4]:
%%time
fldr = fldrs[1]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funPARQ, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 80.6 ms, sys: 40.7 ms, total: 121 ms
Wall time: 15 s


In [4]:
%%time
fldr = fldrs[3]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funPARQ, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 357 ms, sys: 61.8 ms, total: 418 ms
Wall time: 1min 11s


# Pandas

In [1]:
import pandas as pd
import numpy as np

## 10M

In [2]:
# Load the whole CSV into memory outside the timed cell, so only the groupby
# is measured. NOTE(review): "SRTD" presumably means the rows are sorted by
# key -- confirm against the script that generated the file.
df = pd.read_csv("data/file10SRTD.csv")

In [3]:
%%time
# Timed step: mean of "values" for each distinct "key" in the loaded frame.
out = df.groupby("key")["values"].mean()

CPU times: user 579 ms, sys: 127 ms, total: 707 ms
Wall time: 746 ms


In [6]:
# Rebinds df to a second input file, again loaded outside the timed cell.
# NOTE(review): "SHFFL" presumably means the same rows in shuffled order --
# confirm against the script that generated the file.
df = pd.read_csv("data/file10SHFFL.csv")

In [7]:
%%time
# Timed step: mean of "values" for each distinct "key" in the loaded frame.
out = df.groupby("key")["values"].mean()

CPU times: user 913 ms, sys: 43.8 ms, total: 957 ms
Wall time: 956 ms


# Dask

In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [5]:
# Start a local Dask cluster with default settings and attach a client to it.
cluster = LocalCluster()
client = Client(cluster)
# Bare expression: the notebook renders the client summary as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:34603  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.52 GB


In [2]:
# Input directories, interleaved by storage format: even positions are the
# CSV datasets, odd positions the Parquet ones.
fldrs = [
    "data/csv/atmc10",   # [0]
    "data/parq/atmc10",  # [1]
    "data/csv/atmc50",   # [2]
    "data/parq/atmc50",  # [3]
]