# MultiProcessing

In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import os
import gc

In [2]:
# Input directories; the benchmark cells below select one by position.
fldrs = [
    "data/csv/atmc10",   # [0] read with funCSV
    "data/parq/atmc10",  # [1] read with funPARQ
    "data/csv/atmc50",   # [2] read with funCSV
    "data/parq/atmc50",  # [3] read with funPARQ
]

In [3]:
def parallelize(fun, vec, pool):
    """Apply ``fun`` to every element of ``vec`` with a pool of worker processes.

    Parameters
    ----------
    fun : callable
        One-argument function executed in the workers.
    vec : iterable
        Inputs; each element is handed to ``fun`` as its single argument.
    pool : int
        Number of worker processes, passed straight to ``mp.Pool``.

    Returns
    -------
    list
        The values returned by ``fun``, in the same order as ``vec``.
    """
    # ``map`` blocks until every result is back, so leaving the ``with``
    # block (which shuts the workers down) cannot lose pending work.
    with mp.Pool(pool) as workers:
        return workers.map(fun, vec)


def funCSV(x):
    """Summarise one CSV file as ``[key, mean]``.

    Parameters
    ----------
    x : str
        Path of a CSV file that has ``key`` and ``values`` columns.

    Returns
    -------
    list
        Two items: the first distinct value of the ``key`` column, and the
        mean of the ``values`` column over the whole file.
    """
    frame = pd.read_csv(x)
    # Only the first distinct key is reported; any further keys in the file
    # are ignored (they still contribute to the mean).
    first_key = frame["key"].unique()[0]
    mean_value = frame["values"].mean()
    return [first_key, mean_value]


def funPARQ(x):
    """Summarise one parquet file as ``[key, mean]``.

    Parameters
    ----------
    x : str
        Path of a parquet file that has ``key`` and ``values`` columns.

    Returns
    -------
    list
        Two items: the first distinct value of the ``key`` column, and the
        mean of the ``values`` column over the whole file.
    """
    frame = pd.read_parquet(x)
    # Only the first distinct key is reported; any further keys in the file
    # are ignored (they still contribute to the mean).
    first_key = frame["key"].unique()[0]
    mean_value = frame["values"].mean()
    return [first_key, mean_value]

# os.cpu_count() returns None when the number of CPUs cannot be determined.
# Fall back to 1 so ncpu is always an int and expressions such as ncpu // 2
# do not raise TypeError.
ncpu = os.cpu_count() or 1

In [11]:
# Display the CPU count reported by os.cpu_count(); the benchmark cells
# below size their worker pools as ncpu or ncpu // 2.
ncpu

16

## csv

In [5]:
%%time
fldr = fldrs[0]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funCSV, files, ncpu//2)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 68.5 ms, sys: 45.6 ms, total: 114 ms
Wall time: 3.09 s


In [8]:
%%time
fldr = fldrs[0]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funCSV, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 80.2 ms, sys: 78.9 ms, total: 159 ms
Wall time: 2.43 s


In [9]:
%%time
fldr = fldrs[2]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funCSV, files, ncpu//2)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] = out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 240 ms, sys: 82.8 ms, total: 323 ms
Wall time: 15.4 s


In [10]:
%%time
fldr = fldrs[2]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funCSV, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] = out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 332 ms, sys: 120 ms, total: 451 ms
Wall time: 11.2 s


## parquet

In [12]:
%%time
fldr = fldrs[1]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funPARQ, files, ncpu//2)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 49.1 ms, sys: 63.2 ms, total: 112 ms
Wall time: 4.5 s


In [13]:
%%time
fldr = fldrs[1]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funPARQ, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 75.5 ms, sys: 96.7 ms, total: 172 ms
Wall time: 3.25 s


In [14]:
%%time
fldr = fldrs[3]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funPARQ, files, ncpu//2)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 287 ms, sys: 89.7 ms, total: 376 ms
Wall time: 21.9 s


In [15]:
%%time
fldr = fldrs[3]
files = [os.path.join(fldr, x) for x in os.listdir(fldr)]
out = parallelize(funPARQ, files, ncpu)
out = pd.DataFrame(out, columns=["key","values"])
out["key"] =  out["key"].astype(str)
out = out.sort_values("key").set_index("key")

CPU times: user 270 ms, sys: 120 ms, total: 390 ms
Wall time: 15.1 s


# Pandas

In [1]:
import pandas as pd

## 10M

### csv

In [2]:
%%time
df = pd.read_csv("data/file10SRTD.csv")
out = df.groupby("key")["values"].mean()

CPU times: user 4.03 s, sys: 631 ms, total: 4.66 s
Wall time: 3.02 s


In [2]:
%%time
df = pd.read_csv("data/file10SHFFL.csv")
out = df.groupby("key")["values"].mean()

CPU times: user 4.94 s, sys: 625 ms, total: 5.56 s
Wall time: 3.94 s


### parquet

In [2]:
%%time
df = pd.read_parquet("data/file10SRTD.parq")
out = df.groupby("key")["values"].mean()

CPU times: user 3.18 s, sys: 949 ms, total: 4.13 s
Wall time: 2.59 s


In [2]:
%%time
df = pd.read_parquet("data/file10SHFFL.parq")
out = df.groupby("key")["values"].mean()

CPU times: user 3.45 s, sys: 1.08 s, total: 4.53 s
Wall time: 2.99 s


## 50M

### csv

In [2]:
%%time
df = pd.read_csv("data/file50SRTD.csv")
out = df.groupby("key")["values"].mean()

CPU times: user 14.5 s, sys: 2.6 s, total: 17.1 s
Wall time: 15.2 s


In [2]:
%%time
df = pd.read_csv("data/file50SHFFL.csv")
out = df.groupby("key")["values"].mean()

CPU times: user 33.2 s, sys: 2.7 s, total: 35.9 s
Wall time: 34 s


### parquet

In [3]:
%%time
df = pd.read_parquet("data/file50SRTD.parq")
out = df.groupby("key")["values"].mean()

CPU times: user 9.88 s, sys: 5.7 s, total: 15.6 s
Wall time: 13.3 s


In [2]:
%%time
df = pd.read_parquet("data/file50SHFFL.parq")
out = df.groupby("key")["values"].mean()

CPU times: user 15.3 s, sys: 5.37 s, total: 20.7 s
Wall time: 18.5 s


# Dask

In [1]:
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

In [5]:
# Create a LocalCluster with its default settings and attach a Client to it.
cluster = LocalCluster()
client = Client(cluster)
# Bare last expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:34603  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.52 GB


In [2]:
# Input directories, listed as CSV then parquet for each of atmc10 and atmc50.
fldrs = [
    "data/csv/atmc10",   # [0]
    "data/parq/atmc10",  # [1]
    "data/csv/atmc50",   # [2]
    "data/parq/atmc50",  # [3]
]