In [7]:
import pandas as pd
import numpy as np
import swifter
from datetime import datetime, timedelta

In [11]:
# Number of rows in the benchmark frame; the timing cells below all operate on it.
size = 1_000_000

# Seeded generator so the synthetic columns are identical on every fresh run.
rng = np.random.default_rng(42)

# Synthetic frame: one timestamp per second starting at the current time,
# plus one standard-normal and one exponential random column.
df = pd.DataFrame(
    {
        # Lowercase "s" is the seconds alias; uppercase "S" is deprecated in pandas 2.2+.
        "datetime": pd.date_range(start=datetime.now(), periods=size, freq="s"),
        "normal": rng.normal(size=size),
        "exponential": rng.exponential(size=size),
    }
)

In [12]:
# Preview the first five rows of the generated frame.
df.head(n=5)

Unnamed: 0,datetime,normal,exponential
0,2022-06-30 18:02:46.291909,0.455599,0.733794
1,2022-06-30 18:02:47.291909,-0.185953,0.407413
2,2022-06-30 18:02:48.291909,-1.476331,1.473612
3,2022-06-30 18:02:49.291909,-0.179441,1.005457
4,2022-06-30 18:02:50.291909,2.213809,0.46494


# Vectorized when possible

In [44]:
%time df["normal_squared"] = df.normal.apply(lambda x: x**2)

CPU times: user 288 ms, sys: 25.1 ms, total: 314 ms
Wall time: 312 ms


In [45]:
%time df["normal_squared"] = df.normal.swifter.apply(lambda x: x**2)

CPU times: user 115 ms, sys: 14 ms, total: 129 ms
Wall time: 126 ms


# Parallelized when it's efficient, with progress bar

In [17]:
%time df["bigger_value"] = df[["normal", "exponential"]].apply(lambda x: x.normal if x.normal > x.exponential else x.exponential, axis=1)

CPU times: user 25 s, sys: 137 ms, total: 25.2 s
Wall time: 25.2 s


In [21]:
%time df["bigger_value"] = df[["normal", "exponential"]].swifter.apply(lambda x: x.normal if x.normal > x.exponential else x.exponential, axis=1)

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

CPU times: user 406 ms, sys: 151 ms, total: 557 ms
Wall time: 9.9 s


## Benefits from using a vectorized strategy

In [42]:
%time df["bigger_value"] = df[["normal", "exponential"]].apply(lambda x: np.where(x.normal > x.exponential, x.normal, x.exponential), axis=1)

CPU times: user 39.7 s, sys: 251 ms, total: 40 s
Wall time: 40 s


In [41]:
%time df["bigger_value"] = df[["normal", "exponential"]].swifter.apply(lambda x: np.where(x.normal > x.exponential, x.normal, x.exponential), axis=1)

CPU times: user 191 ms, sys: 77.2 ms, total: 268 ms
Wall time: 266 ms


# Single-threaded for small datasets, with progress bar

In [28]:
%time df["smaller_value"] = df.loc[:10000, ["normal", "exponential"]].apply(lambda x: x.normal if x.normal < x.exponential else x.exponential, axis=1)

CPU times: user 290 ms, sys: 1.18 ms, total: 291 ms
Wall time: 291 ms


In [27]:
%time df["smaller_value"] = df.loc[:10000, ["normal", "exponential"]].swifter.apply(lambda x: x.normal if x.normal < x.exponential else x.exponential, axis=1)

Pandas Apply:   0%|          | 0/10001 [00:00<?, ?it/s]

CPU times: user 398 ms, sys: 5.16 ms, total: 404 ms
Wall time: 398 ms
