In [1]:
import pandas as pd
import numpy as np
import swifter
from datetime import datetime, timedelta

In [2]:
# Synthetic benchmark frame: one timestamp per second plus two random columns.
SEED = 42  # fixed seed so the random columns are identical on every Restart & Run All
size = 1000000
rng = np.random.default_rng(SEED)
df = pd.DataFrame(
    {
        # Lowercase "s" is the seconds alias; the uppercase "S" spelling is
        # deprecated in pandas >= 2.2. The start is the current wall-clock time,
        # so this column (unused by the benchmarks below) still differs between runs.
        "datetime": pd.date_range(start=datetime.now(), periods=size, freq="s"),
        "normal": rng.normal(size=size),
        "exponential": rng.exponential(size=size),
    }
)

In [3]:
# Preview the first five rows of the freshly generated frame.
df.head(n=5)

Unnamed: 0,datetime,normal,exponential
0,2022-06-30 18:22:35.461518,-0.703782,1.12503
1,2022-06-30 18:22:36.461518,0.069071,0.094912
2,2022-06-30 18:22:37.461518,-2.295349,1.585564
3,2022-06-30 18:22:38.461518,-1.687109,1.73779
4,2022-06-30 18:22:39.461518,-0.359347,1.028871


# Vectorized when possible

In [4]:
%time df["normal_squared_pandas"] = df.normal.apply(lambda x: x**2)

CPU times: user 239 ms, sys: 31 ms, total: 270 ms
Wall time: 269 ms


In [5]:
%time df["normal_squared_swifter"] = df.normal.swifter.apply(lambda x: x**2)

Pandas Apply:   0%|          | 0/1000000 [00:00<?, ?it/s]

CPU times: user 1.46 s, sys: 63.6 ms, total: 1.53 s
Wall time: 1.51 s


# Parallelized when it's efficient, with progress bar

In [6]:
%time df["bigger_value_pandas_ifelse"] = df[["normal", "exponential"]].apply(lambda x: x.normal if x.normal > x.exponential else x.exponential, axis=1)

CPU times: user 25.1 s, sys: 125 ms, total: 25.2 s
Wall time: 25.2 s


In [7]:
%time df["bigger_value_swifter_ifelse"] = df[["normal", "exponential"]].swifter.apply(lambda x: x.normal if x.normal > x.exponential else x.exponential, axis=1)

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

CPU times: user 375 ms, sys: 104 ms, total: 479 ms
Wall time: 9.5 s


## Benefits of using a vectorized strategy

In [8]:
%time df["bigger_value_pandas_where"] = df[["normal", "exponential"]].apply(lambda x: np.where(x.normal > x.exponential, x.normal, x.exponential), axis=1)

CPU times: user 36.7 s, sys: 200 ms, total: 36.9 s
Wall time: 36.9 s


In [9]:
%time df["bigger_value_swifter_where"] = df[["normal", "exponential"]].swifter.apply(lambda x: np.where(x.normal > x.exponential, x.normal, x.exponential), axis=1)

CPU times: user 170 ms, sys: 1.02 ms, total: 171 ms
Wall time: 169 ms


# Single-threaded for small datasets, with progress bar

In [10]:
%time df["smaller_value_pandas_ifelse"] = df.loc[:10000, ["normal", "exponential"]].apply(lambda x: x.normal if x.normal < x.exponential else x.exponential, axis=1)

CPU times: user 263 ms, sys: 4.96 ms, total: 268 ms
Wall time: 267 ms


In [11]:
%time df["smaller_value_swifter_ifelse"] = df.loc[:10000, ["normal", "exponential"]].swifter.apply(lambda x: x.normal if x.normal < x.exponential else x.exponential, axis=1)

Pandas Apply:   0%|          | 0/10001 [00:00<?, ?it/s]

CPU times: user 375 ms, sys: 9.16 ms, total: 384 ms
Wall time: 378 ms


# Review Data


In [12]:
# Show the first five rows with every result column added by the cells above,
# so the pandas and swifter outputs can be compared side by side.
df.head(n=5)

Unnamed: 0,datetime,normal,exponential,normal_squared_pandas,normal_squared_swifter,bigger_value_pandas_ifelse,bigger_value_swifter_ifelse,bigger_value_pandas_where,bigger_value_swifter_where,smaller_value_pandas_ifelse,smaller_value_swifter_ifelse
0,2022-06-30 18:22:35.461518,-0.703782,1.12503,0.495309,0.495309,1.12503,1.12503,1.125029595282032,1.12503,-0.703782,-0.703782
1,2022-06-30 18:22:36.461518,0.069071,0.094912,0.004771,0.004771,0.094912,0.094912,0.0949118804318317,0.094912,0.069071,0.069071
2,2022-06-30 18:22:37.461518,-2.295349,1.585564,5.268628,5.268628,1.585564,1.585564,1.5855638407926365,1.585564,-2.295349,-2.295349
3,2022-06-30 18:22:38.461518,-1.687109,1.73779,2.846336,2.846336,1.73779,1.73779,1.7377900617987223,1.73779,-1.687109,-1.687109
4,2022-06-30 18:22:39.461518,-0.359347,1.028871,0.12913,0.12913,1.028871,1.028871,1.028871349233043,1.028871,-0.359347,-0.359347
