In [1]:
import platform
import multiprocessing

In [2]:
platform.mac_ver()

('10.14.6', ('', '', ''), 'x86_64')

In [3]:
platform.processor()

'i386'

In [4]:
platform.python_version()

'3.7.5'

In [5]:
multiprocessing.cpu_count()

8

### Pandas Apply vs list comprehension

In [6]:
import string
import numpy as np
import pandas as pd

In [7]:
# NOTE(review): `choices` is not referenced by any other visible cell; it looks
# like a leftover experiment with `randint` — confirm and remove if unused.
choices = np.random.randint(1, high=50, size=1)

In [8]:
# generating random text
def genereate_random_text(n: int, seed=None) -> pd.DataFrame:
    """Build a DataFrame of ``n`` random strings drawn from ``string.printable``.

    Each string gets a random length between 1 and 49 characters (the upper
    bound of 50 passed to ``randint`` is exclusive), and its characters are
    sampled with replacement from the printable ASCII set.

    Parameters
    ----------
    n : int
        Number of rows to generate. ``n <= 0`` yields an empty frame.
    seed : int, optional
        When given, a private ``np.random.RandomState(seed)`` is used, so the
        same seed always produces the same frame. When omitted, the global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    pd.DataFrame
        A frame with a single column named ``random_text``.
    """
    # Fall back to the module-level generator so existing calls behave the same.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Convert once up front; passing a list would make ``choice`` rebuild the
    # array on every iteration.
    printables = np.array(list(string.printable))
    text = []
    for _ in range(n):
        number_of_choices = rng.randint(1, high=50)
        text.append("".join(rng.choice(printables, size=number_of_choices)))
    return pd.DataFrame(text, columns=['random_text'])

In [9]:
def digit_ratio(string: str) -> float:
    """Return the fraction of non-space characters in ``string`` that are digits.

    Only the space character ``' '`` is removed before counting; other
    whitespace such as tabs and newlines still counts towards the length.
    Digits are detected with ``str.isdigit``, which also accepts non-ASCII
    digit characters.

    Parameters
    ----------
    string : str
        Text to inspect. The name shadows the ``string`` module inside this
        function only; it is kept for compatibility with existing callers.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``; ``0.0`` when nothing is left after
        removing spaces.
    """
    stripped = string.replace(' ', '')
    if not stripped:
        # Return a float here too, so the result type does not depend on the input.
        return 0.0
    return sum(c.isdigit() for c in stripped) / len(stripped)

* Case 1: n = 100

In [10]:
# Benchmark input for this case: 100 random strings. `df` is rebound for each
# later case, so the timing cells below always measure the most recent frame.
df = genereate_random_text(100)

In [11]:
df.head()

Unnamed: 0,random_text
0,mCo2Lp*Ce8x#oL: 42|\r9S>\rsYBijtn_
1,4W0]W\r
2,ik'\t
3,3X}4y[fXJHLM!GQR|2IKkh\nYR-^5
4,"wg ,t`fGBoQIIC19'va]k*/(G"


In [12]:
%%timeit -r 10
# Variant A: Series.apply calls digit_ratio once per element and returns a Series.
df['random_text'].apply(digit_ratio)

445 µs ± 37.4 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


In [13]:
%%timeit -r 10
# Variant B: plain list comprehension over the column's underlying array
# (.values); the result is a Python list, not a Series.
[digit_ratio(x) for x in df['random_text'].values]

266 µs ± 2.83 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


In [14]:
%%timeit -r 10
# Variant C: strip spaces through the .str accessor first, then apply a lambda
# that repeats digit_ratio's empty-check and ratio computation inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

564 µs ± 9.14 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


* Case 2: n = 1000

In [15]:
# Benchmark input for this case: 1,000 random strings (replaces the previous `df`).
df = genereate_random_text(1000)

In [16]:
df.head()

Unnamed: 0,random_text
0,#;\ro&C!-=
1,}r-K
2,".^~VA}|yM2g!\t]T`$,kf1x>"
3,l3]_A_K8dPk\tJnM:d0I&5!Ml
4,x5koEv'^!*?r59/:v Nkj/M6D@


In [17]:
%%timeit -r 10
# Variant A: Series.apply calls digit_ratio once per element and returns a Series.
df['random_text'].apply(digit_ratio)

3.08 ms ± 251 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [18]:
%%timeit -r 10
# Variant B: plain list comprehension over the column's underlying array
# (.values); the result is a Python list, not a Series.
[digit_ratio(x) for x in df['random_text'].values]

2.84 ms ± 133 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [19]:
%%timeit -r 10
# Variant C: strip spaces through the .str accessor first, then apply a lambda
# that repeats digit_ratio's empty-check and ratio computation inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

3.43 ms ± 254 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


* Case 3: n = 10000

In [20]:
# Benchmark input for this case: 10,000 random strings (replaces the previous `df`).
df = genereate_random_text(10000)

In [21]:
df.head()

Unnamed: 0,random_text
0,"""KcHo]-ZmuQBQMD\0\X%I;rIZp<nFI>;y0B1bTxp""f7m5dO\t"
1,MbyAc@zgq:w3\nH|@=_eL
2,`=h$c$V\tbKSa+G.:MWdM7Xv
3,"2mA3|{7vD $Rt\7h,9F_7x2MVM,snlL^#/#w8:"
4,"2?""+Cr:h7""-7p-,-A9J""yYFtvQ?zT~"


In [22]:
%%timeit -r 10
# Variant A: Series.apply calls digit_ratio once per element and returns a Series.
df['random_text'].apply(digit_ratio)

29.5 ms ± 1.29 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [23]:
%%timeit -r 10
# Variant B: plain list comprehension over the column's underlying array
# (.values); the result is a Python list, not a Series.
[digit_ratio(x) for x in df['random_text'].values]

27.9 ms ± 307 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [24]:
%%timeit -r 10
# Variant C: strip spaces through the .str accessor first, then apply a lambda
# that repeats digit_ratio's empty-check and ratio computation inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

30.4 ms ± 2.33 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


* Case 4: n = 100000

In [25]:
# Benchmark input for this case: 100,000 random strings (replaces the previous `df`).
df = genereate_random_text(100000)

In [26]:
df.head()

Unnamed: 0,random_text
0,\tN-b^zb#&p
1,^\rbpGt=\%OY^hV5XNwwz](5j
2,b/W%
3,i&HGL'`*tg?AsT:ca|4%zB2De]0SziNqe+\rW=X{7Q#'~
4,Mu65j\n.\vFE[\nb^1


In [27]:
%%timeit -r 10
# Variant A: Series.apply calls digit_ratio once per element and returns a Series.
df['random_text'].apply(digit_ratio)

297 ms ± 8.64 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [28]:
%%timeit -r 10
# Variant B: plain list comprehension over the column's underlying array
# (.values); the result is a Python list, not a Series.
[digit_ratio(x) for x in df['random_text'].values]

287 ms ± 6.12 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [29]:
%%timeit -r 10
# Variant C: strip spaces through the .str accessor first, then apply a lambda
# that repeats digit_ratio's empty-check and ratio computation inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

304 ms ± 3.74 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
