In [1]:
import platform
import multiprocessing

In [2]:
platform.mac_ver()

('10.14.6', ('', '', ''), 'x86_64')

In [3]:
platform.processor()

'i386'

In [4]:
platform.python_version()

'3.7.5'

In [5]:
multiprocessing.cpu_count()

8

### Pandas Apply vs list comprehension

In [6]:
import string
import numpy as np
import pandas as pd

In [7]:
# NOTE(review): `choices` is not read by any later cell visible in this notebook;
# genereate_random_text draws its own string lengths. Looks like a leftover -
# confirm nothing else depends on it before removing.
choices = np.random.randint(1, high=50, size=1)

In [8]:
# generating random text
def genereate_random_text(n: int, seed=None) -> pd.DataFrame:
    """Build a one-column DataFrame holding ``n`` random printable strings.

    Every row is a string of 1 to 49 characters, each drawn independently
    from ``string.printable``; rows can therefore contain whitespace such as
    spaces, tabs and newlines.

    Parameters
    ----------
    n : int
        Number of rows to generate. ``n <= 0`` produces an empty frame.
    seed : int, optional
        When given, sampling is driven by a private
        ``np.random.RandomState(seed)`` so repeated calls return the same
        frame. When ``None`` (the default) the global ``np.random`` state is
        used, exactly as before this parameter existed.

    Returns
    -------
    pd.DataFrame
        A frame with a single ``random_text`` column of ``str`` values.
    """
    # The misspelled function name is kept on purpose: other cells call it.
    # RandomState exposes the same randint/choice API as the np.random module,
    # so both paths share the sampling code below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    printables = list(string.printable)
    text = []
    for _ in range(n):
        # randint's upper bound is exclusive, so lengths fall in 1..49.
        number_of_choices = rng.randint(1, high=50, size=1)[0]
        text.append("".join(rng.choice(printables, size=number_of_choices)))
    return pd.DataFrame(text, columns=['random_text'])

In [9]:
def digit_ratio(string):
    """Return the share of digit characters in ``string``, ignoring spaces.

    Only the space character ``' '`` is removed before counting; tabs,
    newlines and other whitespace still count toward the length. When
    nothing is left after removing spaces the result is ``0``.
    """
    without_spaces = string.replace(' ', '')
    total = len(without_spaces)
    if total == 0:
        return 0
    digit_count = 0
    for char in without_spaces:
        if char.isdigit():
            digit_count += 1
    return digit_count / total

* Case 1: n = 100

In [10]:
# Build the 100-row benchmark frame; the timing cells read it through the name `df`.
df = genereate_random_text(100)

In [11]:
df.head()

Unnamed: 0,random_text
0,"(+ADic;bQ6p,W<@o<EO[2hzrv"
1,8
2,Oz'Hr6r
3,"I?W.4QWE,iA#M>h{ K=M5V&ta%?2{c&m@"
4,o~}#C3Mk6z


In [12]:
%%timeit -r 10
# Series.apply: pandas calls digit_ratio once per element of the column.
df['random_text'].apply(digit_ratio)

403 µs ± 23.4 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


In [13]:
%%timeit -r 10
# List comprehension over the column's underlying array (.values) instead of the Series.
[digit_ratio(x) for x in df['random_text'].values]

278 µs ± 12.2 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


In [14]:
%%timeit -r 10
# List comprehension iterating the Series object directly.
[digit_ratio(x) for x in df['random_text']]

278 µs ± 4.56 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


In [15]:
%%timeit -r 10
# Space removal is done by the .str accessor; the lambda repeats the rest of digit_ratio inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

591 µs ± 47.2 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


* Case 2: n = 1000

In [16]:
# Rebinds the shared name `df` to a fresh 1000-row frame; any timing cell run
# after this point measures this frame, whichever section it sits in.
df = genereate_random_text(1000)

In [17]:
df.head()

Unnamed: 0,random_text
0,6]2
1,"Jo,2 =39g-1M%XDz%bIQp\ty ~# H=i"
2,G|aN^K7LuvHzj49\nvz\NJ[
3,`YRIk\ 2+;YmRT&%k
4,d;p*\tD> sWv40d(oj?PQ#r{&\nabE{/


In [18]:
%%timeit -r 10
# Series.apply: pandas calls digit_ratio once per element of the column.
df['random_text'].apply(digit_ratio)

3.09 ms ± 140 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [19]:
%%timeit -r 10
# List comprehension over the column's underlying array (.values) instead of the Series.
[digit_ratio(x) for x in df['random_text'].values]

2.8 ms ± 46.4 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [20]:
%%timeit -r 10
# List comprehension iterating the Series object directly.
[digit_ratio(x) for x in df['random_text']]

2.89 ms ± 22.7 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [21]:
%%timeit -r 10
# Space removal is done by the .str accessor; the lambda repeats the rest of digit_ratio inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

3.42 ms ± 338 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


* Case 3: n = 10000

In [22]:
# Rebinds the shared name `df` to a fresh 10000-row frame; any timing cell run
# after this point measures this frame, whichever section it sits in.
df = genereate_random_text(10000)

In [23]:
df.head()

Unnamed: 0,random_text
0,iO3F.8;|UpiJG}xl0=qQ*dH/4>u[{=\nOsn=r93?
1,D?57\n
2,P%$8D
3,"&NKk FFYx,vBcL"
4,"Sy=UkF&Yc7\n='qe,+!lM/C"


In [24]:
%%timeit -r 10
# Series.apply: pandas calls digit_ratio once per element of the column.
df['random_text'].apply(digit_ratio)

30.4 ms ± 1.75 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [25]:
%%timeit -r 10
# List comprehension over the column's underlying array (.values) instead of the Series.
[digit_ratio(x) for x in df['random_text'].values]

27.3 ms ± 774 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [27]:
%%timeit -r 10
# List comprehension iterating the Series object directly.
[digit_ratio(x) for x in df['random_text']]

28.2 ms ± 552 µs per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [28]:
%%timeit -r 10
# Space removal is done by the .str accessor; the lambda repeats the rest of digit_ratio inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

32.2 ms ± 2.31 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


* Case 4: n = 100000

In [29]:
# Rebinds the shared name `df` to a fresh 100000-row frame; any timing cell run
# after this point measures this frame, whichever section it sits in.
df = genereate_random_text(100000)

In [30]:
df.head()

Unnamed: 0,random_text
0,8L)m/*JcLuoR:
1,/9BTfbPVIn
2,g8dzeFwuS8s#|D@F<oUz<
3,<0=zPWD\nie3{OJCFDn&HsD6JMiwy|`<ic
4,*i\@j8*n-{\#|cTY-YDk]


In [31]:
%%timeit -r 10
# Series.apply: pandas calls digit_ratio once per element of the column.
df['random_text'].apply(digit_ratio)

305 ms ± 18.2 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [32]:
%%timeit -r 10
# List comprehension over the column's underlying array (.values) instead of the Series.
[digit_ratio(x) for x in df['random_text'].values]

296 ms ± 10.5 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [33]:
%%timeit -r 10
# List comprehension iterating the Series object directly.
[digit_ratio(x) for x in df['random_text']]

291 ms ± 7.41 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [34]:
%%timeit -r 10
# Space removal is done by the .str accessor; the lambda repeats the rest of digit_ratio inline.
df['random_text'].str.replace(' ', '').apply(lambda x: 0 if not x else sum(c.isdigit() for c in x) / len(x))

314 ms ± 13 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)
