# Comparing the performance of various methods of assembling lists, generators and iterators, for the simple case of an iterable of 1e6 integers (`range(1000000)`)

In [19]:
# %gvim

<IPython.core.display.Javascript object>

Cell contents can now be edited via Gvim. From command mode use 'g' to open current cell contents in Gvim. After ':wq' from Gvim, use 'u' in command mode to update cell contents.


In [2]:
import math
import operator
import statistics as st
from functools import reduce

from scipy.stats import t

# supporting (t - test) functions

In [151]:
def sidak(alpha, c):
    """Return the per-test significance level under the Sidak correction.

    For c tests that should jointly keep a type 1 error rate of alpha, each
    single test is run at 1 - (1 - alpha)**(1/c).

    Args:
        alpha (float): overall type 1 error rate across all tests, within [0, 1]
        c (int): number of successive t-tests to be performed, at least 1
    Returns:
        float: the corrected significance level to use for each single test.
    Raises:
        ValueError: if alpha is outside [0, 1] or c is smaller than 1.
    """
    if not 0 <= alpha <= 1:
        # (1 - alpha) would be negative, and a fractional power of it is complex
        raise ValueError(f"alpha must be within [0, 1], got {alpha}")
    if c < 1:
        # c == 0 would divide by zero; a negative number of tests is meaningless
        raise ValueError(f"c must be at least 1, got {c}")
    return 1 - (1 - alpha)**(1/c)

def multiple_t(options, alpha_sidak):
    """Compare the fastest option against every other option with one-sided t-tests.

    The option with the smallest mean execution time is taken as the reference
    and tested against each of the remaining s-1 options via is_significant().

    Args:
        options (dict): {"for loop scenario":(mean, sd, df),...}
        alpha_sidak (float): significance wrt. each test pair (sidak correction applied)
    Return:
        a list of option names (keys) that are not significantly slower than
        the fastest option, in the order they appear in options
        (an empty list when options is empty).
    """
    if not options:
        return []

    # Reference option: smallest mean. Scanning in reverse keeps the original
    # tie-breaking, where the later of two equal means is selected.
    fastest = min(reversed(list(options)), key=lambda name: options[name][0])

    disregarded = set()  # names of the significantly slower options
    for name, stats in options.items():
        if name == fastest:
            continue
        if is_significant((fastest, *options[fastest]), (name, *stats), alpha_sidak):
            disregarded.add(name)

    # keep insertion order so the returned list is deterministic
    return [name for name in options if name not in disregarded]

def is_significant(pi1, pi2, alpha_sidak):
    """Return True if option 2 is significantly slower than option 1.

    Builds the one-sided lower confidence bound of (mean2 - mean1), i.e.
    (mean2 - mean1) - E, and reports significance when that bound is above 0.
    When significant, the confidence interval is also printed.

    Args:
       pi1(list): (name, mean, sd, df) of option 1
       pi2(list): (name, mean, sd, df) of option 2
       alpha_sidak (float): significance level of this single one-sided test
    Returns:
       bool: True if the lower confidence bound of (mean2 - mean1) is > 0.
    """
    name1, mean1, sd1, df1 = pi1
    name2, mean2, sd2, df2 = pi2

    # standard error of the difference in means; each sample size is taken as df + 1
    std_err = math.sqrt(sd1**2 / (df1 + 1) + sd2**2 / (df2 + 1))
    # the smaller of the two df is used for the t quantile
    df = min(df1, df2)

    # is pi2 significantly larger (slower)? E is the t quantile times the standard error
    CI_lower = mean2 - mean1 - t.ppf(1 - alpha_sidak, df) * std_err

    if CI_lower > 0:
        # the label names the quantity actually bounded: mean2 - mean1
        print(f"CI: {CI_lower} < {name2}_mean - {name1}_mean < +INF")
        return True
    return False

# sum vs reduce

In [3]:
%%timeit
# Built-in sum() over range(1000000).
a = sum(range(1000000))

21 ms ± 1.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
%%timeit
# Same total, computed by folding operator.add over the range with functools.reduce.
b = reduce(operator.add, range(1000000))

56.8 ms ± 3.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Which options are significantly faster?

In [145]:
# option name -> (mean, sd, df); mean and sd are the ms figures reported by %%timeit above.
# NOTE(review): df is entered as runs x loops (70), while %%timeit reports the
# std dev over 7 runs — confirm the intended df.
options = {
    "sum": (21, 1.32, 70),
    "reduce": (56.8, 3.66, 70),
}
# a single comparison (fastest vs. the one other option)
alpha_sidak = sidak(alpha=0.05, c=1)
print(f"\nsignificantly fastest options: {multiple_t(options, alpha_sidak)}")

sum
reduce
CI: 35.0303054791353 < sum_mean - reduce_mean < +INF

significantly fastest options: ['sum']


Thus, sum() is *significantly* faster than reduce().

# Creating Lists ....(as opposed to iterator/ generator objects) 

### for loop 

In [114]:
%%timeit
# Build the list of doubled values by appending inside a for loop.
results = []
for i in range(1000000):
    results.append(2*i)

116 ms ± 4.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### while loop

In [115]:
%%timeit
# Same list, built with a manually incremented counter in a while loop.
results = []
i = 0
while i < 1000000:
    results.append(2*i)
    i += 1

171 ms ± 20.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### list(map)

In [7]:
%%timeit
# Materialise a map() of a doubling lambda into a list.
results = list(map(lambda i: 2*i, range(1000000)))

123 ms ± 2.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### list(generator exp)

In [106]:
%%timeit
# Materialise a generator expression into a list.
# NOTE(review): the element stored here is i, whereas the for / while / map cells
# store 2*i, so the multiplication is not part of this timing — confirm this is intended.
results = list((i for i in range(1000000)))

56.8 ms ± 870 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### list comprehension

In [8]:
%%timeit
# Build the list with a list comprehension.
# NOTE(review): the element stored here is i, whereas the for / while / map cells
# store 2*i, so the multiplication is not part of this timing — confirm this is intended.
results = [i for i in range(1000000)]

56.7 ms ± 702 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### significance via multiple t-tests

In [152]:
# option name -> (mean, sd, df); mean and sd are the ms figures reported by %%timeit above.
# NOTE(review): df is entered as runs x loops (7 or 70), while %%timeit reports the
# std dev over 7 runs — confirm the intended df.
options = {
    "for": (116, 4.46, 7),
    "while": (171, 20.5, 7),
    "list(map())": (123, 2.19, 70),
    "list(generator exp)": (56.8, 0.87, 70),
    "list comp": (56.7, 0.702, 70),
}
# four comparisons: the fastest option against each of the other four
alpha_sidak = sidak(alpha=0.05, c=4)
print(f"\nsignificantly fastest options: {multiple_t(options, alpha_sidak)}")

CI: 54.83461560169494 < list comp_mean - for_mean < +INF
CI: 93.80248454296273 < list comp_mean - while_mean < +INF
CI: 65.67695170886289 < list comp_mean - list(map())_mean < +INF

significantly fastest options: ['list comp', 'list(generator exp)']


## Filtering

In [75]:
%%timeit
# Keep the even numbers using a list comprehension with an if clause.
# NOTE: despite its name, `squares` holds the even values of i, not squares.
squares = [i for i in range(1000000) if i%2==0 ]

64.3 ms ± 2.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [76]:
%%timeit
# Keep the even numbers using filter() with a lambda predicate, materialised as a list.
list(filter(lambda i: i%2==0, range(1000000)))

100 ms ± 1.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [108]:
%%timeit
# Keep the even numbers using a generator expression with an if clause, materialised as a list.
# NOTE: despite its name, `squares` holds the even values of i, not squares.
squares = list((i for i in range(1000000) if i%2==0))

67.6 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### significance via t-tests

In [153]:
# option name -> (mean, sd, df); mean and sd are the ms figures reported by %%timeit above.
# NOTE(review): df is entered as runs x loops (70), while %%timeit reports the
# std dev over 7 runs — confirm the intended df.
options = {
    "list comp": (64.3, 2.12, 70),
    "list(filter)": (100, 1.19, 70),
    "list(generator)": (67.6, 1.15, 70),
}
# two comparisons: the fastest option against each of the other two
alpha_sidak = sidak(alpha=0.05, c=2)
print(f"\nsignificantly fastest options: {multiple_t(options, alpha_sidak)}")

CI: 35.12620191928124 < list comp_mean - list(filter)_mean < +INF
CI: 2.7307634480723184 < list comp_mean - list(generator)_mean < +INF

significantly fastest options: ['list comp']


# Creating iterables

## generator vs map vs list comp, with subsequent iterable used by list-comp

### list comprehension

In [91]:
%%timeit
# The first list comprehension builds the full intermediate list;
# the second one doubles each of its elements.
result = [i for i in range(1000000)]
result2 = [2*i for i in result]

101 ms ± 4.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### map (lambda)

In [94]:
%%timeit
# map() with an identity lambda is the intermediate iterable;
# the list comprehension consumes it and doubles each value.
result = map(lambda i: i, range(1000000))
result = [2*i for i in result]

107 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### map (fun)

In [99]:
%%timeit

# Identity function written with def (instead of a lambda) and passed to map().
# It sits inside the timed cell body, so the def statement is part of what is timed.
def f(i):
    return i

# map() is the intermediate iterable; the list comprehension consumes it and doubles each value.
result = map(f, range(1000000))
result = [2*i for i in result]

104 ms ± 1.58 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### generator (fun)

In [93]:
%%timeit
# Generator function yielding each integer of the range (defined inside the timed cell body).
def generator():
    for i in range(1000000):
        yield i
# The list comprehension consumes the generator and doubles each value.
result2 = [2*i for i in generator()]

93.9 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### generator (expression)

In [103]:
%%timeit
# A generator expression is the intermediate iterable;
# the list comprehension consumes it and doubles each value.
result = (i for i in range(1000000))
result2 = [2*i for i in result]

93.3 ms ± 2.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [154]:
# option name -> (mean, sd, df); mean and sd are the ms figures reported by %%timeit above.
# NOTE(review): df is entered as runs x loops (70), while %%timeit reports the
# std dev over 7 runs — confirm the intended df.
options = {
    "list comp": (101, 4.46, 70),
    "map (lambda)": (107, 1.24, 70),
    "map (fun)": (104, 1.58, 70),
    "generator (fun)": (93.9, 1.01, 70),
    "generator (exp)": (93.3, 2.51, 70),
}
# four comparisons: the fastest option against each of the other four
alpha_sidak = sidak(alpha=0.05, c=4)
print(f"\nsignificantly fastest options: {multiple_t(options, alpha_sidak)}")

CI: 6.313497327638736 < generator (exp)_mean - list comp_mean < +INF
CI: 12.941539120656872 < generator (exp)_mean - map (lambda)_mean < +INF
CI: 9.896485310682017 < generator (exp)_mean - map (fun)_mean < +INF

significantly fastest options: ['generator (exp)', 'generator (fun)']


# Reducing operations

In [131]:
%%timeit
# Largest value of the range, found by folding a "keep the larger one" lambda with reduce().
result = reduce(lambda i,j : i if i >j else j, range(1000000))

95.9 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [130]:
%%timeit
# Largest value of the range, tracked in a plain for loop starting from 0.
result = 0
for i in range(1000000):
    if i > result: result = i

48.9 ms ± 837 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [155]:
# option name -> (mean, sd, df); mean and sd are the ms figures reported by %%timeit above.
# NOTE(review): df is entered as runs x loops (7 or 70), while %%timeit reports the
# std dev over 7 runs — confirm the intended df.
options = {
    "reduce()": (95.9, 1.59, 7),
    "for loop": (48.9, 0.837, 70),
}
# a single comparison (fastest vs. the one other option)
alpha_sidak = sidak(alpha=0.05, c=1)
print(f"\nsignificantly fastest options: {multiple_t(options, alpha_sidak)}")

CI: 45.9184632978186 < for loop_mean - reduce()_mean < +INF

significantly fastest options: ['for loop']


# Filtering operations

_then using the resulting iterable in a list comprehension_

In [136]:
%%timeit
# filter() with a lambda keeps the values above 500000;
# the list comprehension consumes the filter object and squares each value.
result = filter(lambda i: i > 500000, range(1000000))
result2 = [i**2 for i in result]

248 ms ± 12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [137]:
%%timeit
# Collect the values above 500000 by appending inside a for loop.
result = []
for i in range(1000000):
    if i > 500000: result.append(i)
        
# Square each collected value.
result2 = [i**2 for i in result]

225 ms ± 15.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [140]:
%%timeit
# A list comprehension with an if clause keeps the values above 500000;
# a second list comprehension squares each of them.
result = [i for i in range(1000000) if i > 500000]
result2 = [i**2 for i in result]

199 ms ± 5.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [141]:
%%timeit
# A generator expression with an if clause keeps the values above 500000;
# the list comprehension consumes it and squares each value.
result = (i for i in range(1000000) if i > 500000)
result2 = [i**2 for i in result]

198 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [156]:
# option name -> (mean, sd, df); mean and sd are the ms figures reported by %%timeit above.
# NOTE(review): df is entered as 7 for these "7 runs, 1 loop each" timings, and
# is_significant() uses df + 1 as the sample size — confirm the intended df.
options = {
    "filter()": (248, 12, 7),
    "for loop": (225, 15.4, 7),
    "list comp": (199, 5.98, 7),
    "gen exp": (198, 11.2, 7),
}
# three comparisons: the fastest option against each of the other three
alpha_sidak = sidak(alpha=0.05, c=3)
print(f"\nsignificantly fastest options: {multiple_t(options, alpha_sidak)}")

CI: 34.7360200973572 < gen exp_mean - filter()_mean < +INF
CI: 9.29276909808802 < gen exp_mean - for loop_mean < +INF

significantly fastest options: ['list comp', 'gen exp']
