In [0]:
import timeit
import pandas as pd
import numpy as np

In [0]:
# Seed NumPy's legacy global RNG so that the benchmark frame is identical on
# every Restart & Run All; without a seed each run times a different dataset
# and the displayed head() cannot be reproduced.
SEED = 42
N_ROWS = 100000
np.random.seed(SEED)

# N_ROWS x 4 frame of random integers in [0, 10) (randint's upper bound is
# exclusive), with one column per letter of 'ABCD'.
df = pd.DataFrame(np.random.randint(0, 10, size=(N_ROWS, 4)), columns=list('ABCD'))
df.head()

Unnamed: 0,A,B,C,D
0,7,7,7,3
1,9,6,4,0
2,7,9,6,0
3,2,9,7,4
4,9,9,4,0


### Standard Python `for` loop with `iloc`
- 1 loop, best of 3: 2.32 s per loop

In [0]:
def loop_with_for(df):
    """Sum columns 'A' and 'B' of ``df`` with a positional ``for`` loop.

    Every row position in ``range(len(df))`` is visited, and the 'A' and 'B'
    values at that position are read through ``.iloc`` and added to a
    running total.

    Returns the accumulated total, or 0 when ``df`` has no rows.
    """
    total = 0
    row_count = len(df)
    for position in range(row_count):
        a_value = df['A'].iloc[position]
        b_value = df['B'].iloc[position]
        total += a_value + b_value
    return total

In [0]:
%timeit loop_with_for(df)

1 loop, best of 3: 2.32 s per loop


### Using pandas iterrows function
- 1 loop, best of 3: 10.3 s per loop

In [0]:
def loop_with_iterrows(df):
    """Sum columns 'A' and 'B' of ``df`` by walking it with ``iterrows``.

    ``DataFrame.iterrows`` yields ``(index, row)`` pairs. The index is
    ignored, and the row's 'A' and 'B' entries are added to a running total.

    Returns the accumulated total, or 0 when ``df`` has no rows.
    """
    running_total = 0
    for _label, record in df.iterrows():
        running_total = running_total + (record['A'] + record['B'])
    return running_total

In [0]:
%timeit loop_with_iterrows(df)

1 loop, best of 3: 10.3 s per loop


### Using pandas itertuples function
- 10 loops, best of 3: 85.4 ms per loop

In [0]:
def loop_with_itertuples(df):
    """Sum columns 'A' and 'B' of ``df`` by walking it with ``itertuples``.

    Each object yielded by ``DataFrame.itertuples`` exposes the columns as
    attributes; its ``A`` and ``B`` fields are added to a running total.

    Returns the accumulated total, or 0 when ``df`` has no rows.
    """
    total = 0
    for record in df.itertuples():
        pair_sum = record.A + record.B
        total += pair_sum
    return total

In [0]:
%timeit loop_with_itertuples(df)

10 loops, best of 3: 85.4 ms per loop


### Using Python's built-in `zip`
- 10 loops, best of 3: 21 ms per loop

In [0]:
def loop_with_zip(df):
    """Sum columns 'A' and 'B' of ``df`` by iterating the two columns in lockstep.

    The 'A' and 'B' columns are paired element by element with the built-in
    ``zip`` and each pair is added to a running total.

    Returns the accumulated total, or 0 when ``df`` has no rows.
    """
    column_a = df['A']
    column_b = df['B']
    total = 0
    for left, right in zip(column_a, column_b):
        total += left + right
    return total

In [0]:
%timeit loop_with_zip(df)

10 loops, best of 3: 21 ms per loop


In [0]:
%prun -l 4 loop_with_zip(df)

 

### Using pandas apply function
- 1 loop, best of 3: 1.86 s per loop

In [0]:
def using_apply(df):
    """Sum columns 'A' and 'B' of ``df`` with a row-wise ``DataFrame.apply``.

    A per-row function adds the row's 'A' and 'B' entries; the resulting
    per-row values are then reduced with ``.sum()``.

    Note: a later cell in this notebook defines another function with this
    same name (summing 'A' through 'D'); once that cell runs it replaces
    this definition.
    """
    def _a_plus_b(row):
        # Per-row work: look up both columns by label and add them.
        return row['A'] + row['B']

    per_row = df.apply(_a_plus_b, axis='columns')
    return per_row.sum()

In [0]:
%timeit using_apply(df)

1 loop, best of 3: 1.86 s per loop


### Using pandas built-in vectorized addition
- 1000 loops, best of 3: 863 µs per loop

In [0]:
def using_pandas_builtin(df):
    """Sum columns 'A' and 'B' of ``df`` using pandas column arithmetic.

    The two columns are added as whole Series with ``+`` and the resulting
    Series is reduced with ``.sum()``; there is no Python-level loop.
    """
    column_a = df['A']
    column_b = df['B']
    combined = column_a + column_b
    return combined.sum()

In [0]:
%timeit using_pandas_builtin(df)

The slowest run took 32.39 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 863 µs per loop


### Using NumPy built-in array addition
- 1000 loops, best of 3: 333 µs per loop

In [0]:
def using_numpy_builtin(df):
    """Sum columns 'A' and 'B' of ``df`` on their underlying ``.values`` arrays.

    Each column's ``.values`` is taken first, the two results are added
    with ``+``, and the sum of that result is returned.
    """
    a_values = df['A'].values
    b_values = df['B'].values
    combined = a_values + b_values
    return combined.sum()

In [0]:
%timeit using_numpy_builtin(df)

The slowest run took 5.43 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 333 µs per loop


### How to use `apply`: explicit column lookups vs. unpacking the row
- using_apply
  - 1 loop, best of 3: 1.99 s per loop

- using_apply_unpack
  - 1 loop, best of 3: 1.67 s per loop

In [0]:
def using_apply(df):
    """Sum columns 'A', 'B', 'C' and 'D' of ``df`` with a row-wise ``DataFrame.apply``.

    A per-row function looks up the four columns by label and adds them; the
    resulting per-row values are then reduced with ``.sum()``.

    Note: an earlier cell in this notebook defines a function with this same
    name that sums only 'A' and 'B'; running this cell replaces it.
    """
    def _sum_four_columns(row):
        # Explicit label lookups, added left to right.
        return row['A'] + row['B'] + row['C'] + row['D']

    per_row = df.apply(_sum_four_columns, axis='columns')
    return per_row.sum()

In [0]:
%timeit using_apply(df)

1 loop, best of 3: 1.99 s per loop


In [0]:
def using_apply_unpack(df):
    """Sum columns 'A', 'B', 'C' and 'D' of ``df`` by unpacking each row inside ``apply``.

    The four columns are selected first. For every row of that selection the
    values are expanded into a list and added with the built-in ``sum``; the
    per-row results are then reduced with ``.sum()``.
    """
    def _row_total(row):
        # Materialize the row's values as a list, then add them up.
        return sum(list(row))

    selected = df[['A', 'B', 'C', 'D']]
    per_row = selected.apply(_row_total, axis='columns')
    return per_row.sum()

In [0]:
%timeit using_apply_unpack(df)

1 loop, best of 3: 1.67 s per loop


In [0]:
pass