In [2]:
import pandas as pd
import numpy as np

## Column apply examples

In [3]:
# Load the sample table from a path relative to the current working directory,
# then display it as the cell output.
t3 = pd.read_csv('./data/table3.csv')
t3

Unnamed: 0,country,year,rate
0,Afghanistan,1999,745/19987071
1,Afghanistan,2000,2666/20595360
2,Brazil,1999,37737/172006362
3,Brazil,2000,80488/174504898
4,China,1999,212258/1272915272
5,China,2000,213766/1280428583


In [4]:
# Series.apply calls the lambda once per cell, so the argument is a single
# '<numerator>/<denominator>' string, not the whole column.
# Keep only the part after the slash (still a string, hence dtype object).
t3['rate'].apply(lambda rate_text: rate_text.split('/')[1])

0      19987071
1      20595360
2     172006362
3     174504898
4    1272915272
5    1280428583
Name: rate, dtype: object

In [5]:
# Same extraction as above, but the separator and the index of the wanted
# part are forwarded to the lambda as keyword arguments of apply(), and the
# result is converted to int before being stored as a new column.
# NOTE: the column must be read back as t3['pop']; attribute access (t3.pop)
# resolves to the DataFrame.pop method instead.
t3['pop'] = t3['rate'].apply(
    lambda cell, sep, pos: int(cell.split(sep)[pos]),
    sep='/',
    pos=1,
)
t3

Unnamed: 0,country,year,rate,pop
0,Afghanistan,1999,745/19987071,19987071
1,Afghanistan,2000,2666/20595360,20595360
2,Brazil,1999,37737/172006362,172006362
3,Brazil,2000,80488/174504898,174504898
4,China,1999,212258/1272915272,1272915272
5,China,2000,213766/1280428583,1280428583


## More Examples

In [2]:
# Small demo frame used by the remaining examples: two integer columns, three rows.
df = pd.DataFrame({
    'a': [10, 20, 30],
    'b': [20, 30, 40],
})

df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [3]:
# Broadcasting: the scalar is combined with every element of the column in a
# single vectorized operation, no explicit loop or apply() needed.
df['a'] * 2

0    20
1    40
2    60
Name: a, dtype: int64

In [4]:
# Series.apply: the function is called once for each element of the column.
df['a'].apply(lambda value: value * value)

0    100
1    400
2    900
Name: a, dtype: int64

In [12]:
# Series.apply with a multi-argument function.
# Each element of the Series is passed as the first positional argument;
# any extra keyword arguments given to apply() are forwarded unchanged.
df['a'].apply(lambda element, base: base ** 2 + element, base=7)

0    59
1    69
2    79
Name: a, dtype: int64

# DataFrames

In [23]:
# DataFrame.apply defaults to working column by column: the function receives
# each column as a Series. print returns None, so the result is a Series of None.
df.apply(print)
# Variations to try:
#df.apply(lambda col: print(col[0])) # access elements in column
#df.apply(lambda col: np.mean(col))

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [None]:
%%timeit
# Apply a function to each row of the df (axis=1 hands the lambda one row at a time)
# NOTE: You should not do this! Row-wise apply carries a performance penalty.
# NOTE(review): the lambda prints on every call, so when this cell is timed the
# printed rows are repeated for every timing loop -- confirm that is intended
# before executing it.
df.apply(
    lambda row: print(row['a'], row.b)
    , axis=1
)

In [38]:
# Preferred way using broadcasting
def avg0(x, y):
    """Return the floored mean of ``x`` and ``y``.

    Only ``+`` and ``//`` are used, so the function works unchanged on
    scalars and, element-wise, on pandas Series or numpy arrays.
    """
    total = x + y
    return total // 2

# Whole columns go in, a whole Series comes out: the arithmetic is broadcast.
avg0(df['a'], df['b'])

0    15
1    25
2    35
dtype: int64

In [46]:
# Broadcasting only works when the function body is itself element-wise.
# This one branches on a plain `if`, so it is a scalar-only function: it has
# to be adapted (or wrapped) before it can be used on whole Series.
def avg1(x, y):
    """Return the mean of two scalars, or NaN when ``x`` equals 20.

    Parameters
    ----------
    x, y : scalar numbers.

    Returns
    -------
    float
        ``(x + y) / 2``, or ``nan`` when ``x == 20``.

    Raises
    ------
    ValueError
        If ``x`` is a pandas Series: ``if x == 20`` would need the truth
        value of an element-wise comparison, which is ambiguous.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        return np.nan
    else:
        return (x + y)/2

# Passing whole Series fails: `if x == 20` cannot collapse an element-wise
# comparison into a single True/False. Catch the error and show its message.
try:
    avg1(df['a'], df['b'])
except ValueError as err:
    print(err)

The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().


In [47]:
# Display the frame again; it is unchanged by the failed call above.
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


# Apply a scalar function element-wise across columns (vectorize)

In [51]:
# Recommended approach: wrap the scalar function with numpy so that it accepts
# array-like inputs and is evaluated element by element.
# NOTE: np.vectorize is a convenience wrapper (essentially a loop), not a
# compiled speed-up.
avg1_vec = np.vectorize(avg1)

# The wrapped call hands back a numpy.ndarray, stored here as a new column.
df['vector_calc'] = avg1_vec(df['a'], df['b'])
df

Unnamed: 0,a,b,vector_calc
0,10,20,15.0
1,20,30,
2,30,40,35.0


In [67]:
# Preferred, pythonic way to do this using a decorator
@np.vectorize
def avg2(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    The body is written for scalars; the ``np.vectorize`` decorator applies it
    element by element, so array-like inputs are accepted and a
    numpy.ndarray is returned.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        return np.nan
    else:
        return (x + y)/2

In [68]:
%%timeit
# This returns a numpy.ndarray
# NOTE(review): the recorded output of this timed cell is only the timing line,
# so the trailing `df` is not displayed here; the column assignment is part of
# the timed statement.
df['vector_calc'] = avg2(df.a, df.b)
df

447 µs ± 4.21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


# Alternative way using numba library (faster!)

In [57]:
import numba # JIT-compiles decorated Python functions to native machine code via LLVM (very fast)

In [69]:
@numba.vectorize
def avg3(x, y):
    """Element-wise mean of ``x`` and ``y``, with NaN wherever ``x`` equals 20.

    Same scalar logic as the np.vectorize variant, but compiled by numba.
    Both branches return a float so the compiled function has a single
    return type.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
        return np.nan
    else:
        return (x + y)/2

In [70]:
%%timeit
# Requires input to be a np ndarray not a pd series, hence the `.values`
# NOTE(review): the recorded output of this timed cell is only the timing line,
# so the trailing `df` is not displayed here.
df['vector_calc'] = avg3(df.a.values, df.b.values)
df

206 µs ± 2.67 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
