In [1]:
import pandas as pd
import numpy as np

In [5]:
def get_data(size = 10000):
    df = pd.DataFrame()
    size = 10000
    df['age'] = np.random.randint(0, 100, size)
    df['time_in_bed'] = np.random.randint(0, 9, size)
    df['pct_sleeping'] = np.random.rand(size)
    df['favorite_food'] = np.random.choice(['pizza', 'taco', 'ice-cream'], size)
    df['hate_food'] = np.random.choice(['broccoli', 'candy corn', 'eggs'], size)
    return df



In [8]:
get_data().head()

Unnamed: 0,age,time_in_bed,pct_sleeping,favorite_food,hate_food
0,10,2,0.449188,ice-cream,broccoli
1,47,7,0.271524,pizza,broccoli
2,54,5,0.329379,pizza,broccoli
3,67,8,0.51598,taco,broccoli
4,81,8,0.757533,taco,candy corn


In [9]:
def reward_cal(row):
    if row['age'] >= 90:
        return row['favorite_food']
    if (row['time_in_bed'] > 5) & (row['pct_sleeping'] > 0.5):
        return row['favorite_food']
    return row['hate_food']

## 1. Loop

In [15]:
df = get_data()

In [16]:
%%timeit

for index, row in df.iterrows():
    df.loc[index, 'reward'] = reward_cal(row)

1.46 s ± 28.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
df.head()

Unnamed: 0,age,time_in_bed,pct_sleeping,favorite_food,hate_food,reward
0,55,2,0.813875,pizza,broccoli,broccoli
1,1,5,0.749471,taco,broccoli,broccoli
2,69,6,0.318501,ice-cream,candy corn,candy corn
3,52,1,0.393861,pizza,eggs,eggs
4,96,5,0.760426,ice-cream,broccoli,ice-cream


## 2. Apply

In [18]:
df = get_data()

In [19]:
%%timeit

df.apply(reward_cal, axis=1)

220 ms ± 7.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
df.head()

Unnamed: 0,age,time_in_bed,pct_sleeping,favorite_food,hate_food
0,58,6,0.83301,pizza,candy corn
1,10,2,0.781364,pizza,broccoli
2,3,5,0.386372,taco,broccoli
3,59,7,0.744937,taco,candy corn
4,27,6,0.037518,ice-cream,candy corn


## 3. Pandas Vectorization

In [25]:
df = get_data()

In [26]:
%%timeit
df['reward'] = df['hate_food']

df.loc[((df['pct_sleeping'] > 0.5) & (df['time_in_bed'] > 5)) | (df['age'] > 90),  'reward'] = df['favorite_food']

1.85 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
df.head()

Unnamed: 0,age,time_in_bed,pct_sleeping,favorite_food,hate_food,reward
0,16,5,0.745327,pizza,broccoli,broccoli
1,44,0,0.363351,ice-cream,candy corn,candy corn
2,71,6,0.679592,ice-cream,broccoli,ice-cream
3,37,1,0.673191,ice-cream,candy corn,candy corn
4,98,7,0.612745,taco,eggs,taco


## 4. Numpy Vectorization

In [29]:
df = get_data()

.values is used to make pandas series as numpy values

In [34]:
%%timeit
df['reward'] = df['hate_food']


df.loc[((df['pct_sleeping'].values > 0.5) & (df['time_in_bed'].values > 5)) | (df['age'].values > 90),  'reward'] = df['favorite_food']

1.33 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
df.head()

1. If you are sure that you need to use a loop, you should always choose the apply method.
2. Otherwise, vectorization is always preferable as it is much faster.

https://towardsdatascience.com/how-to-make-your-pandas-loop-71-803-times-faster-805030df4f06