## Speed Up Pandas Code!

#### Import libraries

In [1]:
import pandas as pd
import numpy as np

#### Create DataFrame

In [2]:
def create_df(size: int, seed: "int | None" = None) -> pd.DataFrame:
    """Build a random demo DataFrame with ``size`` rows.

    Columns:
        Age: random integers in [0, 100).
        Bed_Time: random integers in [0, 9).
        Pct_Sleeping: random floats in [0, 1).
        Favorite_Food, Hated_Food: independent random picks from a fixed
            list of 15 foods.

    Args:
        size: Number of rows to generate.
        seed: Optional seed. When given, all values are drawn from a private
            ``np.random.RandomState(seed)``, so the same seed always yields
            the same frame. When ``None`` (the default), NumPy's global
            random state is used, so ``np.random.seed`` still applies.

    Returns:
        The generated DataFrame.
    """
    foods = ["Broccoli", "Pizza", "Pasta", "Olives", "Hamburger", "Eggs", "Taco", "Sushi", "Mushrooms", "Fried Chicken", "Anchovies", "Ice Cream", "French Fries", "Bacon", "Steak"]
    # RandomState offers the same randint/rand/choice functions as the
    # np.random module itself, so a single code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)
    df = pd.DataFrame()
    df["Age"] = rng.randint(0, 100, size)
    df["Bed_Time"] = rng.randint(0, 9, size)
    df["Pct_Sleeping"] = rng.rand(size)
    df["Favorite_Food"] = rng.choice(foods, size)
    df["Hated_Food"] = rng.choice(foods, size)
    return df

In [3]:
# Seed NumPy's global generator first so the sample data (and the preview
# shown by head) is the same on every Restart & Run All.
np.random.seed(42)
df = create_df(10_000)
df.head(10)

Unnamed: 0,Age,Bed_Time,Pct_Sleeping,Favorite_Food,Hated_Food
0,57,2,0.84173,Olives,Steak
1,93,6,0.419318,Steak,French Fries
2,57,2,0.050575,Pasta,Pizza
3,74,8,0.977028,Hamburger,Bacon
4,88,1,0.833842,Pasta,Bacon
5,78,5,0.804167,Bacon,Eggs
6,41,6,0.71655,Fried Chicken,Steak
7,83,7,0.876183,Olives,Olives
8,87,8,0.03191,Mushrooms,Mushrooms
9,75,1,0.65015,Olives,Pizza


### The Problem

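- Give every person a `Reward`: their `Favorite_Food` if `Age` is at least 90, or if `Bed_Time` is greater than 5 and `Pct_Sleeping` is greater than 0.5; otherwise their `Hated_Food`.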

In [4]:
def reward_calc(row) -> str:
    """Pick the food a single person is rewarded with.

    ``row`` is anything indexable by column name (for example one row of
    the DataFrame). The favorite food is returned when ``Age`` is 90 or
    more, or when ``Bed_Time`` is above 5 and ``Pct_Sleeping`` is above
    0.5; in every other case the hated food is returned.
    """
    # Age alone is enough; the other columns are not consulted in that case.
    if row["Age"] >= 90:
        return row["Favorite_Food"]

    late_bedtime = row["Bed_Time"] > 5
    mostly_asleep = row["Pct_Sleeping"] > 0.5
    column = "Favorite_Food" if late_bedtime and mostly_asleep else "Hated_Food"
    return row[column]

#### Level 1 - For-loop

In [6]:
%%timeit
# Baseline: walk the frame one row at a time with iterrows() and write each
# result back through .loc, a single cell per iteration. If df has no "Reward"
# column yet, the first .loc write creates it.
for idx, row in df.iterrows():
    df.loc[idx, "Reward"] = reward_calc(row)

2.85 s ± 45.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Level 2 - Apply function

In [7]:
%%timeit
# Row-wise apply: axis=1 hands each row to reward_calc, and the returned
# values are stored in the "Reward" column in one assignment.
df["Reward"] = df.apply(reward_calc, axis=1)

469 ms ± 74.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Level 3 - Vectorized Functions

In [8]:
%%timeit
# df = create_df(10_000)
df["Reward"] = df["Hated_Food"]
df.loc[((df["Bed_Time"] > 5) & (df["Pct_Sleeping"] > 0.5)) | df["Age"], "Reward"] = df["Favorite_Food"]

4.55 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
