Load libraries

In [1]:
import numpy as np
import pandas as pd

Generate Random Data

In [2]:
# Number of rows (people) in the synthetic data set.
n = 30_000

# Start from an all-NaN frame with one row per person; the two columns are
# populated with random samples in a later cell.
person_df = pd.DataFrame(columns=['Height', 'Weight'], index=range(n))
person_df

Unnamed: 0,Height,Weight
0,,
1,,
2,,
3,,
4,,
...,...,...
29995,,
29996,,
29997,,
29998,,


In [3]:
# Seed NumPy's global generator so the sampled data (and every later
# np.random draw in this notebook) is identical after Restart & Run All.
np.random.seed(42)

# Independent normal draws: Height ~ N(mean=178, sd=10), Weight ~ N(mean=83, sd=7).
# Bracket assignment is used instead of attribute assignment so the statement
# always targets a column, never a plain Python attribute on the frame.
person_df['Height'] = np.random.normal(178, 10, n)
person_df['Weight'] = np.random.normal(83, 7, n)

In [4]:
# Show the frame now that Height and Weight hold the sampled values.
person_df

Unnamed: 0,Height,Weight
0,166.225777,75.691775
1,191.944626,89.576441
2,166.529376,86.469087
3,192.280417,81.097458
4,185.175475,93.645630
...,...,...
29995,191.449243,93.457894
29996,164.923293,88.802940
29997,174.611172,84.974464
29998,178.452029,92.479699


# Calculate the BMI for each row

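BMI = Weight / Height^2, with Weight in kilograms and Height in metres. Height in this data set is in centimetres, so the code divides it by 100 before squaring.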

Method 1: Using for loop

In [9]:
%%time

# Baseline: visit every row label and read/write single cells through .loc.
for idx in range(n):
    height_m = person_df.loc[idx, 'Height'] / 100  # divide by 100 as in the BMI formula
    person_df.loc[idx, 'BMI'] = person_df.loc[idx, 'Weight'] / height_m ** 2

Wall time: 3.03 s


In [10]:
# Preview the first rows to check the newly added BMI column.
person_df.head()

Unnamed: 0,Height,Weight,BMI
0,166.225777,75.691775,27.393779
1,191.944626,89.576441,24.313187
2,166.529376,86.469087,31.180219
3,192.280417,81.097458,21.934974
4,185.175475,93.64563,27.30993


Method 2: Using Pandas `iterrows`

In [11]:
%%time

# iterrows() yields (index label, row Series) pairs; each BMI is still written
# back to the frame one cell at a time through .loc.
for idx, person in person_df.iterrows():
    height_m = person['Height'] / 100
    person_df.loc[idx, 'BMI'] = person['Weight'] / height_m ** 2

Wall time: 2.6 s


Method 3: Apply an explicit function

In [12]:
def calculateBMI(row):
    """Return the BMI for a single record.

    Parameters
    ----------
    row : object exposing ``Weight`` and ``Height`` attributes
        For example one row of ``person_df`` as passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    ``row.Weight`` divided by the square of ``row.Height / 100``.
    """
    height_m = row.Height / 100
    return row.Weight / height_m ** 2

In [16]:
%%time

# Row-wise apply: calculateBMI is called once per row of the frame.
person_df['BMI'] = person_df.apply(
    calculateBMI,
    axis='columns',
)

Wall time: 490 ms


Method 4: Apply a lambda function

In [17]:
%%time

# Same row-wise apply as the previous method, with the formula written inline.
person_df['BMI'] = person_df.apply(
    lambda person: person['Weight'] / (person['Height'] / 100) ** 2,
    axis='columns',
)

Wall time: 505 ms


Method 5: Vectorization and Broadcasting

In [18]:
%%time

# Whole-column arithmetic: the scalar 100 and the exponent are broadcast over
# the Height column, and the division is performed element-wise.
person_df['BMI'] = person_df['Weight'] / (person_df['Height'] / 100) ** 2

Wall time: 2 ms


In [19]:
# One Bernoulli draw per person: each value is 1 with probability 0.4988, else 0.
person_df['Gender'] = np.random.binomial(n=1, p=0.4988, size=n)

In [20]:
# Relabel the 0/1 gender codes as 'M'/'F'.
# The replacement is restricted to the Gender column: a frame-wide
# person_df.replace({0: 'M', 1: 'F'}) would also rewrite any Height, Weight or
# BMI value that happened to equal exactly 0 or 1.
# replace() leaves unmatched values alone, so re-running this cell is harmless.
person_df['Gender'] = person_df['Gender'].replace({0: 'M', 1: 'F'})
person_df

Unnamed: 0,Height,Weight,BMI,Gender
0,166.225777,75.691775,27.393779,M
1,191.944626,89.576441,24.313187,M
2,166.529376,86.469087,31.180219,M
3,192.280417,81.097458,21.934974,M
4,185.175475,93.645630,27.309930,M
...,...,...,...,...
29995,191.449243,93.457894,25.498151,M
29996,164.923293,88.802940,32.648509,F
29997,174.611172,84.974464,27.870476,F
29998,178.452029,92.479699,29.040456,F


## Who is Healthy?

Men:

* underweight: BMI < 20
* healthy: BMI >= 20 and BMI <= 25
* overweight: BMI > 25

Women:

* underweight: BMI < 19
* healthy: BMI >= 19 and BMI <= 24
* overweight: BMI > 24

In [21]:
# Create the 'status' column as a placeholder (all None); it is overwritten
# with the U/H/O labels in the following cell.
person_df['status'] = None

Using Method 5 (vectorization)

In [24]:
%%time
g = person_df['Gender']
bmi = person_df['BMI']

# Per-row BMI thresholds: 19/24 for rows marked 'F', 20/25 for every other row.
is_female = g == 'F'
lower = np.where(is_female, 19, 20)
upper = np.where(is_female, 24, 25)

# 'U' below the lower bound, 'H' up to and including the upper bound, 'O' above it.
person_df['status'] = np.where(
    bmi < lower,
    'U',
    np.where(bmi <= upper, 'H', 'O'),
)

Wall time: 4 ms
