## Python Data Manipulation Modules
Learn how to manipulate data using Numpy and Pandas
    
    AUTHOR: Dr. Roy Jafari 

### Video 6: Code Optimization - Loop, apply, and V&B

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Number of synthetic people; later cells reuse n for sampling and looping.
n = 30_000

# One row per person; both columns start out empty (NaN) and are filled in a later cell.
person_df = pd.DataFrame(columns=['Height', 'Weight'], index=range(n))

In [6]:
# Show the freshly created frame: Height and Weight exist but hold no values yet.
person_df

Unnamed: 0,Height,Weight
0,,
1,,
2,,
3,,
4,,
...,...,...
29995,,
29996,,
29997,,
29998,,


In [7]:
# Seed NumPy's global generator so the sampled values (and later np.random draws
# that run after this cell) are the same on every Restart & Run All.
np.random.seed(42)

# Heights ~ Normal(mean=178, sd=10) and weights ~ Normal(mean=83, sd=7), one draw per row.
# Height is later divided by 100 in the BMI formula, i.e. it is stored in centimetres.
person_df['Height'] = np.random.normal(178, 10, n)
person_df['Weight'] = np.random.normal(83, 7, n)

In [8]:
# Show the frame now that Height and Weight hold the sampled values.
person_df

Unnamed: 0,Height,Weight
0,172.556616,74.380624
1,169.270587,83.355930
2,178.547856,80.339274
3,173.933601,86.496244
4,184.942304,84.652817
...,...,...
29995,168.241289,75.914217
29996,166.698809,81.083085
29997,182.960392,77.953182
29998,174.959291,77.753727


# Calculate BMI for each row:

$BMI = \frac{Weight_{kg}}{Height_{m}^{2}}$

In [9]:
## Approach 1 - Use traditional loop
# Create the BMI column up front (every value None) so the loop in the next cell
# can fill it in one row at a time.
person_df['BMI'] = None
person_df

Unnamed: 0,Height,Weight,BMI
0,172.556616,74.380624,
1,169.270587,83.355930,
2,178.547856,80.339274,
3,173.933601,86.496244,
4,184.942304,84.652817,
...,...,...,...
29995,168.241289,75.914217,
29996,166.698809,81.083085,
29997,182.960392,77.953182,
29998,174.959291,77.753727,


In [10]:
%%time

for i in range(n):
    person_df.loc[i,'BMI'] = person_df.loc[i,'Weight'] / ((person_df.loc[i,'Height']/100)**2) 

Wall time: 40.6 s


In [None]:
# Show the frame after the row-by-row loop has filled the BMI column.
person_df

In [None]:
## Approach 2 - Use Pandas loop (iterrows)

In [11]:
%%time

for i,row in person_df.iterrows():
    person_df.loc[i,'BMI'] = row.Weight / ((row.Height/100)**2) 

Wall time: 32.9 s


In [None]:
## Approach 3 - Apply an explicit function

In [12]:
def calculateBMI(row):
    """Return the body-mass index for a single person.

    Parameters
    ----------
    row : object with ``Weight`` and ``Height`` attributes
        For example one row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    ``row.Weight`` divided by the square of ``row.Height / 100``.
    """
    height_m = row.Height / 100  # the BMI formula uses height in metres
    return row.Weight / (height_m ** 2)

In [13]:
%%time
person_df['BMI'] = person_df.apply(calculateBMI,axis=1)

Wall time: 1.1 s


In [None]:
## Approach 3 (alternative) - Apply a lambda function

In [14]:
%%time
person_df['BMI'] = person_df.apply(lambda r:r.Weight/((r.Height/100)**2)  ,axis=1)

Wall time: 962 ms


In [None]:
## Approach 4 - Vectorization and Broadcasting

In [15]:
%%time
person_df['BMI'] = person_df.Weight / ((person_df.Height/100)**2) 

Wall time: 2.93 ms


In [17]:
# One draw per person from a binomial with a single trial: 1 with probability 0.4988, else 0.
person_df['Gender'] = np.random.binomial(n=1, p=0.4988, size=n)

In [18]:
# Recode only the Gender column (0 -> 'M', 1 -> 'F'). A frame-wide replace would
# also rewrite any 0 or 1 that happens to occur in Height, Weight or BMI.
# Using replace (not map) keeps the cell safe to re-run: already-recoded values are left as they are.
person_df['Gender'] = person_df['Gender'].replace({0: 'M', 1: 'F'})

In [19]:
# Show the frame with the Gender column recoded to 'M' / 'F'.
person_df

Unnamed: 0,Height,Weight,BMI,Gender
0,172.556616,74.380624,24.980239,M
1,169.270587,83.355930,29.091995,F
2,178.547856,80.339274,25.201049,M
3,173.933601,86.496244,28.591061,F
4,184.942304,84.652817,24.749640,F
...,...,...,...,...
29995,168.241289,75.914217,26.819949,M
29996,166.698809,81.083085,29.178655,M
29997,182.960392,77.953182,23.287329,M
29998,174.959291,77.753727,25.400788,F


# Who is Healthy, underweight and overweight?

**Men**:
- underweight: $BMI<20$
- healthy: $20 \le BMI \le 25$
- overweight: $BMI>25$

**Women**:
- underweight: $BMI<19$
- healthy: $19 \le BMI \le 24$
- overweight: $BMI>24$

In [20]:
# Approach 1 - Pandas loop (iterrows)
# Create the Status column up front (every value None) so the loop in the next
# cell can fill it in one row at a time.
person_df['Status'] = None
person_df

Unnamed: 0,Height,Weight,BMI,Gender,Status
0,172.556616,74.380624,24.980239,M,
1,169.270587,83.355930,29.091995,F,
2,178.547856,80.339274,25.201049,M,
3,173.933601,86.496244,28.591061,F,
4,184.942304,84.652817,24.749640,F,
...,...,...,...,...,...
29995,168.241289,75.914217,26.819949,M,
29996,166.698809,81.083085,29.178655,M,
29997,182.960392,77.953182,23.287329,M,
29998,174.959291,77.753727,25.400788,F,


In [21]:
%%time

for i,row in person_df.iterrows():
    if(row.Gender == 'M'):
        if(row.BMI<20):
            person_df.loc[i,'Status'] = 'U'
        elif(row.BMI<=25):
            person_df.loc[i,'Status'] = 'H'
        else:
            person_df.loc[i,'Status'] = 'O'
    else:
        if(row.BMI<19):
            person_df.loc[i,'Status'] = 'U'
        elif(row.BMI<=24):
            person_df.loc[i,'Status'] = 'H'
        else:
            person_df.loc[i,'Status'] = 'O'

Wall time: 47.6 s


In [None]:
# Approach 2 - Apply a function to each row

In [22]:
def specifyStatus(row):
    """Classify one person as underweight, healthy or overweight.

    Parameters
    ----------
    row : object with ``Gender`` and ``BMI`` attributes
        For example one row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        ``'U'`` when BMI is below the lower cut-off, ``'H'`` when it is at most
        the upper cut-off, ``'O'`` otherwise. The cut-offs are 20 and 25 when
        ``Gender == 'M'`` and 19 and 24 for any other Gender value.
    """
    is_male = row.Gender == 'M'
    underweight_below = 20 if is_male else 19
    healthy_up_to = 25 if is_male else 24

    if row.BMI < underweight_below:
        return 'U'
    if row.BMI <= healthy_up_to:
        return 'H'
    return 'O'

In [23]:
%%time
person_df['Statu'] = person_df.apply(specifyStatus ,axis=1)

Wall time: 1.58 s


In [None]:
# Approach 3 - Vectorization and Broadcasting with np.where()

In [24]:
%%time
g = person_df.Gender
bmi = person_df.BMI
person_df['Status'] = np.where(g=='F',
                               np.where(bmi<19,'U',np.where(bmi<=24,'H','O')),
                               np.where(bmi<20,'U',np.where(bmi<=25,'H','O')))

Wall time: 6.64 ms


## Available operations for V&B

- `+` addition
- `-` subtraction
- `*` multiplication
- `/` division
- `//` floor division
- `%` modulo operation
- `@` matrix multiplication
- `**` power
- `<`
- `<=`
- `>`
- `>=`
- `&` bitwise AND
- `^` bitwise XOR
- `|` bitwise OR