In [1]:
# Boolean masking is the heart of fast and efficient querying in numpy and pandas, and it's analogous to the bit masking
# used in other areas of computational science.

# A Boolean mask is an array, which can be one-dimensional like a Series or two-dimensional like a DataFrame,
# where each value is either True or False. This array is essentially overlaid on top of the data
# structure that we're querying: any cell aligned with a True value is admitted into our final result,
# and any cell aligned with a False value is not.

In [2]:
import pandas as pd
# Sample student records: one dict per student, all sharing the same keys, so
# each key becomes a DataFrame column and each dict becomes a row.
# The records are bound to `students` rather than `list` so that the built-in
# `list` type is not shadowed for every later cell in the notebook.
students = [
    {'name': 'Tuba', 'age': 23, 'CGPA': 3.8},
    {'name': 'Plabon', 'age': 25, 'CGPA': 3.4},
    {'name': 'Nila', 'age': 23, 'CGPA': 3.2},
    {'name': 'Maliha', 'age': 21, 'CGPA': 3.75},
    {'name': 'Tara', 'age': 23, 'CGPA': 2.8},
    {'name': 'Raima', 'age': 22, 'CGPA': 3.65},
]

df = pd.DataFrame(students)
df

Unnamed: 0,name,age,CGPA
0,Tuba,23,3.8
1,Plabon,25,3.4
2,Nila,23,3.2
3,Maliha,21,3.75
4,Tara,23,2.8
5,Raima,22,3.65


In [9]:
# Boolean masks are created by applying operators directly to pandas Series or DataFrame objects.
# For instance, in our small student dataset, we might be interested in seeing only those students
# that have a CGPA higher than 3.5.

# To build a Boolean mask for this query, we project the CGPA column using the
# indexing operator and apply the greater-than operator with a comparison value of 3.5. This
# essentially broadcasts the comparison across the column, and the result is returned as
# a Boolean Series. The resulting Series keeps the original index, and the value of each cell is either
# True or False depending on whether that student has a CGPA higher than 3.5.


# Comparing the CGPA column against 3.5 yields one True/False value per row
admit_mask = (df['CGPA'] > 3.5)
admit_mask

0     True
1    False
2    False
3     True
4    False
5     True
Name: CGPA, dtype: bool

In [10]:
# So, what do you do with the boolean mask once you have formed it? Well, you can just lay it on top of the
# data to "hide" the data you don't want, which is represented by all of the False values. We do this by using
# the .where() function on the original DataFrame.

# Rows whose mask value is False stay in the frame but are filled with NaN
df.where(cond=admit_mask)

Unnamed: 0,name,age,CGPA,allowed
0,Tuba,23.0,3.8,0.0
1,,,,
2,,,,
3,Maliha,21.0,3.75,0.0
4,,,,
5,Raima,22.0,3.65,0.0


In [11]:
# We see that the resulting data frame keeps the original indexed values, and only data which met 
# the condition was retained. All of the rows which did not meet the condition have NaN data instead,
# but these rows were not dropped from our dataset. 
#
# The next step is, if we don't want the NaN data, we use the dropna() function

# Mask first, then discard the rows that the mask turned into NaN
(
    df
    .where(admit_mask)
    .dropna()
)

Unnamed: 0,name,age,CGPA,allowed
0,Tuba,23.0,3.8,0.0
3,Maliha,21.0,3.75,0.0
5,Raima,22.0,3.65,0.0


In [13]:
# Despite being really handy, where() isn't actually used that often. Instead, the pandas devs
# created a shorthand syntax which combines where() and dropna(), doing both at once. And, in
# typical fashion, they just overloaded the indexing operator to do this!

# Indexing with a Boolean Series keeps only the rows where the mask is True
df[(df['age'] > 22)]

Unnamed: 0,name,age,CGPA,allowed
0,Tuba,23,3.8,False
1,Plabon,25,3.4,False
2,Nila,23,3.2,False
4,Tara,23,2.8,False


In [18]:
# The indexing operator can be called with a string parameter to project a single column
# Single brackets with one column label return that column as a Series
df["name"].head(5)

0      Tuba
1    Plabon
2      Nila
3    Maliha
4      Tara
Name: name, dtype: object

In [16]:
# Double brackets: passing a list of column labels returns a DataFrame
df[['name', 'age']]

Unnamed: 0,name,age
0,Tuba,23
1,Plabon,25
2,Nila,23
3,Maliha,21
4,Tara,23
5,Raima,22


In [21]:
# Let's talk about combining multiple Boolean masks

# Each comparison is parenthesised so that & combines two Boolean Series
df[
    (df["age"] > 22)
    & (df["CGPA"] > 3.0)
]

Unnamed: 0,name,age,CGPA,allowed
0,Tuba,23,3.8,False
1,Plabon,25,3.4,False
2,Nila,23,3.2,False


In [23]:
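# One thing to watch out for is order of operations! A common error for new pandas users is
# to try to do Boolean comparisons using the & operator without putting parentheses around
# the individual terms they are interested in. In Python, & binds more tightly than > and <, so
# the expression below evaluates 22 & df["CGPA"] first, which raises the TypeError shown.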


# This expression is intentionally wrong: without parentheses around each
# comparison it raises a TypeError. The error is the point of the cell, so it
# is caught and printed; that way the message is still displayed but
# Restart Kernel -> Run All does not stop here.
try:
    df["age"] > 22 & df["CGPA"] > 3.0
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Cannot perform 'rand_' with a dtyped [float64] array and scalar of type [bool]

In [26]:
# Another way to do this is to just get rid of the comparison operator completely, and instead
# use the built in functions which mimic this approach


# gt() and lt() are the method forms of > and <. Method calls bind before &,
# so the individual terms need no parentheses of their own.
(
    df['age'].gt(21)
    & df['CGPA'].lt(3.5)
)

0    False
1     True
2     True
3    False
4     True
5    False
dtype: bool

In [28]:
# Keep students whose age is strictly between 22 and 25
df[df["age"].gt(22) & df["age"].lt(25)]

Unnamed: 0,name,age,CGPA,allowed
0,Tuba,23,3.8,False
2,Nila,23,3.2,False
4,Tara,23,2.8,False
