In [None]:
# In this notebook we're going to talk about Boolean masking.
# Boolean masking is at the heart of fast and efficient querying in numpy and pandas.

# A Boolean mask is an array which can be one-dimensional, just like a Series,
# or two-dimensional, just like a DataFrame, where each of the values
# in the array is either True or False.
# The Boolean mask is essentially overlaid on the data structure that we're querying.
# Any cell corresponding to a True value will be admitted into our final result,
# and any cell aligned with a False value will not.

In [6]:
# before starting let's import pandas first
import pandas as pd

# Load the admissions dataset, using the first column (Serial No.) as the index
df = pd.read_csv("../Datasets/Admission_Predict.csv", index_col=0)

# Clean up the column names: trim surrounding whitespace and convert to lower case
df.columns = [column.strip().lower() for column in df.columns]
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [8]:
# A Boolean mask is created by applying a comparison operator directly to a
# pandas Series or DataFrame object.

# For instance, our chance of admit column holds the probability of
# admission for each student.

# Let's say we are interested in seeing only those students that have a
# chance of admit greater than 0.9.

# To build the Boolean mask we apply the operator directly to the column.
# Let's see how:

df["chance of admit"] > 0.9

Serial No.
1       True
2      False
3      False
4      False
5      False
       ...  
396    False
397    False
398     True
399    False
400     True
Name: chance of admit, Length: 400, dtype: bool

In [9]:
# Here you can see that applying the comparison to the column returns a
# Series of Boolean values indicating which cells are greater than 0.9
# and which are not.


# Now that we have our Boolean mask, we want to apply it to our DataFrame.

# In pandas, the where() function allows us to do that.

# For instance:

df.where(df["chance of admit"] > 0.9)

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
5,,,,,,,,
...,...,...,...,...,...,...,...,...
396,,,,,,,,
397,,,,,,,,
398,330.0,116.0,4.0,5.0,4.5,9.45,1.0,0.91
399,,,,,,,,


In [11]:
# where() only keeps the cells that correspond to a True value in the mask
# and assigns NaN to the cells that correspond to False.

# To make the result clean and easier to read, we need to remove these NaN values.

# For that we have another function, dropna(), which drops the rows that contain NaN.

df.where(df["chance of admit"] > 0.9).dropna()
# See, it is working.

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
23,328.0,116.0,5.0,5.0,5.0,9.5,1.0,0.94
24,334.0,119.0,5.0,5.0,4.5,9.7,1.0,0.95
25,336.0,119.0,5.0,4.0,3.5,9.8,1.0,0.97
26,340.0,120.0,5.0,4.5,4.5,9.6,1.0,0.94
33,338.0,118.0,4.0,3.0,4.5,9.4,1.0,0.91
35,331.0,112.0,5.0,4.0,5.0,9.8,1.0,0.94
45,326.0,113.0,5.0,4.5,4.0,9.4,1.0,0.91
71,332.0,118.0,5.0,5.0,5.0,9.64,1.0,0.94
72,336.0,112.0,5.0,5.0,5.0,9.76,1.0,0.96


In [13]:
# But this two-step process is a little bit old school.

# pandas provides another way to do the same thing:
# a shorthand syntax that gives the effect of where() followed by dropna() in one step.
# The indexing operator is simply overloaded to accept a Boolean mask.

df[df["chance of admit"] > 0.9]

# It selects the same rows as the older method (note that in this output the
# integer columns are no longer shown as floats).

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
23,328,116,5,5.0,5.0,9.5,1,0.94
24,334,119,5,5.0,4.5,9.7,1,0.95
25,336,119,5,4.0,3.5,9.8,1,0.97
26,340,120,5,4.5,4.5,9.6,1,0.94
33,338,118,4,3.0,4.5,9.4,1,0.91
35,331,112,5,4.0,5.0,9.8,1,0.94
45,326,113,5,4.5,4.0,9.4,1,0.91
71,332,118,5,5.0,5.0,9.64,1,0.94
72,336,112,5,5.0,5.0,9.76,1,0.96


In [14]:
# We can also get a particular column by passing its name to the indexing operator.

df["gre score"]

Serial No.
1      337
2      324
3      316
4      322
5      314
      ... 
396    324
397    325
398    330
399    312
400    333
Name: gre score, Length: 400, dtype: int64

In [17]:
# Or you can pass a list of column names to the indexing operator.

df[["gre score","lor","sop"]]

Unnamed: 0_level_0,gre score,lor,sop
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,337,4.5,4.5
2,324,4.5,4.0
3,316,3.5,3.0
4,322,2.5,3.5
5,314,3.0,2.0
...,...,...,...
396,324,3.5,3.5
397,325,3.5,3.0
398,330,4.5,5.0
399,312,4.0,3.5


In [19]:
# Or you can pass a Boolean mask.

df[df["gre score"] > 300].head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [22]:
# Sometimes in a project you need to combine several Boolean masks, because the
# query depends on more than one criterion.

# With Boolean masking this is done using the bitwise operators.

# Suppose we want to query the dataset with the following conditions:
#   gre score > 300
#   chance of admit > 0.9

# Build the combined Boolean mask (each comparison sits in its own parentheses)
mask = (
    (df["gre score"] > 300)
    & (df["chance of admit"] > 0.9)
)
mask.head()

Serial No.
1     True
2    False
3    False
4    False
5    False
dtype: bool

In [24]:
# Applying the mask to the DataFrame to query the data.

df[mask].head()


Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
23,328,116,5,5.0,5.0,9.5,1,0.94
24,334,119,5,5.0,4.5,9.7,1,0.95
25,336,119,5,4.0,3.5,9.8,1,0.97
26,340,120,5,4.5,4.5,9.6,1,0.94


In [25]:
# But when we perform the same masking operation without parentheses, why do we get an error?
# The TypeError is the point of this cell, so it is caught and printed; that way the
# notebook still runs from top to bottom after a kernel restart.

try:
    df["gre score"] > 300 & df["chance of admit"] > 0.9
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: Cannot perform 'rand_' with a dtyped [float64] array and scalar of type [bool]

In [26]:
# The error comes from operator precedence: `&` binds more tightly than `>`,
# so Python first tries to evaluate `300 & df["chance of admit"]` (a bitwise AND
# of an integer with a float Series) when what you really want is to combine
# the two Boolean Series produced by the comparisons.

# Besides adding parentheses, another way around the problem is to use the
# built-in comparison methods.

# pandas provides gt() and lt(), where gt stands for "greater than"
# and lt stands for "less than".

(
    df["gre score"].gt(300)
    & df["chance of admit"].gt(0.9)
)

Serial No.
1       True
2      False
3      False
4      False
5      False
       ...  
396    False
397    False
398     True
399    False
400     True
Length: 400, dtype: bool