# 1) Memory Optimization

In [1]:
import pandas as pd

In [4]:
df = pd.read_csv('Data/employees.csv')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


### After checking the info of our dataframe,
+ there are some columns with missing data
+ Start Date and Last Login Time are imported as Object(string) type. This will prevent us from using DateTime related methods in the future. So we need to convert them to correct datatype.
+ Senior Management are True/False value , however it is currently in object.
+ Team and Gender are in object, we can use category data type.

## Converting to datetime using `pd.to_datetime()` Method

In [8]:
df['Start Date'] = pd.to_datetime(df['Start Date'])
df['Start Date']

0     1993-08-06
1     1996-03-31
2     1993-04-23
3     2005-03-04
4     1998-01-24
         ...    
995   2014-11-23
996   1984-01-31
997   2013-05-20
998   2013-04-20
999   2012-05-15
Name: Start Date, Length: 1000, dtype: datetime64[ns]

### For Last Login Time, we only have time without date. So when converting to datetime, it gonna use today date by default. This is ok for our currenty case.

In [11]:
df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])
df['Last Login Time'].head(3)

0   2021-04-04 12:42:00
1   2021-04-04 06:53:00
2   2021-04-04 11:17:00
Name: Last Login Time, dtype: datetime64[ns]

### Converting Senior Managment to Boolean data type

In [15]:
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Senior Management']

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996    False
997    False
998    False
999     True
Name: Senior Management, Length: 1000, dtype: bool

### Converting Gender to category

In [18]:
df['Gender'] = df['Gender'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


### convert Team to category

In [21]:
df['Team'].value_counts()

Client Services         106
Finance                 102
Business Development    101
Marketing                98
Product                  95
Sales                    94
Engineering              92
Human Resources          91
Distribution             90
Legal                    88
Name: Team, dtype: int64

In [22]:
df['Team'] = df['Team'].astype('category')

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


## Converting datetime during `parse_dates` during reading csv

In [25]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])

# df['Start Date'] = pd.to_datetime(df['Start Date'])
# df['Last Login Time'] = pd.to_datetime(df['Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')

df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    category      
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


------

# 2) Filter a DataFrame Based on a Condition

In [2]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


In [3]:
df['Gender'] == 'Male'

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [4]:
df[df['Gender'] == 'Male']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2021-04-04 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2021-04-04 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2021-04-04 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2021-04-04 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2021-04-04 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2021-04-04 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2021-04-04 16:45:00,60500,11.985,False,Business Development


In [11]:
# more elegant way

is_finance = df['Team'] == 'Finance'
df[is_finance]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-04-04 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2021-04-04 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2021-04-04 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2021-04-04 22:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,2021-04-04 11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,2021-04-04 16:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,2021-04-04 05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,2021-04-04 08:35:00,112769,11.625,True,Finance


In [12]:
df[df['Senior Management']]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2021-04-04 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2021-04-04 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2021-04-04 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
991,Rose,Female,2002-08-25,2021-04-04 05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,2021-04-04 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2021-04-04 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2021-04-04 17:47:00,98874,4.479,True,Marketing


In [13]:
df[df['Team'] != 'Marketing'] # not in Marketing team

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-04-04 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2021-04-04 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2021-04-04 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2021-04-04 16:20:00,65476,10.012,True,Product
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,2021-04-04 06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,2021-04-04 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2021-04-04 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2021-04-04 16:45:00,60500,11.985,False,Business Development


In [17]:
mask = df['Salary'] > 110000
df[mask]

df[df['Bonus %'] < 1.5]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2021-04-04 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2021-04-04 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2021-04-04 07:18:00,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,2021-04-04 20:13:00,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,2021-04-04 20:20:00,146651,1.451,True,Engineering
189,Clarence,Male,1998-05-02,2021-04-04 03:16:00,85700,1.215,False,Sales
217,Douglas,Male,1999-09-03,2021-04-04 16:00:00,83341,1.015,True,Client Services
273,Nicholas,Male,1994-04-12,2021-04-04 20:21:00,74669,1.113,True,Product
279,Ruby,Female,2000-11-08,2021-04-04 19:35:00,105946,1.139,False,Business Development
365,Gloria,,1983-07-19,2021-04-04 01:57:00,140885,1.113,False,Human Resources


In [18]:
df[df['Start Date'] <= '1985-01-01']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2021-04-04 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2021-04-04 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2021-04-04 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2021-04-04 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2021-04-04 20:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,2021-04-04 10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,2021-04-04 22:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,2021-04-04 20:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,2021-04-04 07:04:00,82871,17.999,False,Marketing


--------

# 3) Filter with More than One Condition (AND `&`)

In [19]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


### If we were handling more than one conditions, it is recommended to put each condition in separate variable, insteading of stacking

In [25]:
is_male = df['Gender'] == 'Male'
is_marketing = df['Team'] == 'Marketing'

df[is_male & is_marketing] # both

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2021-04-04 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2021-04-04 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2021-04-04 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2021-04-04 20:13:00,107391,1.26,True,Marketing
101,Aaron,Male,2012-02-17,2021-04-04 10:20:00,61602,11.849,True,Marketing
104,John,Male,1989-12-23,2021-04-04 07:01:00,80740,19.305,False,Marketing
112,Willie,Male,2003-11-27,2021-04-04 06:21:00,64363,4.023,False,Marketing
119,Paul,Male,2008-06-03,2021-04-04 15:05:00,41054,12.299,False,Marketing
150,Sean,Male,1996-05-04,2021-04-04 20:59:00,135490,19.934,False,Marketing


---

# 4) Filter with More than One Condition (OR `|`)

In [26]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


In [31]:
is_senior_managment = df['Senior Management']
is_less_than_1990Jan1 = df['Start Date'] < '1990-01-01'

df[is_senior_managment | is_less_than_1990Jan1] # either one

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2021-04-04 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2021-04-04 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2021-04-04 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,2021-04-04 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2021-04-04 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2021-04-04 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2021-04-04 06:30:00,42392,19.675,False,Finance


In [36]:
mask1 = df['First Name'] == 'Robert'
mask2 = df['Team'] == 'Client Services'
mask3 = df['Start Date'] > '2016-06-01'

df[(mask1 & mask2) | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2021-04-04 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2021-04-04 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2021-04-04 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2021-04-04 00:29:00,140002,19.49,True,Marketing


------

# 5) The `.isin()` Method

In [37]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


In [39]:
mask1 = df['Team'] == 'Legal'
mask2 = df['Team'] == 'Sales'
mask3 = df['Team'] == 'Product'

df[mask1 | mask2 | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2021-04-04 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2021-04-04 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2021-04-04 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2021-04-04 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2021-04-04 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2021-04-04 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2021-04-04 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2021-04-04 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2021-04-04 12:39:00,96914,1.421,False,Product


### Instead we can do more efficiently with `.isin()` Method
+ it not only takes python list, but also pandas series

In [42]:
mask = df['Team'].isin(['Legal', 'Product', 'Marketing'])
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
5,Dennis,Male,1987-04-18,2021-04-04 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2021-04-04 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2021-04-04 15:19:00,102508,12.637,True,Legal
15,Lillian,Female,2016-06-05,2021-04-04 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
986,Donna,Female,1982-11-26,2021-04-04 07:04:00,82871,17.999,False,Marketing
989,Justin,,1991-02-10,2021-04-04 16:58:00,38344,3.794,False,Legal
991,Rose,Female,2002-08-25,2021-04-04 05:12:00,134505,11.051,True,Marketing
994,George,Male,2013-06-21,2021-04-04 17:47:00,98874,4.479,True,Marketing


------

# 6) The `.isnull()` and `.notnull()` Methods

In [43]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


In [48]:
# we want to filter out all rows where Team is NULL

mask = df['Team'].isnull()
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2021-04-04 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2021-04-04 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2021-04-04 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2021-04-04 23:00:00,128771,8.309,False,
109,Christopher,Male,2000-04-22,2021-04-04 10:15:00,37919,11.449,False,
139,,Female,1990-10-03,2021-04-04 01:08:00,132373,10.527,True,
199,Jonathan,Male,2009-07-17,2021-04-04 08:15:00,130581,16.736,True,
258,Michael,Male,2002-01-24,2021-04-04 03:04:00,43586,12.659,False,
290,Jeremy,Male,1988-06-14,2021-04-04 18:20:00,129460,13.657,True,


In [49]:
df.isnull().sum()

First Name            67
Gender               145
Start Date             0
Last Login Time        0
Salary                 0
Bonus %                0
Senior Management      0
Team                  43
dtype: int64

In [53]:
is_gender_not_null = df['Gender'].notnull() # gender not null rows
df[is_gender_not_null]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-04-04 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2021-04-04 16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,2021-04-04 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2021-04-04 06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,2021-04-04 12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,2021-04-04 16:45:00,60500,11.985,False,Business Development


-------

# 7) The `.between()` Method

**helpful to use to search the values between a Range whether it is Time, etc**

In [54]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


In [57]:
df[df['Salary'].between(60000, 70000)] # values are INCLUSIVE

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.170,True,
6,Ruby,Female,1987-08-17,2021-04-04 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2021-04-04 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2021-04-04 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2021-04-04 01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
965,Catherine,Female,1989-09-25,2021-04-04 01:31:00,68164,18.393,False,Client Services
970,Alice,Female,1988-09-03,2021-04-04 20:54:00,63571,15.397,True,Product
974,Harry,Male,2011-08-30,2021-04-04 18:31:00,67656,16.455,True,Client Services
978,Sean,Male,1983-01-17,2021-04-04 14:23:00,66146,11.178,False,Human Resources


In [61]:
df[df['Bonus %'].between(2.0, 5.0)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.170,True,
20,Lois,,1995-04-22,2021-04-04 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2021-04-04 11:25:00,99283,2.665,True,Distribution
49,Chris,,1980-01-24,2021-04-04 12:13:00,113590,3.055,False,Sales
60,Paula,,2005-11-23,2021-04-04 14:01:00,48866,4.271,False,Distribution
...,...,...,...,...,...,...,...,...
943,Wayne,Male,2006-09-08,2021-04-04 11:09:00,67471,2.728,False,Engineering
961,Antonio,,1989-06-18,2021-04-04 21:37:00,103050,3.050,False,Legal
976,Denise,Female,1992-10-19,2021-04-04 05:42:00,137954,4.195,True,Legal
989,Justin,,1991-02-10,2021-04-04 16:58:00,38344,3.794,False,Legal


## `.between()` works with DateTime too

In [64]:
df[df['Start Date'].between('1991-01-01', '1991-12-31')]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2021-04-04 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2021-04-04 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2021-04-04 13:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,2021-04-04 20:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2021-04-04 02:24:00,124488,14.837,True,Sales
166,,Female,1991-07-09,2021-04-04 18:52:00,42341,7.014,True,Sales
172,Sara,Female,1991-09-23,2021-04-04 18:17:00,97058,9.402,False,Finance
220,,Female,1991-06-17,2021-04-04 12:49:00,71945,5.56,True,Marketing
245,Victor,Male,1991-04-11,2021-04-04 07:44:00,70817,17.138,False,Engineering
277,Brenda,,1991-05-29,2021-04-04 06:32:00,82439,19.062,False,Sales


In [65]:
df[df['Last Login Time'].between('08:30AM', '12:00PM')]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2021-04-04 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2021-04-04 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2021-04-04 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2021-04-04 09:07:00,119082,16.180,False,Business Development
...,...,...,...,...,...,...,...,...
963,Ann,Female,1994-09-23,2021-04-04 11:15:00,89443,17.940,True,Sales
977,Sarah,Female,1995-12-04,2021-04-04 09:16:00,124566,5.949,False,Product
982,Rose,Female,1982-04-06,2021-04-04 10:43:00,91411,8.639,True,Human Resources
988,Alice,Female,2004-10-05,2021-04-04 09:34:00,47638,11.209,False,Human Resources


--------

# The `.duplicated()` Method

In [70]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df = df.sort_values('First Name')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-04 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-04-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-04-04 14:53:00,52119,11.343,True,Client Services


In [72]:
df['First Name']

101    Aaron
327    Aaron
440    Aaron
937    Aaron
137     Adam
       ...  
902      NaN
925      NaN
946      NaN
947      NaN
951      NaN
Name: First Name, Length: 1000, dtype: object

## `dupicated()` method has parameter `keep`
+ by default, **`keep=first`** which keep the first value among duplicated values
    + Example: name Aaron has duplicate values. When pandas find first value `Aaron`, it hasn't seen any `Aaron` value yet. So it is marked as False for duplicated. For the subsequent values of `Aaron`, those will be marked as True for duplicated.
+**`keep=last`**: keep only the last value among the duplicated values. Other rows will be returned as True.
+ **`keep=False`**: mark rows which has duplicated values and return those.

### using `keep=first`

In [81]:
df['First Name'].duplicated()
df['First Name'].duplicated(keep='first')

101    False
327     True
440     True
937     True
137    False
       ...  
902     True
925     True
946     True
947     True
951     True
Name: First Name, Length: 1000, dtype: bool

In [82]:
df[df['First Name'].duplicated()] # without first row 101 because it was kept

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2021-04-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-04-04 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2021-04-04 19:39:00,63126,18.424,False,Client Services
141,Adam,Male,1990-12-24,2021-04-04 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2021-04-04 11:59:00,71276,5.027,True,Human Resources
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2021-04-04 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2021-04-04 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2021-04-04 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2021-04-04 15:07:00,107351,5.329,True,Marketing


### using `keep=last`

In [77]:
df['First Name'].duplicated(keep='last')

101     True
327     True
440     True
937    False
137     True
       ...  
902     True
925     True
946     True
947     True
951    False
Name: First Name, Length: 1000, dtype: bool

In [84]:
df[df['First Name'].duplicated(keep='last')] # without last row 937 for Aaron, because it was kept

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-04 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-04-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-04-04 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2021-04-04 01:45:00,95327,15.120,False,Distribution
141,Adam,Male,1990-12-24,2021-04-04 20:57:00,110194,14.727,True,Product
...,...,...,...,...,...,...,...,...
890,,Male,2015-11-24,2021-04-04 03:11:00,145329,7.100,True,Finance
902,,Male,2001-05-23,2021-04-04 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2021-04-04 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2021-04-04 01:50:00,133472,16.941,True,Distribution


### using `keep=False`

In [79]:
df['First Name']

101    Aaron
327    Aaron
440    Aaron
937    Aaron
137     Adam
       ...  
902      NaN
925      NaN
946      NaN
947      NaN
951      NaN
Name: First Name, Length: 1000, dtype: object

In [78]:
df['First Name'].duplicated(keep=False)

101    True
327    True
440    True
937    True
137    True
       ... 
902    True
925    True
946    True
947    True
951    True
Name: First Name, Length: 1000, dtype: bool

In [80]:
df[df['First Name'].duplicated(keep=False)] # all rows with duplicated rows will be returned without keeping them, so that those can be handled later

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-04 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-04-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-04-04 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2021-04-04 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2021-04-04 01:45:00,95327,15.120,False,Distribution
...,...,...,...,...,...,...,...,...
902,,Male,2001-05-23,2021-04-04 19:52:00,103877,6.322,True,Distribution
925,,Female,2000-08-23,2021-04-04 16:19:00,95866,19.388,True,Sales
946,,Female,1985-09-15,2021-04-04 01:50:00,133472,16.941,True,Distribution
947,,Male,2012-07-30,2021-04-04 15:07:00,107351,5.329,True,Marketing


------

# Negating the conditions using `~` tilde

## We want to get all rows which has unique values in First Name
+ Meaning negating all duplicated values in First Name rows are filtered out
+ the values that are only have unique values

In [90]:
unique_first_name_rows = ~df['First Name'].duplicated(keep=False)
df[unique_first_name_rows]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2021-04-04 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2021-04-04 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2021-04-04 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2021-04-04 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2021-04-04 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2021-04-04 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2021-04-04 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2021-04-04 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2021-04-04 10:30:00,132839,17.463,True,Client Services


----

# 9) The `.drop_duplicates()` Method

In [91]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')
df = df.sort_values('First Name')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-04 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-04-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-04-04 14:53:00,52119,11.343,True,Client Services


### After we call drop_duplicates() on our dataframe (deduplicated), why the length is still 1000?
Because:
+ It doesn't matter if there are duplicates in a Single column (example: we have so many Aaron in First Name column, we know we have duplicates in Gender, we know we have duplicates in Senior Management)
+ **It only remove the row if cell values across the columns are identical, to another row or more than 2 rows**
+ So there is no such rows in our df. We only have rows (maybe values for 1 column or 2 columns or even 5 columns values are same in another rows).
+ So no rows are being removed by default.
+ **We have to provide a little more customization in order to get what we want.**

In [95]:
len(df)

1000

In [94]:
len(df.drop_duplicates())

1000

### providing customization for `.drop_duplicates()`
+ `subset`, `keep`

In [99]:
df.drop_duplicates(subset=['First Name'], keep='first') # keep only the first Aaron

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-04 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2021-04-04 01:45:00,95327,15.120,False,Distribution
300,Alan,Male,1988-06-26,2021-04-04 03:54:00,111786,3.592,True,Engineering
372,Albert,Male,1997-02-01,2021-04-04 16:20:00,67827,19.717,True,Engineering
988,Alice,Female,2004-10-05,2021-04-04 09:34:00,47638,11.209,False,Human Resources
...,...,...,...,...,...,...,...,...
433,Wanda,Female,2008-07-20,2021-04-04 13:44:00,65362,7.132,True,Legal
177,Wayne,Male,2012-04-07,2021-04-04 08:00:00,102652,14.085,True,Distribution
820,William,Male,1993-11-18,2021-04-04 12:27:00,54058,5.182,True,Human Resources
450,Willie,Male,2009-08-22,2021-04-04 13:03:00,55038,19.691,False,Legal


In [102]:
df.drop_duplicates(subset=['First Name'], keep='last') # keep only the last Aaron

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
937,Aaron,,1986-01-22,2021-04-04 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2021-04-04 21:53:00,45181,3.491,False,Human Resources
610,Alan,Male,2012-02-17,2021-04-04 00:26:00,41453,10.084,False,Product
959,Albert,Male,1992-09-19,2021-04-04 02:35:00,45094,5.850,True,Business Development
693,Alice,Female,1995-10-16,2021-04-04 21:19:00,92799,2.782,False,Sales
...,...,...,...,...,...,...,...,...
512,Wanda,Female,1993-04-06,2021-04-04 03:11:00,78883,19.695,False,
637,Wayne,Male,2009-09-02,2021-04-04 01:37:00,126956,18.396,False,Human Resources
127,William,Male,2002-09-29,2021-04-04 16:09:00,66521,5.830,False,Human Resources
652,Willie,Male,2009-12-05,2021-04-04 05:39:00,141932,1.017,True,Engineering


In [104]:
df.drop_duplicates(subset=['First Name'], keep=False) # NOT keeping any rows which have duplicates, now all of Aaron are gone

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2021-04-04 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2021-04-04 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2021-04-04 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2021-04-04 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2021-04-04 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2021-04-04 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2021-04-04 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2021-04-04 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2021-04-04 10:30:00,132839,17.463,True,Client Services


### If we are not careful using this method, funny thing might happen.
Example: try to remove duplicates on Team column and only want to keep unique values

In [107]:
df.drop_duplicates(subset=['Team'], keep=False) # this will return empty data frame

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


### More than one values for subset

In [116]:
df.drop_duplicates(subset=['First Name', 'Team']) # unique rows across First Name and Team

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-04-04 10:20:00,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2021-04-04 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2021-04-04 01:45:00,95327,15.120,False,Distribution
141,Adam,Male,1990-12-24,2021-04-04 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2021-04-04 11:59:00,71276,5.027,True,Human Resources
...,...,...,...,...,...,...,...,...
149,,Female,2014-08-17,2021-04-04 14:00:00,86230,8.578,True,Distribution
157,,Female,2005-07-27,2021-04-04 08:32:00,79536,14.443,True,Product
269,,Female,1992-08-02,2021-04-04 20:35:00,145316,18.517,True,Human Resources
329,,Male,2010-01-27,2021-04-04 02:57:00,87760,14.987,True,Engineering


--------

# 10) The `.unique()` and `.nunique()` Method

In [118]:
df = pd.read_csv('Data/employees.csv', parse_dates=['Start Date', 'Last Login Time'])

df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df['Team'] = df['Team'].astype('category')

df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-04-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-04-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-04-04 11:17:00,130590,11.858,False,Finance


### get the unique of columns

In [126]:
df['Gender'].unique()

df['Team'].unique()

['Marketing', NaN, 'Finance', 'Client Services', 'Legal', ..., 'Engineering', 'Business Development', 'Human Resources', 'Sales', 'Distribution']
Length: 11
Categories (10, object): ['Marketing', 'Finance', 'Client Services', 'Legal', ..., 'Business Development', 'Human Resources', 'Sales', 'Distribution']

### get the Number of Unique values
+ `dropna=True/False`: whether to count NaN or not

In [127]:
len(df['Team'].unique())

11

In [129]:
df['Team'].nunique() # by default it doesn't count NaN values
df['Team'].nunique(dropna=True)

10

In [130]:
df['Team'].nunique(dropna=False)

11