# Dataframe II Filtering Data

## Memory optimization

In [138]:
import pandas as pd 

In [139]:
employees = pd.read_csv("employees.csv")

In [140]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [141]:
employees

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.170,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,11/23/2014,6:09 AM,132483,16.655,False,Distribution
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [142]:
employees["Start Date"] = pd.to_datetime(employees["Start Date"], format = "%m/%d/%Y")

In [143]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    object        
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   object        
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  933 non-null    object        
 7   Team               957 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 62.6+ KB


In [144]:
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time

In [145]:
employees["Senior Management"] = employees["Senior Management"].astype(bool)

In [146]:
employees["Gender"] = employees["Gender"].astype("category")

In [147]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")

In [148]:
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


## Filtering rows on a condition 

In [149]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [150]:
# extract male employees 
employees[employees["Gender"] == "Male"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,05:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


In [151]:
employees[employees["Team"] == "Finance"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,10:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,04:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,08:35:00,112769,11.625,True,Finance


In [152]:
on_finance = employees["Team"] == "Finance"
employees[on_finance]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,10:47:00,114796,6.796,False,Finance
...,...,...,...,...,...,...,...,...
907,Elizabeth,Female,1998-07-27,11:12:00,137144,10.081,False,Finance
954,Joe,Male,1980-01-19,04:06:00,119667,1.148,True,Finance
987,Gloria,Female,2014-12-08,05:08:00,136709,10.331,True,Finance
992,Anthony,Male,2011-10-16,08:35:00,112769,11.625,True,Finance


In [153]:
employees["Senior Management"] == True

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996    False
997    False
998    False
999     True
Name: Senior Management, Length: 1000, dtype: bool

In [154]:
# all employees who have salary > 110,000
employees[employees["Salary"] > 110000]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,01:08:00,112807,17.492,True,Human Resources
...,...,...,...,...,...,...,...,...
987,Gloria,Female,2014-12-08,05:08:00,136709,10.331,True,Finance
991,Rose,Female,2002-08-25,05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution


In [155]:
# date time 
employees[employees["Start Date"] < "1985-01-01"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,06:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,08:49:00,57427,9.557,True,Client Services
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,10:35:00,146907,11.738,False,Engineering
985,Stephen,,1983-07-10,08:10:00,85668,1.909,False,Legal
986,Donna,Female,1982-11-26,07:04:00,82871,17.999,False,Marketing


In [156]:
import datetime as dt

In [157]:
dt.time(12,0,0)

datetime.time(12, 0)

In [158]:
employees["Last Login Time"] < dt.time(12,0,0)

0      False
1       True
2       True
3       True
4       True
       ...  
995     True
996     True
997    False
998     True
999     True
Name: Last Login Time, Length: 1000, dtype: bool

In [159]:
employees[employees["Last Login Time"] < dt.time(12,0,0)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,05:47:00,98874,4.479,True,Marketing
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


## Filter on more than one condition, AND

In [160]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [161]:
# find female employees who work in marketing 
is_female = employees["Gender"] == "Female"

in_marketing = employees["Team"] == "Marketing"

In [162]:
employees[is_female & in_marketing]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
43,Marilyn,Female,1980-12-07,03:16:00,73524,5.207,True,Marketing
62,,Female,2007-06-12,05:25:00,58112,19.414,True,Marketing
98,Tina,Female,2016-06-16,07:47:00,100705,16.961,True,Marketing
140,Shirley,Female,1981-02-28,01:23:00,113850,1.854,False,Marketing
158,Norma,Female,1999-02-28,08:45:00,114412,8.756,True,Marketing
201,Kimberly,Female,1997-07-15,05:57:00,36643,7.953,False,Marketing
220,,Female,1991-06-17,12:49:00,71945,5.56,True,Marketing
305,Margaret,Female,1993-02-06,01:05:00,125220,3.733,False,Marketing
319,Jacqueline,Female,1981-11-25,03:01:00,145988,18.243,False,Marketing
331,Evelyn,Female,1983-09-03,01:58:00,36759,17.269,True,Marketing


In [163]:
# females in marketing who make over 100k 
is_female = employees["Gender"] == "Female"

in_marketing = employees["Team"] == "Marketing"

over_100 = employees["Salary"] > 100000

employees[is_female & in_marketing & over_100]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
98,Tina,Female,2016-06-16,07:47:00,100705,16.961,True,Marketing
140,Shirley,Female,1981-02-28,01:23:00,113850,1.854,False,Marketing
158,Norma,Female,1999-02-28,08:45:00,114412,8.756,True,Marketing
305,Margaret,Female,1993-02-06,01:05:00,125220,3.733,False,Marketing
319,Jacqueline,Female,1981-11-25,03:01:00,145988,18.243,False,Marketing
379,,Female,2002-09-18,12:39:00,118906,4.537,True,Marketing
468,Janice,Female,1997-06-28,01:48:00,136032,10.696,True,Marketing
490,Judith,Female,2007-11-23,01:22:00,117055,7.461,False,Marketing
531,Virginia,Female,2010-05-02,09:10:00,123649,10.154,True,Marketing
585,Shirley,Female,1988-04-16,11:09:00,132156,2.754,False,Marketing


## Filter by more than one condition, OR

In [164]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [165]:
# filter for employees that are male or in finance 
male = employees["Gender"] == "Male"

finance = employees["Team"] == "Finance"

In [166]:
employees[male | finance]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,05:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


In [167]:
# employees who are senior management or started before january 1st, 1990 

senior_management = employees["Senior Management"] == True 
started = employees["Start Date"] < "1990-01-01"

employees[senior_management | started]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,03:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,05:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance


In [168]:
# employees with name Robert who work in client services or have start date after 2016-06-01
is_robert = employees["First Name"] == "Robert"
is_client_services = employees["Team"] == "Client Services"

start_date = employees["Start Date"] > "2016-06-01"

employees[(is_robert & is_client_services)| start_date]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,07:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,12:29:00,140002,19.49,True,Marketing


## The isin method 

In [169]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [170]:
# employees who are either on legal team or the sales team or product team 
legal = employees["Team"] == "Legal"
sales = employees["Team"] == "Sales"
product = employees["Team"] == "Product"

employees[legal | sales | product]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,03:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,11:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,05:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,08:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,04:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product


In [171]:
# shortened version 

employees[employees["Team"].isin(["Legal", "Sales", "Product"])]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,03:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,11:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,05:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,08:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,04:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product


## The isnull and notnull methods 

In [172]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [173]:
# find the rows where the team value is missing, NaN
employees[employees["Team"].isnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
23,,Male,2012-06-14,04:19:00,125792,5.042,True,
32,,Male,1998-08-21,02:27:00,122340,6.417,True,
91,James,,2005-01-26,11:00:00,128771,8.309,False,
109,Christopher,Male,2000-04-22,10:15:00,37919,11.449,False,
139,,Female,1990-10-03,01:08:00,132373,10.527,True,
199,Jonathan,Male,2009-07-17,08:15:00,130581,16.736,True,
258,Michael,Male,2002-01-24,03:04:00,43586,12.659,False,
290,Jeremy,Male,1988-06-14,06:20:00,129460,13.657,True,


In [174]:
employees[employees["Team"].notnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


In [175]:
# first name is NaN and team is valid 
employees[employees["First Name"].isnull() & employees["Team"].notnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
25,,Male,2012-10-08,01:12:00,37076,18.576,True,Client Services
39,,Male,2016-01-29,02:33:00,122173,7.797,True,Client Services
51,,,2011-12-17,08:29:00,41126,14.009,True,Sales
62,,Female,2007-06-12,05:25:00,58112,19.414,True,Marketing
116,,Male,1991-06-22,08:58:00,76189,18.988,True,Legal
149,,Female,2014-08-17,02:00:00,86230,8.578,True,Distribution
157,,Female,2005-07-27,08:32:00,79536,14.443,True,Product
165,,Female,2014-03-23,01:28:00,59148,9.061,True,Legal
166,,Female,1991-07-09,06:52:00,42341,7.014,True,Sales


## The between method 

In [176]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [177]:
# find employees with a salary between 60,000 and 70,000
employees[employees["Salary"].between(60000, 70000)]

employees[employees["Start Date"].between("1991-01-01","1992-01-01")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,06:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,01:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,08:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,02:24:00,124488,14.837,True,Sales
166,,Female,1991-07-09,06:52:00,42341,7.014,True,Sales
172,Sara,Female,1991-09-23,06:17:00,97058,9.402,False,Finance
220,,Female,1991-06-17,12:49:00,71945,5.56,True,Marketing
245,Victor,Male,1991-04-11,07:44:00,70817,17.138,False,Engineering
277,Brenda,,1991-05-29,06:32:00,82439,19.062,False,Sales


In [178]:
import datetime as dt

In [179]:
employees[employees["Last Login Time"].between(dt.time(8,30), dt.time(12,0))]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
13,Gary,Male,2008-01-27,11:40:00,109831,5.831,False,Sales
18,Diana,Female,1981-10-23,10:27:00,132940,19.082,False,Client Services
...,...,...,...,...,...,...,...,...
977,Sarah,Female,1995-12-04,09:16:00,124566,5.949,False,Product
982,Rose,Female,1982-04-06,10:43:00,91411,8.639,True,Human Resources
983,John,Male,1982-12-23,10:35:00,146907,11.738,False,Engineering
988,Alice,Female,2004-10-05,09:34:00,47638,11.209,False,Human Resources


## The duplicated method

In [180]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [181]:
employees[employees["First Name"].duplicated()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
23,,Male,2012-06-14,04:19:00,125792,5.042,True,
25,,Male,2012-10-08,01:12:00,37076,18.576,True,Client Services
32,,Male,1998-08-21,02:27:00,122340,6.417,True,
34,Jerry,Male,2004-01-10,12:56:00,95734,19.096,False,Client Services
39,,Male,2016-01-29,02:33:00,122173,7.797,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


In [182]:
employees[employees["First Name"].duplicated(keep = "last")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
959,Albert,Male,1992-09-19,02:35:00,45094,5.850,True,Business Development
960,Stephen,Male,1989-10-29,11:34:00,93997,18.093,True,Business Development
970,Alice,Female,1988-09-03,08:54:00,63571,15.397,True,Product
973,Russell,Male,2013-05-10,11:08:00,137359,11.105,False,Business Development


In [183]:
employees[employees["First Name"].duplicated(keep = False)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,04:45:00,60500,11.985,False,Business Development


In [184]:
employees[~employees["First Name"].duplicated(keep = False)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,10:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,03:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,08:48:00,92242,15.407,False,Legal


## The drop_duplicates method

In [185]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [186]:
employees.drop_duplicates(subset = "First Name")

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
712,Martin,,2001-02-06,04:17:00,123963,15.745,True,Engineering
749,Janet,,1986-01-25,05:48:00,85789,9.712,False,Legal
832,Keith,Male,2003-02-12,03:02:00,120672,19.467,False,Legal
855,Phillip,,2003-10-20,11:09:00,89700,2.277,True,


In [187]:
employees.drop_duplicates(["Team", "Senior Management"])

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,04:20:00,65476,10.012,True,Product
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,06:51:00,139852,7.524,True,Business Development
11,Julie,Female,1997-10-26,03:19:00,102508,12.637,True,Legal


## The unique and nunique methods

In [188]:
employees = pd.read_csv("employees.csv", parse_dates = ["Start Date"],date_format = "%m/%d/%Y")
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format = "%H:%M %p").dt.time
employees["Senior Management"] = employees["Senior Management"].astype(bool)
employees["Gender"] = employees["Gender"].astype("category")
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,01:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,04:47:00,101004,1.389,True,Client Services


In [189]:
employees["Gender"].unique()

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [190]:
employees["Team"].unique()

array(['Marketing', nan, 'Finance', 'Client Services', 'Legal', 'Product',
       'Engineering', 'Business Development', 'Human Resources', 'Sales',
       'Distribution'], dtype=object)

In [191]:
employees["Team"].nunique()

10

In [192]:
employees.nunique()

First Name           200
Gender                 2
Start Date           972
Last Login Time      542
Salary               995
Bonus %              971
Senior Management      2
Team                  10
dtype: int64