In [1]:
#import libraries
import pandas as pd

In [3]:
employees = pd.read_csv('../data/pandas/employees.csv')
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [4]:
#use info() to look at the dataset
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null object
Start Date           1000 non-null object
Last Login Time      1000 non-null object
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    933 non-null object
Team                 957 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


looking at .info() reveals:
* that we must have a lot of missing gender values. We can do something with the null values.
* We can also categorize the 'gender' column
* The "Start Date" field is being treated as a string and not as a datetime object.

In [63]:
#convert the start date to a datetime type. Notice there's no "inplace" attribute here, so to make the change permanent, I have to use assignment
employees['Start Date'] = pd.to_datetime(employees['Start Date'])
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-10-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-10-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services


In [11]:
#convert the login time to a datetime field, too.
employees['Last Login Time'] = pd.to_datetime(employees['Last Login Time'])
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-10-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-10-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services


In [12]:
#convert management type to Boolean
employees['Senior Management'] = employees['Senior Management'].astype('bool')
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-10-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-10-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services


In [14]:
#change gender to a category type (much better for memory management)
employees['Gender'] = employees['Gender'].astype('category')
employees.head()
employees.info() #hug memory savings b/c of the categorical change

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null object
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


In [15]:
# convert "Team" column to a category type for further memory management
employees['Team'] = employees['Team'].astype('category')
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null category
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


# Filter based on a condition
this will extract rows that meet a specific condition

In [19]:
# pull out all the males in the company into a specific Series
#use a condition to see if we are dealing with a male or a female.  Returns a boolean
employees[employees['Gender'] == "Male"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-10-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2018-10-04 01:35:00,115163,10.125,False,Legal
12,Brandon,Male,1980-12-01,2018-10-04 01:08:00,112807,17.492,True,Human Resources
13,Gary,Male,2008-01-27,2018-10-04 23:40:00,109831,5.831,False,Sales
16,Jeremy,Male,2010-09-21,2018-10-04 05:56:00,90370,7.369,False,Human Resources
17,Shawn,Male,1986-12-07,2018-10-04 19:45:00,111737,6.414,False,Product
21,Matthew,Male,1995-09-05,2018-10-04 02:12:00,100612,13.645,False,Marketing


In [23]:
#get everyone on the finance team
employees[employees['Team'] == 'Finance']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2018-10-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.340,True,Finance
7,,Female,2015-07-20,2018-10-04 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2018-10-04 07:13:00,41426,14.543,True,Finance
46,Bruce,Male,2009-11-28,2018-10-04 22:47:00,114796,6.796,False,Finance
53,Alan,,2014-03-03,2018-10-04 13:28:00,40341,17.578,True,Finance
56,Carl,Male,2006-05-03,2018-10-04 17:55:00,130276,16.084,True,Finance
67,Rachel,Female,1999-08-16,2018-10-04 06:53:00,51178,9.735,True,Finance
68,Jose,Male,2004-10-30,2018-10-04 13:39:00,84834,14.330,True,Finance
69,Irene,,2015-07-14,2018-10-04 16:31:00,100863,4.382,True,Finance


In [26]:
#create a filter using a variable
mask = employees['Bonus %'] <= 5
employees[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.170,True,
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2018-10-04 06:09:00,59414,1.256,False,Product
19,Donna,Female,2010-07-22,2018-10-04 03:48:00,81014,1.894,False,Product
20,Lois,,1995-04-22,2018-10-04 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2018-10-04 11:25:00,99283,2.665,True,Distribution
49,Chris,,1980-01-24,2018-10-04 12:13:00,113590,3.055,False,Sales
52,Todd,Male,1990-02-18,2018-10-04 02:41:00,49339,1.695,True,Human Resources
58,Theresa,Female,2010-04-11,2018-10-04 07:18:00,72670,1.481,True,Engineering
60,Paula,,2005-11-23,2018-10-04 14:01:00,48866,4.271,False,Distribution


In [35]:
#use comparison with dates.  Get all employees who started after the year 2010
employees[employees['Start Date'] >= '2010-01-01']

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,2018-10-04 10:43:00,45906,11.598,True,Finance
15,Lillian,Female,2016-06-05,2018-10-04 06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,2018-10-04 05:56:00,90370,7.369,False,Human Resources
19,Donna,Female,2010-07-22,2018-10-04 03:48:00,81014,1.894,False,Product
22,Joshua,,2012-03-08,2018-10-04 01:58:00,90816,18.816,True,Client Services
23,,Male,2012-06-14,2018-10-04 16:19:00,125792,5.042,True,
25,,Male,2012-10-08,2018-10-04 01:12:00,37076,18.576,True,Client Services
39,,Male,2016-01-29,2018-10-04 02:33:00,122173,7.797,True,Client Services
41,Christine,,2015-06-28,2018-10-04 01:08:00,66582,11.308,True,Business Development
51,,,2011-12-17,2018-10-04 08:29:00,41126,14.009,True,Sales


# Filtering df based on multiple conditions

In [44]:
#it's easiest to manage the conditions by storing them separately and then filtering based on a list
genderMask = employees['Gender'] == 'Female'
salaryMask = employees['Salary'] >= 10
managementMask = employees['Senior Management']
bonusMask = employees['Bonus %'] >= 15
teamMask = employees['Team'] == "Marketing"
employees[genderMask & salaryMask & managementMask & bonusMask & teamMask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
62,,Female,2007-06-12,2018-10-04 17:25:00,58112,19.414,True,Marketing
98,Tina,Female,2016-06-16,2018-10-04 19:47:00,100705,16.961,True,Marketing
331,Evelyn,Female,1983-09-03,2018-10-04 13:58:00,36759,17.269,True,Marketing
446,Cheryl,Female,1994-08-16,2018-10-04 08:33:00,67150,15.85,True,Marketing
656,Lisa,Female,1982-02-09,2018-10-04 18:44:00,113592,17.108,True,Marketing
730,Nicole,Female,2009-04-26,2018-10-04 00:40:00,66047,18.674,True,Marketing
813,Evelyn,Female,2002-02-10,2018-10-04 04:44:00,123621,19.767,True,Marketing


In [46]:
#use both & and | in a single operation
nameMask = employees['First Name'] == 'Robert'
teamMask = employees['Team'] == 'Client Services'
startMask = employees['Start Date'] > '2015-01-01'
employees[(nameMask & teamMask) | startMask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,2018-10-04 10:43:00,45906,11.598,True,Finance
15,Lillian,Female,2016-06-05,2018-10-04 06:09:00,59414,1.256,False,Product
39,,Male,2016-01-29,2018-10-04 02:33:00,122173,7.797,True,Client Services
41,Christine,,2015-06-28,2018-10-04 01:08:00,66582,11.308,True,Business Development
69,Irene,,2015-07-14,2018-10-04 16:31:00,100863,4.382,True,Finance
89,Janice,Female,2016-03-12,2018-10-04 00:40:00,51082,11.955,False,Legal
98,Tina,Female,2016-06-16,2018-10-04 19:47:00,100705,16.961,True,Marketing
121,Kathleen,,2016-05-09,2018-10-04 08:55:00,119735,18.74,False,Product
143,Teresa,,2016-01-28,2018-10-04 10:55:00,140013,8.689,True,Engineering
225,Harry,Male,2015-10-01,2018-10-04 19:47:00,64579,15.266,True,Sales


# .isin() method
used for checking multiple values in a single series

In [50]:
#extract the series you want
mask = employees['Team'].isin(['Legal','Sales','Product'])
employees[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2018-10-04 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2018-10-04 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2018-10-04 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2018-10-04 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2018-10-04 06:09:00,59414,1.256,False,Product
17,Shawn,Male,1986-12-07,2018-10-04 19:45:00,111737,6.414,False,Product
19,Donna,Female,2010-07-22,2018-10-04 03:48:00,81014,1.894,False,Product
20,Lois,,1995-04-22,2018-10-04 19:18:00,64714,4.934,True,Legal
27,Scott,,1991-07-11,2018-10-04 18:58:00,122367,5.218,False,Legal
29,Benjamin,Male,2005-01-26,2018-10-04 22:06:00,79529,7.008,True,Legal


# .isnull() and .notnull() methods
check for the  null status of each row and returns a boolean

In [54]:
#get the rows from the team column where the 'Team' value is null
mask = employees['Team'].isnull()
employees[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2018-10-04 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2018-10-04 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2018-10-04 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2018-10-04 23:00:00,128771,8.309,False,
109,Christopher,Male,2000-04-22,2018-10-04 10:15:00,37919,11.449,False,
139,,Female,1990-10-03,2018-10-04 01:08:00,132373,10.527,True,
199,Jonathan,Male,2009-07-17,2018-10-04 08:15:00,130581,16.736,True,
258,Michael,Male,2002-01-24,2018-10-04 03:04:00,43586,12.659,False,
290,Jeremy,Male,1988-06-14,2018-10-04 18:20:00,129460,13.657,True,


In [56]:
#check for not null rows in the Gender column
employees[employees['Gender'].notnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-10-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,2018-10-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2018-10-04 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2018-10-04 16:20:00,65476,10.012,True,Product
7,,Female,2015-07-20,2018-10-04 10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,2018-10-04 06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,2018-10-04 06:51:00,139852,7.524,True,Business Development


# .between() method
looking for values that fall within a range

In [58]:
# pull out all of the employees with a salary between a specific range
employees[employees['Salary'].between(80000,90000)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
19,Donna,Female,2010-07-22,2018-10-04 03:48:00,81014,1.894,False,Product
31,Joyce,,2005-02-20,2018-10-04 14:40:00,88657,12.752,False,Product
35,Theresa,Female,2006-10-10,2018-10-04 01:12:00,85182,16.675,False,Sales
45,Roger,Male,1980-04-17,2018-10-04 11:32:00,88010,13.886,True,Sales
54,Sara,Female,2007-08-15,2018-10-04 09:23:00,83677,8.999,False,Engineering
68,Jose,Male,2004-10-30,2018-10-04 13:39:00,84834,14.330,True,Finance
70,Todd,,2003-06-10,2018-10-04 14:26:00,84692,6.617,False,Client Services
84,Doris,Female,2004-08-20,2018-10-04 05:51:00,83072,7.511,False,Finance
104,John,Male,1989-12-23,2018-10-04 07:01:00,80740,19.305,False,Marketing
120,Peter,Male,1994-11-17,2018-10-04 18:15:00,84885,15.402,False,Business Development


In [60]:
#look for employees with a bonus between 3 and 5%
employees[employees['Bonus %'].between(3,3.5)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
49,Chris,,1980-01-24,2018-10-04 12:13:00,113590,3.055,False,Sales
106,Paul,Male,1993-08-04,2018-10-04 19:25:00,42146,3.046,False,Legal
197,Carolyn,Female,2012-11-06,2018-10-04 03:51:00,69268,3.031,False,Client Services
222,Jason,Male,1999-10-17,2018-10-04 22:09:00,78417,3.067,False,Finance
265,Roy,Male,2004-09-23,2018-10-04 13:31:00,101941,3.45,False,Client Services
296,Jeremy,,1996-11-10,2018-10-04 13:49:00,55394,3.18,True,Sales
323,Linda,Female,1990-12-16,2018-10-04 02:20:00,115658,3.041,True,Sales
353,,Male,1997-04-22,2018-10-04 21:36:00,65078,3.095,True,Marketing
374,,Female,2006-12-31,2018-10-04 23:02:00,81444,3.171,True,Distribution
419,Dorothy,Female,2013-11-05,2018-10-04 22:50:00,140136,3.12,True,Business Development


In [67]:
# use .between() with dates and times
#first, ensure the 'start date' is a datetime object
employees['Start Date'] = pd.to_datetime(employees['Start Date'])
employees.info()
employees[employees['Start Date'].between('2010-01-01','2010-010-31')]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null category
dtypes: bool(1), category(2), datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 42.6+ KB


ValueError: could not convert string to Timestamp

In [71]:
#use .between() with login times
employees[employees['Last Login Time'].between('04:30am','9:00am')]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.170,True,
8,Angela,Female,2005-11-22,2018-10-04 06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,2018-10-04 06:51:00,139852,7.524,True,Business Development
14,Kimberly,Female,1999-01-14,2018-10-04 07:13:00,41426,14.543,True,Finance
15,Lillian,Female,2016-06-05,2018-10-04 06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,2018-10-04 05:56:00,90370,7.369,False,Human Resources
26,Craig,Male,2000-02-27,2018-10-04 07:45:00,37598,7.757,True,Marketing
47,Kathy,Female,2005-06-22,2018-10-04 04:51:00,66820,9.000,True,Client Services
48,Clarence,Male,1996-03-26,2018-10-04 05:57:00,93581,6.083,True,Business Development
50,Nancy,Female,2000-09-23,2018-10-04 08:05:00,94976,13.830,True,Engineering


# .duplicated() method
extracts duplicates from a dataframe

In [72]:
# check for duplicates in the first name
employees.sort_values('First Name', inplace=True)
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-10-04 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2018-10-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2018-10-04 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2018-10-04 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2018-10-04 01:45:00,95327,15.12,False,Distribution


In [78]:
#remove duplicate rows, but keep one of them
employees[employees['First Name'].duplicated(keep="last")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-10-04 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2018-10-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2018-10-04 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2018-10-04 01:45:00,95327,15.120,False,Distribution
141,Adam,Male,1990-12-24,2018-10-04 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2018-10-04 11:59:00,71276,5.027,True,Human Resources
300,Alan,Male,1988-06-26,2018-10-04 03:54:00,111786,3.592,True,Engineering
53,Alan,,2014-03-03,2018-10-04 13:28:00,40341,17.578,True,Finance
372,Albert,Male,1997-02-01,2018-10-04 16:20:00,67827,19.717,True,Engineering
458,Albert,Male,2007-09-30,2018-10-04 17:34:00,102626,15.843,False,Finance


In [80]:
#get rid of all duplicates in total
employees[employees['First Name'].duplicated(keep = False)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2018-10-04 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2018-10-04 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2018-10-04 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2018-10-04 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2018-10-04 01:45:00,95327,15.120,False,Distribution
141,Adam,Male,1990-12-24,2018-10-04 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2018-10-04 11:59:00,71276,5.027,True,Human Resources
538,Adam,Male,2010-10-08,2018-10-04 21:53:00,45181,3.491,False,Human Resources
300,Alan,Male,1988-06-26,2018-10-04 03:54:00,111786,3.592,True,Engineering
53,Alan,,2014-03-03,2018-10-04 13:28:00,40341,17.578,True,Finance


In [81]:
#get ONLY unique rows.  Use the tilde symbol to "reverse" a Boolean series
employees[~employees['First Name'].duplicated(keep=False)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2018-10-04 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2018-10-04 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2018-10-04 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2018-10-04 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2018-10-04 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2018-10-04 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2018-10-04 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2018-10-04 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2018-10-04 10:30:00,132839,17.463,True,Client Services


# .drop_duplicates() method
works on a dataframe instead of on a series

In [83]:
#see how many rows there currently are
len(employees)

1000

In [84]:
len(employees.drop_duplicates())

1000

notice that the length is the same before and after.  That's because .drop_duplicates() is checking for duplicates across all rows and cols

In [89]:
# use the subseet attribute to limit to a few different fields for checking duplicates
employees.drop_duplicates(subset = ['First Name', 'Gender'],keep=False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
937,Aaron,,1986-01-22,2018-10-04 19:39:00,63126,18.424,False,Client Services
53,Alan,,2014-03-03,2018-10-04 13:28:00,40341,17.578,True,Finance
765,Alice,,1995-04-23,2018-10-04 06:35:00,148339,11.479,True,Finance
8,Angela,Female,2005-11-22,2018-10-04 06:29:00,95570,18.523,True,Engineering
645,Anna,,1985-03-13,2018-10-04 09:19:00,45418,10.162,False,Marketing
543,Anna,Female,2008-04-15,2018-10-04 14:34:00,117293,2.366,False,Client Services
792,Anne,,1996-04-18,2018-10-04 23:57:00,122762,9.564,False,Distribution
86,Annie,,2007-09-29,2018-10-04 00:11:00,103495,17.290,True,Business Development
961,Antonio,,1989-06-18,2018-10-04 21:37:00,103050,3.050,False,Legal
482,Beverly,,1995-05-03,2018-10-04 05:49:00,104815,3.380,False,Product


# .unique() and .nunique() methods
find the unique values in a series

In [90]:
employees['Gender'].unique()

[Male, NaN, Female]
Categories (2, object): [Male, Female]

In [92]:
#count the # of unique values.  Not that this does NOT include NaN values
employees['Team'].nunique()

10