# Exploratory Data Analysis - Data Extraction

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load Data

employees = pd.read_csv("https://raw.githubusercontent.com/renatomaaliw3/public_files/master/Data%20Sets/employees.csv")
employees.head(5)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [None]:
# Check the DataFrame Information

employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [None]:
# Assign Respective Data Types

employees['Start Date'] = pd.to_datetime(employees['Start Date'], format = '%m/%d/%Y')
employees['Last Login Time'] = pd.to_datetime(employees['Last Login Time'], format='%H:%M %p').dt.time
employees['Senior Management'] = employees['Senior Management'].astype(bool)
employees['Gender'] = employees['Gender'].astype('category')

In [None]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   object        
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 49.1+ KB


## Extract the following data based from the given conditions:

In [21]:
# Extract all 'Male' Employees with Salary of greater than 146000 and a part of Senior Management

employees[(employees['Gender'] == 'Male') & (employees['Salary'] > 146000) & employees['Senior Management']]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
83,Shawn,Male,2005-09-23,02:55:00,148115,6.539,True,Finance
175,Willie,Male,1998-02-17,08:20:00,146651,1.451,True,Engineering
452,Scott,Male,2012-11-17,02:47:00,146812,1.965,True,Marketing
536,Clarence,Male,1982-08-26,09:47:00,146589,4.905,True,Business Development
665,Anthony,Male,2013-02-13,01:35:00,146141,3.645,True,Distribution
721,Harold,Male,2010-04-16,05:13:00,147417,11.626,True,Business Development
851,Bobby,Male,1996-08-19,01:16:00,147842,16.158,True,


In [27]:
# Top 5 Highest Paid Employees from the Engineering Department

employees[employees['Team'] == 'Engineering'].sort_values(by = 'Salary', ascending = False).head(5)[['First Name', 'Salary']]

Unnamed: 0,First Name,Salary
541,Ruby,147362
983,John,146907
175,Willie,146651
808,Julie,145357
797,Rose,145001


In [31]:
# First Name is Robert who work in Client Services OR Start Date after 2016-06-01

is_robert = employees["First Name"] == "Robert"
is_in_client_services = employees["Team"] == "Client Services"
start_date_after_june_2016 = employees["Start Date"] > "2016-06-01"

employees[(is_robert & is_in_client_services) | start_date_after_june_2016]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,07:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,12:29:00,140002,19.49,True,Marketing


In [33]:
# Legal Team or Sales Team or Product Team (using .isin)

target_teams = employees['Team'].isin(['Legal', 'Sales', 'Product'])
employees[target_teams]

In [34]:
# Employees having no Team

employees[employees['Team'].isnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
23,,Male,2012-06-14,04:19:00,125792,5.042,True,
32,,Male,1998-08-21,02:27:00,122340,6.417,True,
91,James,,2005-01-26,11:00:00,128771,8.309,False,
109,Christopher,Male,2000-04-22,10:15:00,37919,11.449,False,
139,,Female,1990-10-03,01:08:00,132373,10.527,True,
199,Jonathan,Male,2009-07-17,08:15:00,130581,16.736,True,
258,Michael,Male,2002-01-24,03:04:00,43586,12.659,False,
290,Jeremy,Male,1988-06-14,06:20:00,129460,13.657,True,


In [35]:
# Employees with No First Name and No Team

employees[employees['Team'].isnull() & employees['First Name'].isnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
23,,Male,2012-06-14,04:19:00,125792,5.042,True,
32,,Male,1998-08-21,02:27:00,122340,6.417,True,
139,,Female,1990-10-03,01:08:00,132373,10.527,True,
382,,Female,1996-04-18,03:57:00,107024,12.182,True,
567,,Female,1980-04-01,08:04:00,48141,12.605,True,
626,,Female,1997-04-13,08:03:00,131755,2.93,True,
774,,Female,2000-06-18,07:36:00,106428,10.867,True,
826,,,1988-08-01,01:35:00,87103,5.665,True,
951,,Female,2010-09-14,05:19:00,143638,9.662,True,


In [39]:
# Employees with Last Login Time between 8:30 to 12:

import datetime as dt

employees[employees["Last Login Time"].between(dt.time(12, 40), dt.time(12, 45))]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
76,Margaret,Female,1988-09-10,12:42:00,131604,7.353,True,Distribution
89,Janice,Female,2016-03-12,12:40:00,51082,11.955,False,Legal
672,Robert,Male,1982-11-05,12:41:00,111580,10.982,False,Human Resources
730,Nicole,Female,2009-04-26,12:40:00,66047,18.674,True,Marketing
766,Justin,Male,1991-07-25,12:45:00,112975,9.699,False,Human Resources
931,Harold,Male,2012-06-23,12:40:00,140444,3.771,False,Distribution
934,Samuel,Male,1997-08-07,12:40:00,43694,3.787,True,Engineering
945,Gerald,,1989-04-15,12:44:00,93712,17.426,True,Distribution
962,Jonathan,Male,2013-08-21,12:45:00,121797,16.923,False,Product


In [49]:
# Unique First Name without NaN

employees['First Name'].dropna().unique()

array(['Douglas', 'Thomas', 'Maria', 'Jerry', 'Larry', 'Dennis', 'Ruby',
       'Angela', 'Frances', 'Louise', 'Julie', 'Brandon', 'Gary',
       'Kimberly', 'Lillian', 'Jeremy', 'Shawn', 'Diana', 'Donna', 'Lois',
       'Matthew', 'Joshua', 'John', 'Craig', 'Scott', 'Terry', 'Benjamin',
       'Christina', 'Joyce', 'Jean', 'Theresa', 'Rachel', 'Linda',
       'Stephanie', 'Michael', 'Christine', 'Beverly', 'Marilyn',
       'Cynthia', 'Roger', 'Bruce', 'Kathy', 'Clarence', 'Chris', 'Nancy',
       'Todd', 'Alan', 'Sara', 'Karen', 'Carl', 'Henry', 'Irene', 'Paula',
       'Denise', 'Kathleen', 'Steve', 'Jose', 'Johnny', 'Bobby', 'Bonnie',
       'Margaret', 'Charles', 'Robin', 'Gerald', 'Christopher', 'Steven',
       'Doris', 'Annie', 'Janice', 'James', 'Virginia', 'Harry',
       'Heather', 'Laura', 'Tina', 'Harold', 'Melissa', 'Aaron', 'Jack',
       'Phyllis', 'Paul', 'Russell', 'Shirley', 'Willie', 'Ashley',
       'Pamela', 'Andrea', 'Peter', 'Helen', 'Patricia', 'Michelle',
    

In [47]:
# Number of Unique First Name without NaN

employees['First Name'].nunique(dropna = True)

200