In [1]:
#import libraries
import pandas as pd

In [3]:
employees = pd.read_csv('../data/pandas/employees.csv')
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [4]:
#use info() to look at the dataset
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null object
Start Date           1000 non-null object
Last Login Time      1000 non-null object
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    933 non-null object
Team                 957 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


looking at .info() reveals:
* that we must have a lot of missing gender values. We can do something with the null values.
* We can also categorize the 'gender' column
* The "Start Date" field is being treated as a string and not as a datetime object.

In [9]:
#convert the start date to a datetime type. Notice there's no "inplace" attribute here, so to make the change permanent, I have to use assignment
employees['Start Date'] = pd.to_datetime(employees['Start Date'])
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,4:47 PM,101004,1.389,True,Client Services


In [11]:
#convert the login time to a datetime field, too.
employees['Last Login Time'] = pd.to_datetime(employees['Last Login Time'])
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-10-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-10-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services


In [12]:
#convert management type to Boolean
employees['Senior Management'] = employees['Senior Management'].astype('bool')
employees.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2018-10-04 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2018-10-04 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2018-10-04 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2018-10-04 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2018-10-04 16:47:00,101004,1.389,True,Client Services


In [14]:
#change gender to a category type (much better for memory management)
employees['Gender'] = employees['Gender'].astype('category')
employees.head()
employees.info() #hug memory savings b/c of the categorical change

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null object
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB
