In [1]:
import calendar
import pendulum
import numpy as np
import pandas as pd
from pandas.tseries.offsets import MonthEnd
from datetime import datetime as dt

In [2]:
path = '../../../data/raw/employees.csv'

In [3]:
df = pd.read_csv(path)

In [4]:
df

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1
5,M-003,Elyra,Marketing,7500,O,1,BA,2017-04-10 9:30:00,,1
6,M-004,Sophia,Marketing,8000,F,1,MA,2017-04-10 9:30:00,,1
7,M-005,Mia,Marketing,5000,F,1,BA,2017-04-10 9:30:00,,1
8,M-006,Ava,Marketing,4500,F,0,HS,2018-01-01 9:15:00,,1
9,M-007,Olivia,Marketing,6500,F,1,BA,2019-02-10 10:30:00,,1


In [5]:
# count the number of employee in department
df['dept'].value_counts()

Sale                      20
Marketing                  8
Human Resource             6
Security                   6
Customer Service           4
Finance and Accounting     4
Market Research            3
IT                         2
Human Resuorce             1
Name: dept, dtype: int64

In [6]:
# Get the salary summary
df['salary'].describe()

count       54.000000
mean      5750.000000
std       2071.254291
min       3000.000000
25%       4500.000000
50%       5000.000000
75%       7500.000000
max      11500.000000
Name: salary, dtype: float64

In [7]:
# End of month
# For salary calculation for first month
df['end_of_mo'] = pd.to_datetime(df['hired_date'], format="%Y-%m") + MonthEnd(1)

In [8]:
df.head()

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28 09:30:00
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30 09:30:00
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31 10:30:00
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30 09:30:00
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30 09:30:00


In [9]:
# remove the extra value in timestamp
df['hired_date_dt'] = pd.to_datetime(df['hired_date'], format='%Y-%m-%d').dt.date

In [10]:
df['end_of_mo'] = df['end_of_mo'].dt.date

In [11]:
df.head()

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo,hired_date_dt
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10


In [12]:
# working day for first month
# it's a pay day :p
df['working_day'] = df['end_of_mo'] - df['hired_date_dt']

In [13]:
df['working_day'] = df['working_day'].dt.days

In [14]:
df.head()

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo,hired_date_dt,working_day
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10,18
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10,20
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01,30
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20


In [15]:
def business_days(yr, mo):
    weekday_count = 0
    cal = calendar.Calendar()
    for week in cal.monthdayscalendar(int(yr), int(mo)):
        for i, day in enumerate(week):
            # not this month's day or a weekend
            if day == 0 or i > 5:
                continue
            # or some other control if desired...
        weekday_count += 1
    return weekday_count

In [16]:
df['weekend'] = df['end_of_mo'].apply(lambda x: business_days(x.year, x.month))

In [17]:
df.head()

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo,hired_date_dt,working_day,weekend
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10,18,5
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10,20,6
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01,30,5
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5


In [18]:
df['business_days'] = df['working_day'] - df['weekend']

In [19]:
df

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo,hired_date_dt,working_day,weekend,business_days
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10,18,5,13
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10,20,6,14
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01,30,5,25
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
5,M-003,Elyra,Marketing,7500,O,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
6,M-004,Sophia,Marketing,8000,F,1,MA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
7,M-005,Mia,Marketing,5000,F,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
8,M-006,Ava,Marketing,4500,F,0,HS,2018-01-01 9:15:00,,1,2018-01-31,2018-01-01,30,5,25
9,M-007,Olivia,Marketing,6500,F,1,BA,2019-02-10 10:30:00,,1,2019-02-28,2019-02-10,18,5,13


In [20]:
# calcualte the business hour based on 9:00am to 4:00 pm without OT
# same rule for all employees, some will do much longer
# some rules are not suitable for employee
df['cnst_working_hours'] = df['business_days']*8

In [21]:
df.head()

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo,hired_date_dt,working_day,weekend,business_days,cnst_working_hours
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10,18,5,13,104
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10,20,6,14,112
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01,30,5,25,200
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15,120
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15,120


In [22]:
# remove some columns from dataframe
exclude_cols = ['name', 'salary', 'gender', 'education', 'cnst_working_hours']
df.loc[:, ~df.columns.isin(exclude_cols)]

Unnamed: 0,id,dept,marital_status,hired_date,resign_date,active,end_of_mo,hired_date_dt,working_day,weekend,business_days
0,MR-001,Market Research,1,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10,18,5,13
1,MR-002,Market Research,0,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10,20,6,14
2,MR-003,Market Research,1,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01,30,5,25
3,M-001,Marketing,0,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
4,M-002,Marketing,1,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
5,M-003,Marketing,1,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
6,M-004,Marketing,1,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
7,M-005,Marketing,1,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15
8,M-006,Marketing,0,2018-01-01 9:15:00,,1,2018-01-31,2018-01-01,30,5,25
9,M-007,Marketing,1,2019-02-10 10:30:00,,1,2019-02-28,2019-02-10,18,5,13


In [23]:
# select columns from dataframe
selected_cols = ['id', 'dept', 'marital_status', 'active']
df.loc[:, df.columns.isin(selected_cols)]

Unnamed: 0,id,dept,marital_status,active
0,MR-001,Market Research,1,1
1,MR-002,Market Research,0,1
2,MR-003,Market Research,1,1
3,M-001,Marketing,0,1
4,M-002,Marketing,1,1
5,M-003,Marketing,1,1
6,M-004,Marketing,1,1
7,M-005,Marketing,1,1
8,M-006,Marketing,0,1
9,M-007,Marketing,1,1


In [24]:
# count the married and unmarriage
df['marital_status'].value_counts()

1    36
0    18
Name: marital_status, dtype: int64

In [25]:
# calculate servcie year
def endofmoth():
    today = pendulum.today()
    lmo = today.subtract(days=today.day)
    return dt.strftime(lmo, '%Y-%m-%d')

In [26]:
df['lsrv_date'] = endofmoth()

In [27]:
df['lsrv_date'] = pd.to_datetime(df['lsrv_date'], format='%Y-%m-%d')

In [28]:
df['service_with_business'] = pd.to_datetime(df['lsrv_date']) - pd.to_datetime(df['hired_date_dt'])

In [29]:
df.head()

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo,hired_date_dt,working_day,weekend,business_days,cnst_working_hours,lsrv_date,service_with_business
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10,18,5,13,104,2021-11-30,1754 days
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10,20,6,14,112,2021-11-30,1330 days
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01,30,5,25,200,2021-11-30,1064 days
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15,120,2021-11-30,1695 days
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15,120,2021-11-30,1695 days


In [30]:
df['service_with_business(yr)'] = df['service_with_business']/np.timedelta64(1, 'Y')

In [31]:
df.head()

Unnamed: 0,id,name,dept,salary,gender,marital_status,education,hired_date,resign_date,active,end_of_mo,hired_date_dt,working_day,weekend,business_days,cnst_working_hours,lsrv_date,service_with_business,service_with_business(yr)
0,MR-001,Alice,Market Research,7500,F,1,Bsc,2017-02-10 9:30:00,,1,2017-02-28,2017-02-10,18,5,13,104,2021-11-30,1754 days,4.802289
1,MR-002,Bob,Market Research,7500,M,0,Bsc,2018-04-10 9:30:00,,1,2018-04-30,2018-04-10,20,6,14,112,2021-11-30,1330 days,3.641416
2,MR-003,John,Market Research,9500,M,1,Msc,2019-01-01 10:30:00,,1,2019-01-31,2019-01-01,30,5,25,200,2021-11-30,1064 days,2.913133
3,M-001,Jonathan,Marketing,4500,O,0,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15,120,2021-11-30,1695 days,4.640752
4,M-002,Ethan,Marketing,6000,M,1,BA,2017-04-10 9:30:00,,1,2017-04-30,2017-04-10,20,5,15,120,2021-11-30,1695 days,4.640752


In [32]:
# list the education of employee
np.unique(df['education'].to_list())

array(['BA', 'Bsc', 'HS', 'MA', 'MS', 'Ms', 'Msc'], dtype='<U3')