In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

# Turn off warnings: the 'ignore' action is installed without a category or module
# filter, so it applies to every warning raised from this point on.
# NOTE(review): this also hides deprecation and pandas copy warnings — confirm intended.
import warnings
warnings.filterwarnings('ignore')

# Load Full Dataset

In [None]:
# Directory prefix
dir_prefix = "/kaggle/input/human-resources-data-set/"

In [None]:
# load v13 dataset
df_hrfull = pd.read_csv(dir_prefix + "HRDataset_v13.csv")

In [None]:
df_hrfull.head(10)

In [None]:
df_hrfull.dtypes

## Is there any relationship between who a person works for and their performance score?

In [None]:
from scipy.stats import entropy, mode

In [None]:
# Most frequent PerfScoreID within each ManagerID group (scipy's `mode` is applied per group).
scores_by_manager = df_hrfull.groupby('ManagerID')['PerfScoreID']
scores_by_manager.agg(mode)

In [None]:
def _score_entropy(scores):
    """Entropy of the relative frequencies of the distinct values in `scores`."""
    return entropy(scores.value_counts(normalize=True))

# Spread of PerfScoreID within each ManagerID group: 0 when every score in the group is
# identical, larger when the scores are spread over several values.
df_hrfull.groupby('ManagerID')['PerfScoreID'].agg(_score_entropy)

**Comments**
- Based on the most frequent performance score (mode) of the employees under each manager, there is no strong dependence between who a person works for and their performance score.
- However, based on the variability of performance scores (entropy) within each group of employees under a manager, there is certainly some dependence, particularly in the variance of scores.

# What is the overall diversity profile of the organization?

- Demographic fields available:
    - Gender
    - Race
    - Marital Status

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Gender

In [None]:
# Employment status: number of rows for each distinct EmploymentStatus value
df_hrfull['EmploymentStatus'].value_counts()

In [None]:
# Boolean mask selecting employees whose status is "Active" or "Leave of Absence".
active_mask = df_hrfull['EmploymentStatus'].isin(["Active", "Leave of Absence"])
# Show how many employees fall under each of the two retained statuses.
df_hrfull.loc[active_mask, 'EmploymentStatus'].value_counts()

In [None]:
# Count of selected employees per Sex value; missing values are shown as 'Unknown'.
df_sub = df_hrfull[active_mask].fillna(value={'Sex': 'Unknown'})
sns.countplot(data=df_sub, x="Sex")

### Race

In [None]:
# Count of selected employees per RaceDesc value; missing values are shown as 'Unknown'.
df_sub = df_hrfull[active_mask].fillna(value={'RaceDesc': 'Unknown'})
sns.countplot(data=df_sub, y="RaceDesc")

### Marital Status

In [None]:
# Count of selected employees per MaritalDesc value; missing values are shown as 'Unknown'.
df_sub = df_hrfull[active_mask].fillna(value={'MaritalDesc': 'Unknown'})
sns.countplot(data=df_sub, y="MaritalDesc")

### Diversity Profile

In [None]:
# Count of selected employees per RaceDesc, split by Sex; missing values in either
# column are shown as 'Unknown'.
unknown_fill = {'Sex': 'Unknown', 'RaceDesc': 'Unknown'}
df_sub = df_hrfull[active_mask].fillna(value=unknown_fill)
sns.countplot(data=df_sub, y="RaceDesc", hue="Sex")

## What are our best recruiting sources if we want to ensure a diverse organization?

In [None]:
# Selected employees, with missing Sex / RaceDesc values replaced by 'Unknown'.
df_sub = df_hrfull[active_mask].fillna(value={'Sex': 'Unknown', 'RaceDesc': 'Unknown'})
# Combined demographic label "<Sex>_<RaceDesc>", built from the whitespace-stripped values.
df_sub['Sex_RaceDesc'] = df_sub["Sex"].str.strip().str.cat(df_sub["RaceDesc"].str.strip(), sep="_")

In [None]:
df_sub['Sex_RaceDesc'].value_counts()

In [None]:
# Diversity score = entropy of the "Sex_RaceDesc" label frequencies within each
# RecruitmentSource group. The result is a two-column frame (RecruitmentSource,
# DiversityScore) ordered from the highest score to the lowest.
df_sub = (
    df_sub.groupby('RecruitmentSource')['Sex_RaceDesc']
    .agg(lambda labels: entropy(labels.value_counts(normalize=True)))
    .to_frame("DiversityScore")
    .sort_values(by="DiversityScore", ascending=False)
    .reset_index()
)

**Recruitment sources ranked by Diversity Score**

In [None]:
# Horizontal bar chart: one bar per recruitment source, bar length = diversity score.
fig, ax = plt.subplots(figsize=(10, 8))
ax = sns.barplot(data=df_sub, y="RecruitmentSource", x="DiversityScore", ax=ax)

## Are there areas of the company where pay is not equitable?

In [None]:
# Import datetime to create column for days since hiring.
from datetime import datetime

In [None]:
# Whole days elapsed between each employee's hire date and the moment this cell runs.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (strict format
# inference became the default), so supplying it has no effect beyond a warning.
# NOTE(review): datetime.now() makes this column change from run to run — consider a
# fixed reference date if reproducible numbers are needed.
df_hrfull['DaysSinceHire'] = (datetime.now() - pd.to_datetime(df_hrfull['DateofHire'])).dt.days

In [None]:
df_hrfull['DaysSinceHire'].describe()

## First, let's just see if there is variability in pay by position

In [None]:
# import iqr
from scipy.stats import iqr

In [None]:
# Boolean mask for employees whose PerformanceScore is exactly "Fully Meets".
perf_mask = df_hrfull['PerformanceScore'].eq("Fully Meets")

In [None]:
# Employees passing both masks, grouped by position. The grouping is built once and
# reused for the pay statistics and for the head count.
by_position = df_hrfull.loc[active_mask & perf_mask].groupby(['Position'])
# Median and interquartile range of PayRate per position -> salary_median, salary_iqr.
df_salary_pos = by_position['PayRate'].agg(['median', iqr]).add_prefix('salary_')
# Number of employees behind each position's statistics.
df_salary_pos['numberOfEmployees'] = by_position.size()

In [None]:
df_salary_pos

In [None]:
# Pay spread relative to typical pay: interquartile range divided by the median.
df_salary_pos['relative_variability'] = df_salary_pos['salary_iqr'].div(df_salary_pos['salary_median'])
# Positions with the largest relative spread come first.
df_salary_pos = df_salary_pos.sort_values(by='relative_variability', ascending=False)
df_salary_pos

### Comments
- Seems like the following positions have high variability in pay (10% or more in relative_variability score)
    - IT manager DB
    - IT Support
    - Network engineer
    - Software Engineer
    - Data Analyst
    - Production Technician I

### Investigate each of these positions further
- Does seniority account for the pay variability?
- Does a combination of seniority and special projects count account for variability?

In [None]:
df_salary_pos.reset_index(inplace=True)

In [None]:
df_sub =  df_salary_pos.loc[df_salary_pos['relative_variability'] > 0.10]
df_sub

In [None]:
# Replace df_sub with the new data frame: a left join on 'Position' that attaches the
# matching rows of df_hrfull to each high-variability position kept in df_sub.
# NOTE(review): df_hrfull is joined unfiltered here, whereas the variability table was
# computed only from rows passing active_mask & perf_mask — confirm that analysing the
# wider population is intended.
df_sub = pd.merge(df_sub, df_hrfull, on='Position', how='left')

In [None]:
# import linear regression from scipy
from scipy.stats import linregress

In [None]:
def _pay_vs_tenure_fit(rows):
    """Simple linear regression of PayRate on DaysSinceHire for one position's rows.

    Returns a Series holding the slope, intercept, correlation coefficient, p-value
    and standard error of the fit, labelled with the column names used downstream.
    """
    fit = linregress(rows['DaysSinceHire'], rows['PayRate'])
    return pd.Series(
        [fit.slope, fit.intercept, fit.rvalue, fit.pvalue, fit.stderr],
        index=["slope", "intercept", "r_value", "p_value", "std_err"],
    )

# One row of regression statistics per position.
df_linreg_salary_time = df_sub.groupby('Position').apply(_pay_vs_tenure_fit)

In [None]:
df_linreg_salary_time

In [None]:
# One figure per position: PayRate against DaysSinceHire with a fitted regression line.
# The group key is the position name, so it doubles as the plot title.
for position, rows in df_sub.groupby('Position'):
    sns.regplot(x=rows['DaysSinceHire'], y=rows['PayRate'])
    plt.title(position)
    plt.show()

#### Comments
- Looks like seniority alone does not explain the variability

### Let us look at Seniority and Special Projects Count

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

# For each position in df_sub, regress PayRate on DaysSinceHire and SpecialProjectsCount
# using all polynomial terms up to degree 2 (bias, linear terms, squares, interaction).
for idx, grup in df_sub.groupby('Position'):
    # pipeline for regression: standardize the expanded features, then ordinary least squares
    lr_pipe = Pipeline([ ("std_scale", StandardScaler()) ,("lr",LinearRegression())])
    
    # include polynomial features
    # PolynomialFeatures takes the polynomial degree through `degree`; it has no `order`
    # parameter, so the previous `order=2` raised a TypeError before any model was fit.
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(grup.loc[:,['DaysSinceHire', 'SpecialProjectsCount']])
    
    # fit pipeline
    lr_pipe.fit(X= X_poly, y=grup['PayRate'])
    
    # Print fit results
    # The R^2 below is computed on the same rows the model was fit on (in-sample).
    print("Position: ", grup['Position'].unique()[0])
    print('Coefficients: \n', lr_pipe.named_steps['lr'].coef_)
    print("R^2 score: ", lr_pipe.score(X= X_poly, y=grup['PayRate']),'\n\n' )

### Comments
- We fit linear regression model and included polynomial features of degree 2 including interaction terms. 
- The fit results indicate:
    - "Software Engineer", "Production Technician I" and "Data Analyst" are likely areas where the pay may not be equitable. 
    - This, of course, needs to be compared with company and department policies for pay dependence on seniority and special projects.
    - Further analysis could include ANOVA to assess how best the given factors explain the variability in pay. 

## Can we predict who is going to terminate and who isn't?

In [None]:
df_hrfull['Termd'].value_counts()

### We will need to do some pre-processing to the dataset before being able to carry out such a prediction task

1. This is a binary prediction task where we will try to predict whether a particular employee has terminated.
2. We will need to make sure there is no target leakage in the features
3. We will need to convert dates to more useful features that the models can utilize
4. We will need to remove all employees whose employment status is unknown or who are starting in the future

In [None]:
# Keep rows whose EmploymentStatus is present and is not "Future Start".
active_or_termd_mask = df_hrfull['EmploymentStatus'].notnull() & df_hrfull['EmploymentStatus'].ne("Future Start")

In [None]:
# Take an independent copy of the retained rows. Later cells add columns to df_sub;
# without .copy() those writes target a selection of df_hrfull, which pandas flags with
# SettingWithCopyWarning because it cannot guarantee where the assignment lands.
df_sub = df_hrfull.loc[active_or_termd_mask,:].copy()

In [None]:
df_sub.columns

In [None]:
# days since termination, in whole days up to the moment this cell runs; rows with a
# missing DateofTermination yield NaN here.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (strict format
# inference became the default), so supplying it has no effect beyond a warning.
df_sub['DaysSinceTermd'] = (datetime.now() - pd.to_datetime(df_sub['DateofTermination'])).dt.days

In [None]:
# Rows with no value in DaysSinceTermd are given 0 days since termination.
df_sub['DaysSinceTermd'] = df_sub['DaysSinceTermd'].fillna(0)

In [None]:
# are there any NaNs in "DaysSinceTermd"
df_sub.loc[ (df_sub['Termd']==1.0), 'DaysSinceTermd'].isnull().any()

In [None]:
df_sub['DaysWorked'] = df_sub['DaysSinceHire'] - df_sub['DaysSinceTermd']

In [None]:
# check if any nulls exist
df_sub['DaysWorked'].isnull().any()

In [None]:
# Age in days at the end of the observed employment: days since birth (as of now) minus
# days since termination.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (strict format
# inference became the default), so supplying it has no effect beyond a warning.
# NOTE(review): if DOB is stored with two-digit years, some birth dates may be parsed
# into the future and produce negative ages — verify the parsed values.
df_sub['MaxAgeWhenEmployed'] = (datetime.now() - pd.to_datetime(df_sub['DOB'])).dt.days - df_sub['DaysSinceTermd']

In [None]:
df_sub['MaxAgeWhenEmployed'].isnull().any()

In [None]:
# Columns removed from the feature matrix before training the RandomForest model.
# The list ends with the target column itself ('Termd').
drop_cols = [
    'Employee_Name',
    'EmpID',
    'MarriedID',
    'GenderID',
    'DOB',
    'EmpStatusID',
    'DateofHire',
    'DateofTermination',
    'TermReason',
    'EmploymentStatus',
    'Position',
    'Zip',
    'Department',
    'ManagerName',
    'LastPerformanceReview_Date',
    'DaysLateLast30',
    'DaysSinceHire',
    'DaysSinceTermd',
    'Termd',
]

# Target: the Termd column as an array. Features: every column not listed in drop_cols.
y = df_sub['Termd'].to_numpy()
X = df_sub.drop(drop_cols, axis=1)

In [None]:
X.isnull().sum()

In [None]:
# Missing manager IDs are replaced by the placeholder ID -99.
X['ManagerID'] = X['ManagerID'].fillna(-99)

### Fit a RandomForest model to predict who will terminate

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace every object-dtype column of X by integer codes, using a separate
# LabelEncoder fitted on each column.
object_cols = [name for name in X.columns if X[name].dtype == 'O']
for col in object_cols:
    labenc = LabelEncoder()
    X[col] = labenc.fit_transform(X[col])

In [None]:
from sklearn.model_selection import cross_validate, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base classifier handed to the grid search. random_state is fixed so that the forest's
# bootstrap samples and feature subsets — and therefore the cross-validation scores and
# feature importances derived from it — are the same on every run.
rf = RandomForestClassifier(n_estimators=150, n_jobs=-1, min_impurity_decrease=1e-8, random_state=42)

In [None]:
# Hyper-parameter grid for the search: tree depth and minimum samples per leaf.
rf_params = dict(
    max_depth=[2, 4],
    min_samples_leaf=[3, 10],
)

In [None]:
# Grid search over rf_params with 5-fold stratified cross-validation. Both accuracy and
# ROC AUC are recorded for every candidate; ROC AUC decides which candidate is refit.
gcv = GridSearchCV(
    rf,
    rf_params,
    scoring=['accuracy', 'roc_auc'],
    refit='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    verbose=3,
)

In [None]:
gcv.fit(X,y)

In [None]:
gcv.best_params_, gcv.best_score_

In [None]:
# Mean cross-validated accuracy and ROC AUC of the selected parameter combination.
# Indexing with best_index_ reports the candidate that was actually refit; taking the
# mean of the whole column instead would blend in every rejected grid candidate and
# describe no single model.
(gcv.cv_results_['mean_test_accuracy'][gcv.best_index_].round(2),
 gcv.cv_results_['mean_test_roc_auc'][gcv.best_index_].round(2))

In [None]:
# Standard deviation across folds of accuracy and ROC AUC for the selected parameter
# combination. Indexing with best_index_ reports the candidate that was actually refit;
# taking the mean of the whole column instead would blend in every rejected grid
# candidate and describe no single model.
(gcv.cv_results_['std_test_accuracy'][gcv.best_index_].round(2),
 gcv.cv_results_['std_test_roc_auc'][gcv.best_index_].round(2))

In [None]:
# Column positions ordered from the least to the most important feature of the refit model.
feature_inportance_idx = gcv.best_estimator_.feature_importances_.argsort()

In [None]:
# Features in increasing order of predictive power (to predict if an employee will terminate)
X.columns[feature_inportance_idx]

## Comments & Conclusions
- We are able to predict who will terminate and who will not with a:
    - Mean Accuracy score of `0.91 +/- 0.09`
    - Mean AUC-ROC score of `0.95 +/- 0.06`
    
    
- The top 4 determinants of whether someone will terminate are: 
    - 'DaysWorked' (how long they have worked at the company)
    - 'MaxAgeWhenEmployed' (Age of the person),
    - 'PayRate' (Salary of the person)
    - 'ManagerID' (Who is their manager)