In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this project, we'll work with exit surveys from employees of the Department of Education, Training and Employment (DETE) and the Technical and Further Education (TAFE) institute in Queensland, Australia. You can find the TAFE exit survey here and the survey for the DETE here.

These are the goals of the analysis:
* Are employees who only worked for the institutes for a short period of time resigning due to some kind of dissatisfaction? What about employees who have been there longer?
* Are younger employees resigning due to some kind of dissatisfaction? What about older employees?

The results from both surveys will be combined. However, although both used the same survey template, one of them customized some of the answers. Data cleaning will be done so we could analyze the first question.

A data dictionary wasn't provided with the dataset. In a job setting, we'd make sure to meet with a manager to confirm the definitions of the data. For this project, we'll use our general knowledge to define the columns.

Below is a preview of a couple columns we'll work with from the dete_survey.csv:

* ID: An id used to identify the participant of the survey
* SeparationType: The reason why the person's employment ended
* Cease Date: The year or month the person's employment ended
* DETE Start Date: The year the person began employment with the DETE

Below is a preview of a couple columns we'll work with from the tafe_survey.csv:

* Record ID: An id used to identify the participant of the survey
* Reason for ceasing employment: The reason why the person's employment ended
* LengthofServiceOverall. Overall Length of Service at Institute (in years): The length of the person's employment (in years)

In [None]:
# Read the two raw exit-survey CSV files into DataFrames
tafe_survey = pd.read_csv('/kaggle/input/employee-exit-survey/tafe_survey.csv')
dete_survey = pd.read_csv('/kaggle/input/employee-exit-survey/dete_survey.csv')

In [None]:
# Show up to 100 columns so wide frames are not truncated, then preview dete_survey
pd.set_option('display.max_columns', 100)
dete_survey.head(n=5)

In [None]:
# print the DataFrame.info() summary of dete_survey
dete_survey.info()

In [None]:
# preview the first five rows of tafe_survey
tafe_survey.head(5)

In [None]:
# print the DataFrame.info() summary of tafe_survey
tafe_survey.info()

We can make the following observations based on the work above:

The dete_survey dataframe contains 'Not Stated' values that indicate values are missing, but they aren't represented as NaN.
Both the dete_survey and tafe_survey contain many columns that we don't need to complete our analysis.
Each dataframe contains many of the same columns, but the column names are different.
There are multiple columns/answers that indicate an employee resigned because they were dissatisfied.

In [None]:
# Re-read the DETE survey so that 'Not Stated' entries are parsed as NaN
dete_survey = pd.read_csv(
    '/kaggle/input/employee-exit-survey/dete_survey.csv',
    na_values=['Not Stated'],
)
dete_survey

Dropping columns that won't be used in the analysis.

In dete_survey: every column from 'Professional Development' through 'Health & Safety'.

In tafe_survey: every column from 'Main Factor. Which of these was the main factor for leaving?' through 'Workplace. Topic:Would you recommend the Institute as an employer to others?'.

In [None]:
# Print the positional index of the first and last column to drop in each survey
for col in ('Professional Development', 'Health & Safety'):
    print(dete_survey.columns.get_loc(col))
print('\n')
for col in (
    'Main Factor. Which of these was the main factor for leaving?',
    'Workplace. Topic:Would you recommend the Institute as an employer to others?',
):
    print(tafe_survey.columns.get_loc(col))

In [None]:
# remove and update the columns on both dataset
# The ranges to drop are located by column name instead of the previously
# hard-coded positions ([28:49] and [17:66]), so the cell does not silently
# drop the wrong columns if the column order of the CSV files ever changes.
dete_first = dete_survey.columns.get_loc('Professional Development')
dete_last = dete_survey.columns.get_loc('Health & Safety')
tafe_first = tafe_survey.columns.get_loc('Main Factor. Which of these was the main factor for leaving?')
tafe_last = tafe_survey.columns.get_loc('Workplace. Topic:Would you recommend the Institute as an employer to others?')

# "+ 1" because a slice end is exclusive and the last named column must be dropped too
dete_survey_updated = dete_survey.drop(dete_survey.columns[dete_first:dete_last + 1], axis=1)
tafe_survey_updated = tafe_survey.drop(tafe_survey.columns[tafe_first:tafe_last + 1], axis=1)

# check the resulting dataset
print(dete_survey_updated.columns)
print(tafe_survey_updated.columns)

Each dataframe contains columns that hold the same information under different names. Below are some of the columns that we'd like to use for the analysis:

| dete_survey     | tafe_survey                                                                | definition                                              |
|-----------------|----------------------------------------------------------------------------|---------------------------------------------------------|
| ID              | Record ID                                                                  | An id used to identify the participant of the survey    |
| SeparationType  | Reason for ceasing employment                                              | The reason why the participant's employment ended       |
| Cease Date      | CESSATION YEAR                                                             | The year or month the participant's employment ended    |
| DETE Start Date |                                                                            | The year the participant began employment with the DETE |
|                 | LengthofServiceOverall. Overall Length of Service at Institute (in years)  | The length of the person's employment (in years)        |
| Age             | CurrentAge. Current Age                                                    | The age of the participant                              |
| Gender          | Gender. What is your Gender?                                               | The gender of the participant                           |


Eventually, we want to combine both dataframes.  In order to do so, the column name in both dataframes must be standardized.

In [None]:
# Normalise the dete_survey column names: lower-case, surrounding whitespace removed,
# spaces replaced by underscores
dete_survey_updated.columns = (
    dete_survey_updated.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '_')
)

# Map the tafe_survey column names onto the matching dete_survey names
tefe_col = {
    'Record ID': 'id',
    'CESSATION YEAR': 'cease_date',
    'Reason for ceasing employment': 'separationtype',
    'Gender. What is your Gender?': 'gender',
    'CurrentAge. Current Age': 'age',
    'Employment Type. Employment Type': 'employment_status',
    'Classification. Classification': 'position',
    'LengthofServiceOverall. Overall Length of Service at Institute (in years)': 'institute_service',
    'LengthofServiceCurrent. Length of Service at current workplace (in years)': 'role_service',
}

tafe_survey_updated = tafe_survey_updated.rename(columns=tefe_col)

In [None]:
# preview dete_survey_updated to check the renamed columns
dete_survey_updated.head()

In [None]:
# preview tafe_survey_updated to check the renamed columns
tafe_survey_updated.head()

# Data Filtering

To accomplish the initial goal, we only consider employees who resigned, i.e. the rows whose 'separationtype' value contains the string 'Resignation'.

In [None]:
# count how often each value occurs in the 'separationtype' column of dete_survey_updated

dete_survey_updated['separationtype'].value_counts()

In [None]:
# count how often each value occurs in the 'separationtype' column of tafe_survey_updated
tafe_survey_updated['separationtype'].value_counts()

In [None]:
# Keep only the text before the first '-' so that every resignation variant
# in dete_survey becomes the single value 'Resignation'
split_types = dete_survey_updated['separationtype'].str.split('-')
dete_survey_updated['separationtype'] = split_types.str.get(0)

# verify the column
dete_survey_updated['separationtype'].value_counts()

In [None]:
# Keep only the rows whose 'separationtype' is exactly 'Resignation'
is_dete_resignation = dete_survey_updated['separationtype'].eq('Resignation')
is_tafe_resignation = tafe_survey_updated['separationtype'].eq('Resignation')

dete_resignations = dete_survey_updated[is_dete_resignation]
tafe_resignations = tafe_survey_updated[is_tafe_resignation]

Verifying the Data

For the DETE survey, we will focus on two columns, cease_date and dete_start_date, which indicate when the employee left and when they started. There are two considerations:
* A value in either column wouldn't make sense if it is later than the current date.
* Since most employees begin working in their 20s, a dete_start_date in the 1940s is unlikely to be correct.

In [None]:
# display the raw values of the 'cease_date' column
dete_resignations['cease_date']

In [None]:
# extract the year string under cease_date column: keep the part after the last '/'
# NOTE: dete_resignations is a filtered selection of dete_survey_updated at this point,
# so this assignment is what triggers the SettingWithCopyWarning discussed in the text
dete_resignations['cease_date'] = dete_resignations['cease_date'].str.split('/').str[-1]

Executing the line above gives us a SettingWithCopyWarning, which usually occurs when we try to modify values in a slice of a dataframe. To avoid this problem, it's good practice to copy the slice we want to modify beforehand. In this case, it's the 'Resignation' slice.

In [None]:
# Re-create both resignation subsets as independent copies so they can be modified
dete_mask = dete_survey_updated['separationtype'].eq('Resignation')
tafe_mask = tafe_survey_updated['separationtype'].eq('Resignation')

dete_resignations = dete_survey_updated.loc[dete_mask].copy()
tafe_resignations = tafe_survey_updated.loc[tafe_mask].copy()

In [None]:
# Keep the part of cease_date after the last '/', this time on the copied frame,
# and convert it to float for further processing
year_text = dete_resignations['cease_date'].str.split('/').str.get(-1)
dete_resignations['cease_date'] = year_text.astype(float)
dete_resignations['cease_date'].value_counts().sort_index()

In [None]:
# check for outliers in dete_start_date: value frequencies ordered by year
dete_resignations['dete_start_date'].value_counts().sort_index()

In [None]:
# check for outliers in the tafe 'cease_date' column: value frequencies ordered by year
tafe_resignations['cease_date'].value_counts().sort_index()

Plotting the data makes its distribution easy to observe. From the figure, it can be seen that the two dataframes do not cover the same years: for example, there is 2006 data in dete but not in tafe, while there is 2009 data in tafe but not in dete. The discrepancy in the number of resignations is largest in 2010, where it is much higher for tafe. However, we don't need to look into specific years to accomplish the goal.

In [None]:
import matplotlib.pyplot as plt
from numpy import arange

In [None]:
# Bin edges sit at year - 0.5 so that each bar is centred on a whole year
dete_bins = np.arange(2006, 2016) - 0.5
tafe_bins = np.arange(2009, 2015) - 0.5

# Overlay the two semi-transparent histograms of resignation years
plt.hist(dete_resignations['cease_date'], bins=dete_bins, alpha=0.5, label='dete')
plt.hist(tafe_resignations['cease_date'], bins=tafe_bins, alpha=0.5, label='tafe')
plt.xticks(np.arange(2006, 2015, 1.0))
plt.legend(loc='best')
plt.show()

# Service Duration

Since the goal of the analysis is to find out why employees who have served for a short (or long) period of time resigned, we need to know the length of their service.

The tafe data already has the 'institute_service' column, which contains this duration, while the dete data does not. However, dete has start and end dates, so we can subtract one from the other to obtain the duration of service.

In [None]:
# Service length in dete = cease_date minus dete_start_date
service_years = dete_resignations['cease_date'].sub(dete_resignations['dete_start_date'])
dete_resignations['institute_service'] = service_years

# check the newly created column
dete_resignations['institute_service'].head()

# Identify Dissatisfied Employee

Next, we'll identify any employees who resigned because they were dissatisfied. Below are the columns we'll use to categorize employees as "dissatisfied" from each dataframe:

tafe_survey:
* Contributing Factors. Dissatisfaction
* Contributing Factors. Job Dissatisfaction

dete_survey:
* job_dissatisfaction
* dissatisfaction_with_the_department
* physical_work_environment
* lack_of_recognition
* lack_of_job_security
* work_location
* employment_conditions
* work_life_balance
* workload

We'll create a new column called 'dissatisfied' which contains the boolean values True and False to indicate whether the employee resigned due to dissatisfaction or for another reason.

In [None]:
# check the unique values of 'Contributing Factors. Dissatisfaction', including NaN
tafe_resignations['Contributing Factors. Dissatisfaction'].value_counts(dropna=False)

In [None]:
# check the unique values of 'Contributing Factors. Job Dissatisfaction', including NaN
tafe_resignations['Contributing Factors. Job Dissatisfaction'].value_counts(dropna=False)

In [None]:
# helper that turns a raw survey answer into True, False, or NaN
def update_vals(val):
    """Convert one raw answer into a dissatisfaction flag.

    Returns NaN when ``val`` is null, False when ``val`` is the string '-',
    and True for any other value.
    """
    if pd.isnull(val):
        return np.nan
    return not (val == '-')

# Convert both contributing-factor columns with update_vals, then combine them
# row-wise with any() (skipna=False) into the new 'dissatisfied' column
# NOTE(review): DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map
factor_cols = ['Contributing Factors. Dissatisfaction', 'Contributing Factors. Job Dissatisfaction']
factor_flags = tafe_resignations[factor_cols].applymap(update_vals)
tafe_resignations['dissatisfied'] = factor_flags.any(axis=1, skipna=False)

# check the result
tafe_resignations['dissatisfied'].value_counts(dropna=False)

In [None]:
# Columns of dete_survey that count as a dissatisfaction-related reason
dis_list = [
    'job_dissatisfaction',
    'dissatisfaction_with_the_department',
    'physical_work_environment',
    'lack_of_recognition',
    'lack_of_job_security',
    'work_location',
    'employment_conditions',
    'work_life_balance',
    'workload',
]

# Combine those columns row-wise with any() (skipna=False) into 'dissatisfied'
dete_flags = dete_resignations[dis_list]
dete_resignations['dissatisfied'] = dete_flags.any(axis=1, skipna=False)
dete_resignations['dissatisfied'].value_counts(dropna=False)

In [None]:
# keep independent snapshots of both resignation frames at this stage
tafe_resignations1 = tafe_resignations.copy(deep=True)
dete_resignations1 = dete_resignations.copy(deep=True)


# Combining the Data

We will combine both datasets. To avoid confusion about where each row came from, we will add a new identifier column that contains the value 'DETE' for dete_survey rows and 'TAFE' for tafe_survey rows. Then we combine the data and drop the remaining columns we don't need.

In [None]:
# Tag every row with the survey it came from
for frame, label in ((dete_resignations, 'DETE'), (tafe_resignations, 'TAFE')):
    frame['institute'] = label

In [None]:
# Stack the two resignation frames on top of each other with a fresh 0..n-1 index
frames = [dete_resignations, tafe_resignations]
combined = pd.concat(frames, axis=0, ignore_index=True)

In [None]:
# Number of non-null values per column, smallest first
non_null_counts = combined.notna().sum()
non_null_counts.sort_values()

In [None]:
# Keep only the columns that have at least 500 non-null values
combined1 = combined.dropna(axis='columns', thresh=500)
combined1.notna().sum()

In [None]:
# frequency of each distinct value in 'institute_service'
combined1['institute_service'].value_counts()

# Creating New Column: Service Category
The service duration column, 'institute_service', is tricky to clean since it contains values in different forms e.g 1-2, 'less than 1 year', 7.0. To help the analysis, we will categorize these values into the career stage, like so:
* New: Less than 3 years at a company
* Experienced: 3-6 years at a company
* Established: 7-10 years at a company
* Veteran: 11 or more years at a company

In [None]:
# display the raw 'institute_service' column
combined1['institute_service']

In [None]:
# Work on a copy so combined1 stays untouched
combined2 = combined1.copy()

# Turn every value into text and keep its first run of digits
service_text = combined1['institute_service'].astype('str')
leading_digits = service_text.str.extract(r'([0-9]+)', expand=False)

# convert the values to float
combined2['institute_service'] = leading_digits.astype('float')

# check the result
combined2['institute_service'].value_counts(dropna=False)

In [None]:
# create a function that returns the value according to the duration
def cat(x):
    """Map a service length in years to a career-stage label.

    Returns NaN for a null input, otherwise:
    'New' for x < 3, 'Experienced' for 3 <= x < 7,
    'Established' for 7 <= x < 11 and 'Veteran' for x >= 11.

    The ranges are contiguous, so a fractional value such as 6.5 or 10.5
    stays in the lower stage instead of falling through to 'Veteran'.
    Whole-number inputs get the same label as with the closed ranges
    3-6 and 7-10.
    """
    if pd.isnull(x):
        return np.nan
    if x < 3:
        return 'New'
    if x < 7:
        return 'Experienced'
    if x < 11:
        return 'Established'
    return 'Veteran'
    
# Replace each service length with its career-stage label from cat()
service_lengths = combined2['institute_service']
combined2['institute_service'] = service_lengths.map(cat)

In [None]:
# check the result: frequency of each category in 'institute_service', including NaN
combined2['institute_service'].value_counts(dropna=False)

# Initial Analysis

Finally, we'll replace the missing values in the dissatisfied column with the most frequent value, False. Then, we'll calculate the percentage of employees who resigned due to dissatisfaction in each service category (the categorized institute_service column) and plot the results.

Note that since we still have additional missing values left to deal with, this is meant to be an initial introduction to the analysis, not the final analysis.

In [None]:
# check the values of 'dissatisfied' column, including NaN
combined2['dissatisfied'].value_counts(dropna=False)

In [None]:
# fill the null value with the most frequent value, False
# The explicit astype(bool) guarantees a boolean column: the implicit
# object -> bool downcast that fillna used to perform is deprecated in
# recent pandas, so the cell no longer relies on it.
combined2['dissatisfied'] = combined2['dissatisfied'].fillna(False).astype(bool)

# aggregate the 'dissatisfied' column with category as the index
# (pivot_table aggregates with the mean by default, i.e. the share of True values per category)
table = pd.pivot_table(combined2, values='dissatisfied', index='institute_service')

In [None]:
# plot the aggregate result
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# bar chart of the aggregated 'dissatisfied' value per category, tick labels rotated by 30 degrees
table.plot(kind='bar', rot=30)

The plot shows that Established and Veteran employees (7+ years of service) are more likely to resign due to some kind of dissatisfaction than new employees.