In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectPercentile, univariate_selection, RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, RepeatedStratifiedKFold,\
StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, plot_confusion_matrix, precision_recall_curve, plot_precision_recall_curve, confusion_matrix
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
import category_encoders as ce
from scipy import stats

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Context and Content**

A company active in Big Data and Data Science wants to hire data scientists from among the people who successfully pass courses conducted by the company. Many people sign up for the training. The company wants to know which of these candidates really want to work for the company after training and which are looking for new employment, because this helps reduce cost and time, improves the quality of training, and supports course planning and the categorization of candidates. Information on demographics, education, and experience is available from the candidates' signup and enrollment.

This dataset is also designed to help HR research understand the factors that lead a person to leave their current job. Using model(s) built on the current credentials, demographics, and experience data, you will predict the probability that a candidate is looking for a new job or will work for the company, and interpret the factors that affect the employee's decision.

The data is divided into train and test sets. The target isn't included in the test set, but a file with the test target values is available for related tasks. A sample submission corresponding to the enrollee_id values of the test set is also provided, with columns: enrollee_id, target

***Note:***

* The dataset is imbalanced.
* Most features are categorical (Nominal, Ordinal, Binary), some with high cardinality.
* Missing imputation can be a part of your pipeline as well.
* Features

enrollee_id : Unique ID for candidate

city: City code

city_development_index : Development index of the city (scaled)

gender: Gender of candidate

relevent_experience: Relevant experience of candidate

enrolled_university: Type of University course enrolled if any

education_level: Education level of candidate

major_discipline :Education major discipline of candidate

experience: Candidate total experience in years

company_size: No of employees in current employer's company

company_type : Type of current employer

last_new_job: Difference in years between previous job and current job

training_hours: training hours completed

target: 0 – Not looking for job change, 1 – Looking for a job change

***Inspiration***
* Predict the probability that a candidate will work for the company
* Interpret the model(s) in a way that illustrates which features affect a candidate's decision

According to toggl.com, citing research from the Society for Human Resource Management, the average cost of recruiting a new employee is USD 4,129, assuming that a position is filled within 42 days.

According to research from Glassdoor, the average cost of recruiting a new employee is USD 4,000, assuming a position is filled within 50 days.

Meanwhile, for the cost of Data Science training, we can take a reasonable reference price of USD 2,950 (Track 1 to Track 4). This training is conducted by the Data Science Council of America (DASCA): Principal Data Science (PDS). It is quite comprehensive, covering fundamentals through advanced data science concepts such as big data best practices, business strategies for data, building cross-organizational support, machine learning, natural language processing, stochastic modeling and more.

From these figures we can conclude that **the cost of recruiting one new employee is around 1.37 times the cost of training one employee.**

# Data Preparation and Feature Engineering

In [None]:
# Load the training split of the HR analytics dataset and display it.
train_csv_path = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
emp = pd.read_csv(train_csv_path)
emp

In [None]:
# enrollee_id is only a unique candidate identifier, so remove it in place.
emp.drop(columns=['enrollee_id'], inplace=True)

In [None]:
# Print the frame summary (column dtypes and non-null counts) to spot
# columns with missing values.
emp.info()

In [None]:
# Summary statistics; with default arguments describe() covers the numeric
# columns only. Kept in num_desc and displayed as the cell output.
num_desc = emp.describe()
num_desc

In [None]:
# Summary (count / unique / top / freq) of the object-typed columns.
object_columns = emp.select_dtypes(include='object')
obj_desc = object_columns.describe()
obj_desc

In [None]:
# Percentage of missing values per column, rounded to two decimals.
(emp.isna().sum() / len(emp) * 100).round(2)

We can see from the results above that many columns still have missing data, so we can start to design a data imputation strategy.

In [None]:
# Heatmap of the missing-value mask: one row per record, one column per feature.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(emp.isna(), ax=ax)
plt.show()

In [None]:
# Pie chart of the gender category counts.
gender_counts = emp['gender'].value_counts()
gender_counts.plot.pie()
plt.show()

This is a bit strange, because some rows have the gender column filled with 'Other'. We have to decide whether 'Other' should be treated as NaN or as a genuine category of its own. Let's first see how many rows have 'Other' as gender.

In [None]:
# Percentage of rows whose gender is recorded as 'Other'.
other_gender_rows = emp.loc[emp['gender'] == 'Other']
len(other_gender_rows) / len(emp) * 100

It turns out that rows with gender 'Other' make up less than 1 percent of the data. If we want to keep the information in these rows, we can change the value to NaN and impute it later. Alternatively, if we feel this is a small amount of data without significant impact, we can delete these rows. For now I will convert the value to NaN and impute it later.

## Gender

In [None]:
# Treat the rare 'Other' gender label as missing so it gets imputed together
# with the other missing genders later on.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form emits a FutureWarning on pandas 2.x and no longer modifies `emp` once
# copy-on-write is enabled.
emp['gender'] = emp['gender'].replace({'Other': np.nan})

In [None]:
# Percentage of rows with a missing gender.
gender_missing = emp['gender'].isna()
gender_missing.sum() / len(emp) * 100

In [None]:
# Education-level counts among rows whose gender is missing.
emp.loc[emp['gender'].isna(), 'education_level'].value_counts()

In [None]:
# Gender counts among rows with education level 'Graduate'.
emp.loc[emp['education_level'] == 'Graduate', 'gender'].value_counts()

From the results above, we can see that most rows with an unknown gender have the education level 'Graduate', and that at this education level male is the dominant gender. We will therefore impute the `gender` column with the value 'Male'.

## Enrolled Uni and Education Level

In [None]:
# Missing-value mask of the two education-related columns, shown side by side.
education_cols = 'enrolled_university education_level'.split()
education_missing = emp[education_cols].isna()
sns.heatmap(education_missing)

We can see that there is no apparent relationship between the missing values in education level and those in enrolled university, so we can impute the two columns separately rather than as a pair.

In [None]:
# City counts among rows with a missing enrolled_university.
emp.loc[emp['enrolled_university'].isna(), 'city'].value_counts()

In [None]:
# Education-level counts among rows with a missing enrolled_university.
emp.loc[emp['enrolled_university'].isna(), 'education_level'].value_counts()

In [None]:
# enrolled_university counts for rows from city_21.
emp.loc[emp['city'] == 'city_21', 'enrolled_university'].value_counts()

In [None]:
# City counts among rows with a missing education_level.
emp.loc[emp['education_level'].isna(), 'city'].value_counts()

In [None]:
# enrolled_university counts among rows with a missing education_level.
emp.loc[emp['education_level'].isna(), 'enrolled_university'].value_counts()

In [None]:
# education_level counts for rows from city_103.
emp.loc[emp['city'] == 'city_103', 'education_level'].value_counts()

From the results above, we can conclude the following: I will fill the missing values in the `enrolled_university` column with 'no_enrollment', and the missing values in the `education_level` column with 'Graduate'.

## Major Discipline

In [None]:
# Horizontal bar chart of the relative frequency of each major discipline.
major_share = emp['major_discipline'].value_counts(normalize=True)
major_share.plot.barh()

More than 80% of employees come from STEM (Science, Technology, Engineering, Mathematics) majors. We will try to fill in this column based on the `city` column.

In [None]:
# City counts among rows with a missing major_discipline.
emp.loc[emp['major_discipline'].isna(), 'city'].value_counts()

In [None]:
# Most frequent major_discipline value(s) for rows from city_103.
emp.loc[emp['city'] == 'city_103', 'major_discipline'].mode()

It turned out that the city that lost the most data on its graduates was city_103 and most employees from city_103 took STEM majors. So we will impute `major_discipline` with STEM.

## Experience

In [None]:
# Percentage of missing values per column, rounded to two decimals.
(emp.isna().sum() / len(emp) * 100).round(2)

Because less than half a percent of the values in the `experience` column are missing, I will drop the rows of employees that have no experience data.

## Company Size and Type

In [None]:
# Missing-value mask of the two company-related columns, shown side by side.
company_cols = 'company_size company_type'.split()
company_missing = emp[company_cols].isna()
sns.heatmap(company_missing)

Looking at the results above, almost every row that is missing `company_size` is also missing `company_type`. We will examine these pairs more closely.

In [None]:
# Rows where company_size AND company_type are both missing.
size_missing = emp['company_size'].isna()
type_missing = emp['company_type'].isna()
both_nan_comp = emp[size_missing & type_missing]
both_nan_comp

In [None]:
# Rows where company_size OR company_type (or both) is missing.
size_missing = emp['company_size'].isna()
type_missing = emp['company_type'].isna()
either_nan_comp = emp[size_missing | type_missing]
either_nan_comp

In [None]:
# Share of rows missing both company columns among rows missing at least one.
paired_share = len(both_nan_comp) / len(either_nan_comp) * 100
print('Paired NaN from company_size and company_type:', paired_share, '%')

In [None]:
# Ten cities with the largest share of rows missing both company columns.
city_share = both_nan_comp['city'].value_counts(normalize=True)
top_cities = city_share.sort_values(ascending=False).head(10)
top_cities.plot.barh()

In [None]:
# Most frequent city among the both-missing rows that lack company_size.
both_nan_comp.loc[both_nan_comp['company_size'].isna(), 'city'].mode()

In [None]:
# Most frequent company size and type among rows from city_103.
city_103_rows = emp[emp['city'] == 'city_103']
print('Most frequent company size from city_103:', city_103_rows['company_size'].mode()[0])
print('Most frequent company type from city_103:', city_103_rows['company_type'].mode()[0])

In [None]:
# company_size counts for 'Pvt Ltd' companies in city_103.
is_pvt_ltd = emp['company_type'] == 'Pvt Ltd'
in_city_103 = emp['city'] == 'city_103'
emp.loc[is_pvt_ltd & in_city_103, 'company_size'].value_counts()

We'll try to fill in all the NaNs in these 2 columns with 'Pvt Ltd' for `company_type` and '10000+' for `company_size`.

## Last New Job

In [None]:
# Frequency of each last_new_job category (missing values are excluded by
# value_counts' default dropna=True).
emp['last_new_job'].value_counts()

According to the documentation, `last_new_job` column is about: Difference in years between previous job and current job

In [None]:
# City counts among rows with a missing last_new_job.
emp.loc[emp['last_new_job'].isna(), 'city'].value_counts()

In [None]:
# last_new_job counts for rows from city_21.
emp.loc[emp['city'] == 'city_21', 'last_new_job'].value_counts()

Based on this, we will fill the missing values in the `last_new_job` column with '1'. In other words, we treat these employees as having changed jobs 1 year ago.

## The conclusion of the imputation strategy:
1. `gender` = 'Male'
1. `enrolled_university` = 'no_enrollment'
1. `education_level` = 'Graduate'
1. `major_discipline` = 'STEM'
1. `experience` = `dropna()`
1. `company_size` = '10000+'
1. `company_type` = 'Pvt Ltd'
1. `last_new_job` = '1'

To be continued..