# HR Analytics: Job Change of Data Scientists

![](https://greatpeopleinside.com/wp-content/uploads/2019/06/analytics-1030x618.jpg)

## 1. Modules to Use

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# 2. Import Datasets

In [None]:
path_train = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
path_test = '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv'
path_submission = '../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv'

In [None]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

In [None]:
#First 5 rows - Train

df_train.head()

In [None]:
#First 5 rows - Test

df_test.head()

# 3. Exploratory Data Analysis

## 3.1 General analysis

In [None]:
df_train.dtypes

In [None]:
#Columns (features) - Train

list(df_train.columns)

In [None]:
#Columns (features) - Test

list(df_test.columns)

In [None]:
#Shape of the DataFrame - Train

df_train.shape

In [None]:
#Shape of the DataFrame - Test

df_test.shape

In [None]:
#Number of NaN per column

def percentage_nulls(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by the columns of ``df`` with two columns:
    'Total' (number of null entries) and '% nulls' (that count as a
    percentage of the number of rows, rounded to one decimal place).
    """
    summary = df.isnull().sum().to_frame(name='Total')
    summary['% nulls'] = (summary['Total'] / len(df) * 100).round(1)
    return summary

In [None]:
#NaN for Train

percentage_nulls(df_train)

In [None]:
#NaN for Test

percentage_nulls(df_test)

As seen above, some columns contain a considerable number of NaN values.

The percentages are similar in both datasets (train and test).

## 3.2 Feature *'city'* - City code

In [None]:
#City - Train

city_train = df_train['city']
city_train.value_counts()

In [None]:
#City - Test

city_test = df_test['city']
city_test.value_counts()

The cities are coded with a number; there are 123 different cities in Train and 108 in Test.

## 3.3 Feature *'city_development_index'* - Development index of the city (scaled)

For more information: https://en.wikipedia.org/wiki/City_development_index

In [None]:
#Distribution city development index - Train

sns.displot(data=df_train, x='city_development_index',height=6,color = 'lightblue')

In [None]:
#Distribution city development index - Test

sns.displot(data=df_test, x='city_development_index',height=6,color = 'coral')

## 3.4 Feature *'gender'* - Gender of candidate


In [None]:
gender_train = df_train['gender']
gender_test = df_test['gender']

In [None]:
def percentage(df):
    """Tabulate the frequency of each distinct value.

    Callers in this notebook pass a single column (a Series), despite the
    parameter name. Returns a DataFrame indexed by the distinct values with
    'Total' (occurrence count) and '%' (the count as a percentage of the
    non-null entries, rounded to one decimal place).
    """
    table = df.value_counts().to_frame(name='Total')
    non_null = df.count()
    table['%'] = (table['Total'] / non_null * 100).round(1)
    return table

In [None]:
percentage(gender_train)

In [None]:
percentage(gender_test)

As you can see, the percentages are similar in both datasets.

In [None]:
# Side-by-side gender counts for the train and test sets.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

# Share one category order (train frequency) so bars and colours line up across both panels.
order_gender = df_train['gender'].value_counts().index

sns.countplot(x="gender",data=df_train,palette="Set1",ax=ax[0],order=order_gender).set_title('Train')
sns.countplot(x="gender",data=df_test,palette="Set1",ax=ax[1],order=order_gender).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.5 Feature *'relevent_experience'* - Relevant experience of candidate


In [None]:
relevent_experience_train = df_train['relevent_experience']
relevent_experience_test = df_test['relevent_experience']

In [None]:
percentage(relevent_experience_train)

In [None]:
percentage(relevent_experience_test)

As you can see, the percentages are similar in both datasets.

In [None]:
# Side-by-side relevant-experience counts for the train and test sets.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

# Share one category order (train frequency) so bars and colours line up across both panels.
order_relevent_experience = df_train['relevent_experience'].value_counts().index

sns.countplot(x='relevent_experience',data=df_train,palette="Set2",ax=ax[0],order=order_relevent_experience).set_title('Train')
sns.countplot(x='relevent_experience',data=df_test,palette="Set2",ax=ax[1],order=order_relevent_experience).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.6 Feature *'enrolled_university'* - Type of University course enrolled if any


In [None]:
enrolled_university_train = df_train['enrolled_university']
enrolled_university_test = df_test['enrolled_university']

In [None]:
percentage(enrolled_university_train)

In [None]:
percentage(enrolled_university_test)

In [None]:
order_enrolled_university = percentage(enrolled_university_train).index

As you can see, the percentages are similar in both datasets.

In [None]:
# Side-by-side enrolled-university counts for train and test, using the shared category order.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

sns.countplot(x='enrolled_university',data=df_train,palette="Set3",ax=ax[0],order=order_enrolled_university).set_title('Train')
sns.countplot(x='enrolled_university',data=df_test,palette="Set3",ax=ax[1],order=order_enrolled_university).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.7 Feature *'education_level'* - Education level of candidate


In [None]:
education_level_train = df_train['education_level']
education_level_test = df_test['education_level']

In [None]:
percentage(education_level_train)

In [None]:
percentage(education_level_test)

As you can see, the percentages are similar in both datasets.

In [None]:
order_education_level = percentage(education_level_train).index

In [None]:
# Side-by-side education-level counts for train and test, using the shared category order.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))

sns.countplot(x='education_level',data=df_train,palette="Set1",ax=ax[0],order=order_education_level).set_title('Train')
sns.countplot(x='education_level',data=df_test,palette="Set1",ax=ax[1],order=order_education_level).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.8 Feature *'major_discipline'* - Education major discipline of candidate


In [None]:
major_discipline_train = df_train['major_discipline']
major_discipline_test = df_test['major_discipline']

In [None]:
percentage(major_discipline_train)

In [None]:
percentage(major_discipline_test)

As you can see, the percentages are similar in both datasets.

In [None]:
order_major_discipline = percentage(major_discipline_train).index

In [None]:
# Side-by-side major-discipline counts for train and test, using the shared category order.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 10))

sns.countplot(x='major_discipline',data=df_train,palette="Set2",ax=ax[0],order=order_major_discipline).set_title('Train')
sns.countplot(x='major_discipline',data=df_test,palette="Set2",ax=ax[1],order=order_major_discipline).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.9 Feature *'experience'* - Candidate total experience in years


In [None]:
experience_train = df_train['experience']
experience_test = df_test['experience']

In [None]:
percentage(experience_train)

In [None]:
percentage(experience_test)

In [None]:
order_experience = percentage(experience_train).index

In [None]:
# Side-by-side experience counts for train and test, using the shared category order.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 12))

sns.countplot(x='experience',data=df_train,palette="Set3",ax=ax[0],order=order_experience).set_title('Train')
sns.countplot(x='experience',data=df_test,palette="Set3",ax=ax[1],order=order_experience).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.10 Feature *'company_size'* - No of employees in current employer's company

In [None]:
company_size_train = df_train['company_size']
company_size_test = df_test['company_size']

In [None]:
percentage(company_size_train)

In [None]:
percentage(company_size_test)

As you can see, the percentages are similar in both datasets.

In [None]:
order_company_size = percentage(company_size_train).index

In [None]:
# Side-by-side company-size counts for train and test, using the shared category order.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 12))

sns.countplot(x='company_size',data=df_train,palette="Set1",ax=ax[0],order=order_company_size).set_title('Train')
sns.countplot(x='company_size',data=df_test,palette="Set1",ax=ax[1],order=order_company_size).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.11 Feature *'company_type'* - Type of current employer

In [None]:
company_type_train = df_train['company_type']
company_type_test = df_test['company_type']

In [None]:
percentage(company_type_train)

In [None]:
percentage(company_type_test)

As you can see, the percentages are similar in both datasets.

In [None]:
order_company_type = percentage(company_type_train).index

In [None]:
# Side-by-side company-type counts for train and test, using the shared category order.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 12))

sns.countplot(x='company_type',data=df_train,palette="Set2",ax=ax[0],order=order_company_type).set_title('Train')
sns.countplot(x='company_type',data=df_test,palette="Set2",ax=ax[1],order=order_company_type).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.12 Feature *'last_new_job'* - Difference in years between previous job and current job

In [None]:
last_new_job_train = df_train['last_new_job']
last_new_job_test = df_test['last_new_job']

In [None]:
percentage(last_new_job_train)

In [None]:
percentage(last_new_job_test)

As you can see, the percentages are similar in both datasets.

In [None]:
order_last_new_job = percentage(last_new_job_train).index

In [None]:
# Side-by-side last-new-job counts for train and test, using the shared category order.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 8))

sns.countplot(x='last_new_job',data=df_train,palette="Set3",ax=ax[0],order=order_last_new_job).set_title('Train')
sns.countplot(x='last_new_job',data=df_test,palette="Set3",ax=ax[1],order=order_last_new_job).set_title('Test')
fig.tight_layout()
# plt.show() renders under the notebook's inline backend; fig.show() only emits a warning there.
plt.show()

## 3.13 Feature *'training_hours'* - Training hours completed

In [None]:
#Distribution of training hours - Train

sns.displot(data=df_train, x='training_hours',height=6,color = 'lightblue')

In [None]:
#Distribution of training hours - Test

# Plot the test set here (the cell previously re-plotted df_train).
sns.displot(data=df_test, x='training_hours',height=6,color = 'coral')

## 3.14 *'Target'* 

     0 - Not looking for job change
     1 - Looking for a job change

In [None]:
target = df_train['target']

In [None]:
percentage(target)

In [None]:
sns.countplot(x='target',data=df_train,palette="Set1").set_title('Train')

### Target by city development index

In [None]:
#Distribution

sns.displot(data=df_train, x='city_development_index', hue='target',kind='kde',height=6)

### Target by gender

In [None]:
sns.catplot(x="target",hue='gender',data=df_train,palette="Set1",kind="count",height=6)

### Target by relevant experience

In [None]:
sns.catplot(x="target",hue='relevent_experience',data=df_train,palette="Set2",kind="count",height=6)

### Target by enrolled university

In [None]:
sns.catplot(x="target",hue='enrolled_university',data=df_train,palette="Set3",kind="count",height=6)

### Target by education level

In [None]:
sns.catplot(x="target",hue='education_level',data=df_train,palette="Set1",kind="count",height=8) 

### Target by major discipline

In [None]:
sns.catplot(x="target",hue='major_discipline',data=df_train,palette="Set2",kind="count",height=8)

### Target by experience

In [None]:
sns.catplot(x="target",hue='experience',data=df_train,kind="count",height=16)

### Target by training hours

In [None]:
#Distribution

sns.displot(data=df_train, x='training_hours', hue='target',kind='kde',height=6)

## 4. Data Preprocessing

### 4.1 Unnecessary columns

In [None]:
#Remember the train dataset

df_train.head()

In [None]:
df_train.columns

I do not consider the following columns useful as features, so I drop them:

    'enrollee_id'

In [None]:
#I remove the column 'enrollee_id' as it has no use

df_train = df_train.drop(['enrollee_id'],axis=1)
df_train.head()

In [None]:
#Select the target

target = df_train['target']

In [None]:
#Drop the target

df_train = df_train.drop('target',axis=1)
df_train

## 4.2 Deal with NaN values

As seen previously, some columns have a significant number of NaN values. Let's check the percentage for the remaining columns.

In [None]:
percentage_nulls(df_train)

In [None]:
#View the dtype for every column

df_train.dtypes

In [None]:
#Select columns name with categorical data

cat_columns = df_train.columns[df_train.dtypes=='object']
cat_columns

### 4.2.1 Simple Imputation - Mode

As a first approach, I will do a simple imputation using the mode of each column.

In [None]:
df_train_impute_mode = df_train.copy()

In [None]:
# Fill missing values in every categorical column with that column's mode.
# The result is assigned back to the column: a chained
# `df[col].fillna(..., inplace=True)` acts on an intermediate object and is
# deprecated / has no effect on the DataFrame under pandas Copy-on-Write.
for columna in cat_columns:
    df_train_impute_mode[columna] = df_train_impute_mode[columna].fillna(
        df_train_impute_mode[columna].mode()[0]
    )

In [None]:
percentage_nulls(df_train_impute_mode)

## 4.3 Encoding of categorical variables