# Is the candidate looking for a new job? by Martina Raabe


<img src="https://images.unsplash.com/photo-1455849318743-b2233052fcff?ixid=MXwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHw%3D&ixlib=rb-1.2.1&auto=format&fit=crop&w=1950&q=80" width="500" height="400" align="center"/>


[Source](https://unsplash.com/@goian)

<h1 style='background: black; border:1; color: white'><center>Introduction</center></h1>


This dataset is designed to help understand the factors that lead a person to leave their current job. This notebook uses XGBoost's `XGBClassifier` to predict whether a person is likely to look for a new job. The data is divided into a train and a test set.

The following steps are performed in the notebook:

1.  The training data is cleaned and missing values are handled via imputation and substitution.
2.  A short EDA is performed with the goal to better understand the data.
3.  A model (XGBoost) is trained in order to predict whether a candidate is looking for a new job or not.


If you like this notebook, please don't forget to **upvote**. Thanks!

<h1 style='background: black; border:1; color: white'><center>Importing, preparing and getting to know the data</center></h1>

**Features**

enrollee_id : Unique ID for candidate

city: City code

city_development_index: Development index of the city (scaled)

gender: Gender of candidate

relevent_experience: Relevant experience of candidate

enrolled_university: Type of University course enrolled if any

education_level: Education level of candidate

major_discipline :Education major discipline of candidate

experience: Candidate total experience in years

company_size: No of employees in current employer's company

company_type : Type of current employer

last_new_job: Difference in years between previous job and current job

training_hours: training hours completed

target: 0 – Not looking for job change, 1 – Looking for a job change

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set() # Setting seaborn as default style even if use only matplotlib

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import roc_curve


import os
# Print the entries of the ../input directory (the folder the dataset is read from below)
print(os.listdir("../input"))


In [None]:
# Load the training split (aug_train.csv) into a DataFrame and preview the first 5 rows

data = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
data.head(5)

In [None]:
# Summary statistics of the DataFrame's columns
data.describe()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values
data.info()

The above output shows that the column 'gender' has the most missing values. All missing values will be handled and imputed or substituted.


In [None]:
# Fill missing 'gender' values with the last valid value.
# Series.ffill() is used because fillna(method='ffill') is deprecated in recent pandas.
data['gender'] = data['gender'].ffill()

# A missing 'experience' is treated as 0 years
data['experience'] = data['experience'].fillna(0)


# Replace missing values with the most common value of the respective column
cols_nan_replace = ['enrolled_university','education_level', 'major_discipline', 'company_size', 'company_type', 'last_new_job']
for col in cols_nan_replace:
    most_common_value = data[col].mode().iloc[0]
    data[col] = data[col].fillna(most_common_value)


# Map the open-ended categories to plain numbers so both columns can be cast to int:
# '>20' -> 21 and '<1' -> 0 for 'experience', '>4' -> 5 and 'never' -> 0 for 'last_new_job'
data['experience'] = data['experience'].replace({'>20': 21, '<1': 0})
data['last_new_job'] = data['last_new_job'].replace({'>4': 5, 'never': 0})

# Remaining values are digit strings (or ints inserted above); cast both columns to int
convert_cols = ['experience', 'last_new_job']
data[convert_cols] = data[convert_cols].astype(int)

In [None]:
# Dropping the columns 'city' and 'enrollee_id', which are not used in the analysis below

data.drop(['city', 'enrollee_id'], inplace=True, axis=1)

In [None]:
# Draw one countplot per selected column of the cleaned data, side by side

cols = ['gender', 'education_level', 'relevent_experience', 'major_discipline']

fig, axes = plt.subplots(1, 4, figsize=(20, 5))
fig.suptitle('Countplots of various columns')

# Pair every subplot axis with the column it should show
for ax, col in zip(axes, cols):
    sns.countplot(ax=ax, data=data, x=col, palette='rocket')
    ax.set_title('Count of {}'.format(col))
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params('x', labelrotation=70)

plt.show()


In [None]:
# Plot the mean of training hours per gender, highest mean first
training_hours = data.groupby('gender')['training_hours'].mean().sort_values(ascending=False)
training_hours = pd.DataFrame(training_hours)

training_hours.plot(kind='bar', color='green')
plt.title('Mean of training hours by gender')
plt.xticks(rotation=70)
plt.xlabel('')
plt.legend().remove()

# Add the rounded mean as a label on top of every bar.
# The bars are addressed by their integer position (0, 1, 2, ...) and the values
# are iterated directly: looking up s[i] with an integer on a Series indexed by
# gender labels relies on a deprecated positional fallback.
for position, value in enumerate(training_hours['training_hours'].to_numpy()):
    plt.annotate(str(round(value, 2)), xy=(position, value), ha='center', va='bottom')

plt.show()

The graphic shows that the gender 'Other' had the most training hours whereas the men have the least training hours.

<h1 style='background: black; border:1; color: white'><center>OneHotEncoding of categorical features</center></h1>

In [None]:
# Separate the features (every column except the last) from the target (the last column)

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [None]:
# One-hot encode the categorical features with pd.get_dummies.
# Only the object-dtype columns are passed as `columns`; everything else is kept as is.

# collect the names of all object-dtype columns
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']

# drop_first=True drops the first level of every encoded column
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

n_cols_before = X.shape[1]
n_cols_after = X_encoded.shape[1]
print('The shape of the Dataframe changed from formerly {}'.format(n_cols_before), 'to now {}'.format(n_cols_after), 'columns.' )

In [None]:
# Give the company-size dummies containing '+' and '<' plain identifier-like names.
# NOTE(review): presumably needed because XGBoost rejects '<' in feature names - confirm.
# '<10' means fewer than 10 employees, so the new name says "less", not "more".
X_encoded.rename(columns={'company_size_10000+': 'company_size_10000_more', 'company_size_<10': 'company_size_less_10'}, inplace=True)

<h1 style='background: black; border:1; color: white'><center>Prediction with XGBoost</center></h1>

In [None]:
# Draw one countplot per selected column of the encoded features, side by side

cols = ['experience', 'last_new_job', 'gender_Male', 'relevent_experience_No relevent experience']

fig, axes = plt.subplots(1, 4, figsize=(20, 5))
fig.suptitle('Countplots of various columns')

# Pair every subplot axis with the column it should show
for ax, col in zip(axes, cols):
    sns.countplot(ax=ax, data=X_encoded, x=col, palette='rocket')
    ax.set_title('Count of {}'.format(col))
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params('x', labelrotation=70)

plt.show()

We see that the data is imbalanced. This will affect the performance of the model. Hence, the data will be balanced in the next step.

In [None]:
# As the data is imbalanced we use SMOTE for balancing of the data.
# NOTE(review): the resampling is applied to the full dataset here, i.e. before
# the train/test split done further below, so synthetic samples can be derived
# from rows that later land in the test set. Consider splitting first and
# resampling only the training part.

smote = SMOTE(random_state = 402)
X_smote, y_smote = smote.fit_resample(X_encoded,y)

In [None]:
# Redraw the countplots on the resampled features to see how SMOTE changed the distribution

cols = ['experience', 'last_new_job', 'gender_Male', 'relevent_experience_No relevent experience']

fig, axes = plt.subplots(1, 4, figsize=(20, 5))
fig.suptitle('Countplots of various columns')

# Pair every subplot axis with the column it should show
for ax, col in zip(axes, cols):
    sns.countplot(ax=ax, data=X_smote, x=col, palette='rocket')
    ax.set_title('Count of {}'.format(col))
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params('x', labelrotation=70)

plt.show()

In [None]:
# Splitting the dataset into train (70%) and test (30%) set

X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.3, random_state=42)


# Standardizing the features with StandardScaler.
# The scaler is fitted on the training data only and the test data is transformed
# with those training statistics. Calling sc.fit(X_test) instead would refit the
# scaler on the test data and return the scaler object, not the scaled matrix.

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Instantiate XGBoost Classifier
# Define params for RandomizedSearchCV
# Instantiate RandomizedSearchCV object with params

# random_state is the documented scikit-learn-style seed parameter of XGBClassifier
clf = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Candidate values the search samples from
params = {'max_depth': np.arange(2, 10),
          'n_estimators': [5, 10, 15, 20, 25],
          'colsample_bytree': [0.3, 0.7],
          'subsample': [0.4, 0.6, 0.8, 1.0]
         }

# 5 sampled candidates, each scored by 5-fold cross-validated ROC AUC.
# random_state fixes which candidates are drawn, so reruns give the same result,
# in line with the seeded SMOTE, split and classifier.
randomized_cv = RandomizedSearchCV(estimator=clf, param_distributions=params, scoring='roc_auc', n_iter=5, cv=5, verbose=1, n_jobs=1,
                                   return_train_score=True, random_state=42)

# Fit the data
randomized_cv.fit(X_train,y_train)

In [None]:
# Report the best parameter combination found by the search and its cross-validated score
print("Best parameters found: ", randomized_cv.best_params_)

print("Best score found: ", randomized_cv.best_score_)

In [None]:
# Instantiating a classifier with the params obtained from the randomized search.
# The values are taken from randomized_cv.best_params_ rather than typed in by hand,
# so the model always matches the search result. random_state keeps the row/column
# subsampling reproducible.

clf = xgb.XGBClassifier(random_state=42, **randomized_cv.best_params_)

model_fit = clf.fit(X_train, y_train)

In [None]:
# Probability of the positive class (second column of predict_proba), then a
# boolean prediction that is True where the probability exceeds the 0.5 threshold.
# NOTE(review): the predictions are made on X_train, so everything derived from
# them describes the fit on the training data, not performance on unseen data.

positive_class_proba = clf.predict_proba(X_train)[:, 1]
y_proba = positive_class_proba
y_pred = y_proba > 0.5

In [None]:
# ROC curve chart
# NOTE(review): y_train / y_proba are training-set values, so this is the training ROC/AUC.

fallout, sensitivity, thresholds = roc_curve(y_train, y_proba)
plt.plot(fallout, sensitivity, color = 'darkorange')
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title ('Area under the curve')
plt.show()

# Printing AUC score.
# The score is computed from the predicted probabilities, the same scores the
# curve above is drawn from. Thresholded labels would reduce the curve to a
# single operating point and give a different number.
print ('The AUC score is {}'.format (round(metrics.roc_auc_score(y_train, y_proba),3)))

In [None]:
# Get the feature importance ('weight' = number of times a feature is used in a split)
feature_importance_dict = {}
feature_importance = clf.get_booster().get_score(importance_type = 'weight')

# get_score() only contains features that were actually used in a split, so its
# values cannot be paired with X_encoded.columns by position. Each key is resolved
# to its column instead: a key that is itself a column name is used directly,
# otherwise it is expected to be a positional id of the form 'f<index>'
# (the names a booster assigns when trained on a plain array).
encoded_columns = X_encoded.columns
for feat_key, importance in feature_importance.items():
    if feat_key in encoded_columns:
        feat = feat_key
    else:
        feat = encoded_columns[int(feat_key[1:])]
    feature_importance_dict[feat] = importance

# Print the 5 most important features (ascending, most important last)
print (sorted(feature_importance_dict.items(), key=lambda x:x[1])[-5:])