In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hr-analytics-job-change-of-data-scientists/sample_submission.csv
/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv
/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv


# The Dataset


In [2]:
data = pd.read_csv('/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
data.head(3)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0


## Description

* enrollee_id : Unique ID (Useless here)
* city: City code
* city_ development _index : Developement index of the city (scaled)
* gender: Gender of candidate
* relevent_experience: Relevant experience of candidate
* enrolled_university: Type of University course enrolled if any
* education_level: Education level of candidate
* major_discipline :Education major discipline of candidate
* experience: Total experience in years
* company_size: No of employees in current company
* company_type : Type of current company
* lastnewjob: Difference in years between previous job and current job
* training_hours: training hours completed

    **target: 0 – Not looking for job change, 1 – Looking for a job change**

## Data Exploration

For EDA, I used this library that provides almost all the relevent information we need. So we don't have to manually look around the data ( Saves time:) )


In [3]:
import pandas_profiling as pp
pp.ProfileReport(data)

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Preprocessing
## Label encoding "city" feature

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

data['city'] = le.fit_transform(data['city'])

# manual encoding ordinal features
Some of the categorical features in this dataset are ordinal, i.e,there is a clear ordering of the categories. So I have manually encoded these.

In [5]:
gender_map = {
    'Female' : 2,
    'Male' : 1,
    'Other' : 0
    }

relevent_experience_map = {
    'Has relevent experience' : 1,
    'No relevent experience' : 0
    }

enrolled_university_map = {
    'no_enrollment' : 0,
    'Part time course' : 1,
    'Full time course' : 2
    }

education_level_map = {
    'Primary School' :    0,
    'Graduate'       :    2,
    'Masters'        :    3, 
    'High School'    :    1, 
    'Phd'            :    4
    } 

major_map = {
    'STEM' : 0,
    'Business Degree' : 1,
    'Humanities' : 2,
    'Arts' : 3,
    'Other' : 4,
    'No Major' : 5
    }

experience_map = {
    '<1' : 0,
    '1' : 1,
    '2' : 2,
    '3' : 3,
    '4' : 4,
    '5' : 5,
    '6' : 6,
    '7' : 7,
    '8' : 8,
    '9' : 9,
    '10' : 10,
    '11' : 11,
    '12' : 12,
    '13' : 13,
    '14' : 14,
    '15' : 15,
    '16' : 16,
    '17' : 17,
    '18' : 18,
    '19' : 19,
    '20' : 20,
    '>20' : 21
    }

size_map = {
    '<10' : 0,
    '10/49' : 1,
    '50-99' : 2,
    '100-500' :3,
    '500-999' :4,
    '1000-4999': 5,
    '5000-9999' : 6,
    '10000+' : 7
    }

company_type_map = {
    'Pvt Ltd'               :    0,
    'Funded Startup'        :    1, 
    'Early Stage Startup'   :    2, 
    'Other'                 :    3, 
    'Public Sector'         :    4, 
    'NGO'                   :    5
}

last_new_job_map = {
    'never'        :    0,
    '1'            :    1, 
    '2'            :    2, 
    '3'            :    3, 
    '4'            :    4, 
    '>4'           :    5
}

data.loc[:,'education_level'] = data['education_level'].map(education_level_map)
data.loc[:,'company_size'] = data['company_size'].map(size_map)
data.loc[:,'company_type'] = data['company_type'].map(company_type_map)
data.loc[:,'last_new_job'] = data['last_new_job'].map(last_new_job_map)
data.loc[:,'major_discipline'] = data['major_discipline'].map(major_map)
data.loc[:,'enrolled_university'] = data['enrolled_university'].map(enrolled_university_map)
data.loc[:,'relevent_experience'] = data['relevent_experience'].map(relevent_experience_map)
data.loc[:,'gender'] = data['gender'].map(gender_map)
data.loc[:,'experience'] = data['experience'].map(experience_map)

# Handling missing data with knn
As we saw in the EDA report, there is a lot of missing data which we can deal with the help of KNN Imputer

In [6]:
from sklearn.impute import KNNImputer
knn_imputer = KNNImputer()
#making a copy just in case
copy = data.copy()

copy = knn_imputer.fit_transform(copy)
#rounding the knn values
copy[:, 3:] = np.round(copy[:, 3:])
data = pd.DataFrame(copy, columns = data.columns)

# One Hot Encoding 
Now, the categorical features which are not ordinal will be one hot encoded

 **Dividing numeric and categorical data**

In [7]:
numeric = data[["city_development_index", "training_hours", "target"]].copy()
category = data[["city", "gender", "relevent_experience", "enrolled_university", "education_level", "major_discipline", "experience", "company_size", "company_type", "last_new_job"]].copy()

#using the previously manual encoded columns
category_ordinalencoded = category[['education_level', 'experience', 'company_size', 'last_new_job']]


 **One Hot encoding the rest categorical columns**

In [8]:
#columns that need to be one hot encoded
one_how_columns = [ col for col in category.columns if col not in ['education_level', 'experience', 'company_size', 'last_new_job']]

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False).fit(category.loc[:, one_how_columns])
category_onehotEncoded = ohe.transform(category.loc[:, one_how_columns])
#joining all the category columns
category_preprocessed = np.concatenate([category_onehotEncoded, category_ordinalencoded], axis=1)

#joining all the features
X = np.concatenate([numeric.drop('target', axis=1).values, category_preprocessed], axis=1)
y = numeric['target'].values

## Imbalanced data
As we can see clearly, the data is very imbalanced which needs to be dealt with if we want our model to train properly.
For this, I used SMOTE(Synthetic Minority Oversampling Technique) to make the data balanced. 
> If you want to know more about SMOTE, check this [link](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

In [9]:
from imblearn.over_sampling import SMOTE
X, y = SMOTE().fit_resample(X, y)

## Train-test split

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Building the Random Forest Model

## Base Model

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train, y_train)
y_base = rf.predict(X_test)
from sklearn.metrics import confusion_matrix
cm_base =  confusion_matrix(y_test, y_base)
cm_base

array([[2568,  308],
       [ 586, 2291]])

## Using RandomSearch to find the optimal parameters

RandomizedSearchCV randomly goes through the combination of parameters and gives the best one found. It does not give the absolute best parameters but its usually pretty close and helps in reducing the iteratons in Gridsearch

In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 34.0min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 63.5min finished


{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 110,
 'bootstrap': False}

Comparing the Random search model

In [13]:
best_random = rf_random.best_estimator_
y_rand = best_random.predict(X_test)

#confusion matrix
from sklearn.metrics import confusion_matrix
cm_rand = confusion_matrix(y_test, y_rand)
cm_rand

array([[2558,  318],
       [ 537, 2340]])

## Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [None, 20, 50, 70, 100],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [2],
    'min_samples_split': [5],
    'n_estimators': [400, 600, 800, 1200, 1400, 1600]
}
# Create a base model
rfc = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rfc, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

y_grid = grid_search.predict(X_test)
cm_grid = confusion_matrix(y_test, y_grid)
cm_grid


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 35.7min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 42.4min finished


array([[2559,  317],
       [ 540, 2337]])

**NOTE: I used random search twice and created the param_grid based on those two results (Yes, I got two different best_params_!)**

## Accuracy

In [20]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, y_grid))
accuracy_score(y_test, y_grid)

              precision    recall  f1-score   support

         0.0       0.83      0.89      0.86      2876
         1.0       0.88      0.81      0.85      2877

    accuracy                           0.85      5753
   macro avg       0.85      0.85      0.85      5753
weighted avg       0.85      0.85      0.85      5753



0.8510342430036503