In [None]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer
import pandas_profiling as pp
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV



import os

# Show every file available under the Kaggle input directory so the dataset
# path used below can be checked against what is actually mounted.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# The Dataset


In [None]:
# Read the training split into a DataFrame; every later cell works on `data`.
data = pd.read_csv(
    '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
)
# Preview the first three rows as the cell output.
data.head(n=3)

## Description
With the help of these features, we have to predict which data scientists are looking for a job change and are therefore better prospects for hiring.

* enrollee_id : Unique ID (Useless here)
* city: City code
* city_development_index: Development index of the city (scaled)
* gender: Gender of candidate
* relevent_experience: Relevant experience of candidate
* enrolled_university: Type of University course enrolled if any
* education_level: Education level of candidate
* major_discipline :Education major discipline of candidate
* experience: Total experience in years
* company_size: No of employees in current company
* company_type : Type of current company
* last_new_job: Difference in years between previous job and current job
* training_hours: training hours completed

    **target: 0 – Not looking for job change, 1 – Looking for a job change**

## Data Exploration

For EDA, I used the pandas-profiling library, which provides almost all the relevant information we need, so we don't have to look around the data manually (saves time :) ).


In [None]:
# Build the automated EDA report for the raw data; leaving it as the last
# expression of the cell makes the notebook render it.
profile_report = pp.ProfileReport(data)
profile_report

# Preprocessing
**Label encoding "city" feature**

In [None]:
# Replace the city codes with integer labels, overwriting the column in `data`.
le = LabelEncoder()
le.fit(data['city'])
data['city'] = le.transform(data['city'])

# Show the first five encoded values.
data['city'].head()

**Manual encoding ordinal features**
Some of the categorical features in this dataset are ordinal, i.e., there is a clear ordering of the categories. So I have manually encoded these.

In [None]:
# Lookup tables (category label -> integer code), one per categorical column.
# Labels that are missing from the data, or not listed in a table, come out of
# `Series.map` as NaN and are left for the imputation step further down.

gender_map = {
    'Female': 2,
    'Male': 1,
    'Other': 0,
}

relevent_experience_map = {
    'Has relevent experience': 1,
    'No relevent experience': 0,
}

enrolled_university_map = {
    'no_enrollment': 0,
    'Part time course': 1,
    'Full time course': 2,
}

education_level_map = {
    'Primary School': 0,
    'High School': 1,
    'Graduate': 2,
    'Masters': 3,
    'Phd': 4,
}

major_map = {
    'STEM': 0,
    'Business Degree': 1,
    'Humanities': 2,
    'Arts': 3,
    'Other': 4,
    'No Major': 5,
}

# '<1' -> 0, '1'..'20' -> 1..20, '>20' -> 21
experience_map = {
    '<1': 0,
    **{str(years): years for years in range(1, 21)},
    '>20': 21,
}

# Keys are the labels exactly as spelled in the original tables
# (including '10/49').
size_map = {
    '<10': 0,
    '10/49': 1,
    '50-99': 2,
    '100-500': 3,
    '500-999': 4,
    '1000-4999': 5,
    '5000-9999': 6,
    '10000+': 7,
}

company_type_map = {
    'Pvt Ltd': 0,
    'Funded Startup': 1,
    'Early Stage Startup': 2,
    'Other': 3,
    'Public Sector': 4,
    'NGO': 5,
}

last_new_job_map = {
    'never': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '>4': 5,
}

# Which table encodes which column.
column_maps = {
    'education_level': education_level_map,
    'company_size': size_map,
    'company_type': company_type_map,
    'last_new_job': last_new_job_map,
    'major_discipline': major_map,
    'enrolled_university': enrolled_university_map,
    'relevent_experience': relevent_experience_map,
    'gender': gender_map,
    'experience': experience_map,
}

# NOTE: run this cell once per load of `data`. Mapping already-encoded
# (numeric) values through these string-keyed tables would turn every value
# into NaN.
for column, mapping in column_maps.items():
    # Plain column assignment replaces the column, so the result gets a
    # numeric dtype. `data.loc[:, column] = ...` writes into the existing
    # object column in place on recent pandas and leaves it as object dtype.
    data[column] = data[column].map(mapping)

# Handling missing data with knn
As we saw in the EDA report, there is a lot of missing data, which we can handle with the help of the KNN imputer.

In [None]:
knn_imputer = KNNImputer()

# Keep the identifier and the label out of the imputation: the ID is an
# arbitrary number that would otherwise take part in the neighbour distance,
# and using the target to fill in features leaks the label into the inputs.
non_feature_columns = ['enrollee_id', 'target']
feature_columns = [col for col in data.columns if col not in non_feature_columns]

imputed = pd.DataFrame(
    knn_imputer.fit_transform(data[feature_columns]),
    columns=feature_columns,
    index=data.index,
)

# Imputed values are neighbour averages; round the encoded columns back to
# whole category codes. 'city' and 'city_development_index' are left as they
# are, matching the original positional slice `3:` (assumes the first three
# columns are enrollee_id, city, city_development_index - TODO confirm).
unrounded_columns = ['city', 'city_development_index']
rounded_columns = [col for col in feature_columns if col not in unrounded_columns]
imputed[rounded_columns] = imputed[rounded_columns].round()

# Work on a fresh frame so the pre-imputation object is not modified in place.
data = data.copy()
data[feature_columns] = imputed

# One Hot Encoding 
Now, the categorical features which are not ordinal will be one hot encoded

 **Dividing numeric and categorical data**

In [None]:
# Column groups used to assemble the model matrix.
numeric_columns = ["city_development_index", "training_hours", "target"]
categorical_columns = [
    "city",
    "gender",
    "relevent_experience",
    "enrolled_university",
    "education_level",
    "major_discipline",
    "experience",
    "company_size",
    "company_type",
    "last_new_job",
]
# These keep their manual integer encoding and are not one-hot encoded.
ordinal_columns = ['education_level', 'experience', 'company_size', 'last_new_job']

# Independent copies of the two groups, so later edits do not touch `data`.
numeric = data.loc[:, numeric_columns].copy()
category = data.loc[:, categorical_columns].copy()

# Reuse the previously manually encoded columns as they are.
category_ordinalencoded = category[ordinal_columns]


 **One-hot encoding the remaining categorical columns**

In [None]:
# Columns that need to be one-hot encoded: every categorical column that is
# not kept as an ordinal code.
ordinal_names = ['education_level', 'experience', 'company_size', 'last_new_job']
one_how_columns = [col for col in category.columns if col not in ordinal_names]

# The `sparse=False` argument was renamed to `sparse_output` and later removed
# from scikit-learn. Taking the encoder's default (sparse) output and
# densifying it with `.toarray()` gives the same dense float matrix on old and
# new versions alike.
ohe = OneHotEncoder().fit(category.loc[:, one_how_columns])
category_onehotEncoded = ohe.transform(category.loc[:, one_how_columns]).toarray()

# Join the one-hot block with the ordinal codes (one-hot columns first).
category_preprocessed = np.concatenate(
    [category_onehotEncoded, category_ordinalencoded], axis=1
)

# Final feature matrix: numeric features (without the label) followed by the
# encoded categorical features. The label becomes `y`.
X = np.concatenate(
    [numeric.drop('target', axis=1).values, category_preprocessed], axis=1
)
y = numeric['target'].values

## Imbalanced data
As we can see clearly, the data is very imbalanced, which needs to be dealt with if we want our model to train properly.
For this, I used SMOTE (Synthetic Minority Oversampling Technique) to make the data balanced.
> If you want to know more about SMOTE, check this [link](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

In [None]:
# Oversample the minority class with synthetic samples; X and y are replaced
# by the resampled arrays.
# NOTE(review): this runs on the full dataset, before the train/test split in
# the next cell, so the test split will contain synthetic rows and rows that
# were used to synthesise training samples. Consider splitting first and
# resampling only the training part - confirm before trusting the scores.
smote = SMOTE(random_state=99)
X, y = smote.fit_resample(X, y)

## Train-test split

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions equal in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=99,
)

# Comparing models
### Decision tree

In [None]:
# Baseline 1: a single decision tree using entropy as the split criterion.
tree_params = {'criterion': 'entropy', 'random_state': 99}
dtc = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# Accuracy on the held-out split (shown as the cell output).
y_dtc = dtc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_dtc)

Decision tree gives us a **79.2%** accuracy
### SVM

In [None]:
# Baseline 2: RBF-kernel SVM. The SVM needs scaled data, so standardise first.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training split only, then apply them to
# both splits; X_train and X_test themselves are left untouched.
sc_X = StandardScaler()
sc_X.fit(X_train)
train_copy = sc_X.transform(X_train)
test_copy = sc_X.transform(X_test)

svm = SVC(kernel='rbf', random_state=0).fit(train_copy, y_train)

# Accuracy on the (scaled) held-out split, shown as the cell output.
y_svm = svm.predict(test_copy)
accuracy_score(y_true=y_test, y_pred=y_svm)

SVM gives us a **79.1%** accuracy
### Random Forest

In [None]:
# Baseline 3: random forest with default hyper-parameters and a fixed seed.
rf1 = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Accuracy on the held-out split (shown as the cell output).
y_rf1 = rf1.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_rf1)

Random forest performs the best among these with an accuracy of **84.2%**

So, we'll finetune the random forest model to increase the performance.

# Tuning the Random Forest Model

## Base Model

In [None]:
# Reference model for the tuning section: default random forest, seed 99.
rf = RandomForestClassifier(random_state=99).fit(X_train, y_train)

# Confusion matrix of the untuned model on the held-out split, to compare the
# tuned models against.
y_base = rf.predict(X_test)
cm_base = confusion_matrix(y_true=y_test, y_pred=y_base)
cm_base

## Using RandomSearch to find the optimal parameters

RandomizedSearchCV randomly samples combinations of parameters and returns the best one found. It does not give the absolute best parameters, but it's usually pretty close and helps reduce the number of iterations needed in the grid search.

In [None]:
# Number of trees in the random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' is no longer accepted by recent scikit-learn releases, and for a
# classifier it used to mean the same as 'sqrt', so listing both only
# duplicated one setting. 'log2' gives the search a second, distinct option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit)
max_depth = list(range(10, 111, 10)) + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Create the random grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

# Random search over the grid: 100 sampled combinations, 3-fold cross
# validation, all available cores. `rf` (the base model) is the estimator
# template, and the fixed random_state makes the sampled combinations
# repeatable.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, y_train)

# Best combination found (shown as the cell output).
rf_random.best_params_

Comparing the Random search model

In [None]:
# Forest refitted with the best parameters found by the random search.
best_random = rf_random.best_estimator_

# Confusion matrix on the held-out split, to compare against `cm_base`.
y_rand = best_random.predict(X_test)
cm_rand = confusion_matrix(y_true=y_test, y_pred=y_rand)
cm_rand

## Grid Search

In [None]:
# Create the parameter grid based on the results of random search.
param_grid = {
    'bootstrap': [False],
    'max_depth': [None, 20, 50, 70, 100],
    # 'auto' is no longer accepted by recent scikit-learn releases, and for a
    # classifier it used to be the same setting as 'sqrt', so a single entry
    # covers both and halves the number of fits.
    'max_features': ['sqrt'],
    'min_samples_leaf': [2],
    'min_samples_split': [5],
    'n_estimators': [400, 600, 800, 1200, 1400, 1600],
}

# Base model for the search. It is seeded like the other forests in this
# notebook so that re-running the cell gives the same result.
rfc = RandomForestClassifier(random_state=99)

# Exhaustive search over the grid with 3-fold cross validation on all cores.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# A bare expression in the middle of a cell is not displayed, so print it.
print(grid_search.best_params_)

# Confusion matrix of the refitted best model on the held-out split.
y_grid = grid_search.predict(X_test)
cm_grid = confusion_matrix(y_test, y_grid)
cm_grid


**NOTE: I used random search twice and created the param_grid based on those two results (Yes, I got two different best_params_!)**

## Accuracy

In [None]:
# Per-class precision / recall / F1 of the grid-search model on the held-out
# split, followed by its overall accuracy as the cell output.
grid_report = classification_report(y_true=y_test, y_pred=y_grid)
print(grid_report)
accuracy_score(y_true=y_test, y_pred=y_grid)