# General Information

* First, I will clean the data to improve the classifier's performance.
* Next, I'll run a Bayesian optimization of the Random Forest's hyperparameters. For that, I will use the scikit-optimize package.
* And, I will end with the results of the optimized classifier.

# Importing Packages

In [None]:
import numpy as np

import pandas as pd
pd.options.plotting.backend = "plotly"

import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (train_test_split, ShuffleSplit, cross_val_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix

from skopt.space import (Real,
                         Integer,
                         Categorical)
from skopt.utils import use_named_args
from skopt import (gp_minimize, 
                   dump, 
                   load)
from skopt.plots import plot_convergence
from skopt.callbacks import CheckpointSaver

import json

# Dataset

## Reading dataset

In [None]:
# Rows past the first 310 are null (per the original note), so only
# rows 0-309 of the CSV are kept.
df = pd.read_csv('../input/human-resources-data-set/HRDataset_v13.csv').iloc[:310]

In [None]:
df

## Missing data

There are 3 features with more than 30% of missing data.

In [None]:
df.isnull().sum()/df.shape[0]

### Deleting features with more than 30% missing data  

In [None]:
# Fraction of missing values per column; keep only the columns whose
# fraction is below 30%.
missing_ratio = df.isnull().mean()
df = df.loc[:, missing_ratio < 0.3]

In [None]:
df.isnull().sum()

### Dropping samples with missing data
Doing this because it's just 3% of the whole dataset (9 samples).

#### The samples with missing data

In [None]:
df[df.isnull().any(axis=1)]

#### Deleting samples

In [None]:
# Discard every row that still has a missing value, then renumber the
# index from 0 without keeping the old index as a column.
df = df.dropna(axis=0).reset_index(drop=True)

## Feature Analysis

In [None]:
df.dtypes

### Numeric features

I'm going to use 4.

In [None]:
df[['PayRate', 'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount']]

#### PayRate

In [None]:
df['PayRate'].plot.hist()

#### EngagementSurvey

In [None]:
df['EngagementSurvey'].plot.hist()

#### EmpSatisfaction

In [None]:
df['EmpSatisfaction'].plot.hist()

#### SpecialProjectsCount

In [None]:
df['SpecialProjectsCount'].plot.hist()

### Categorical features

I'm going to use 10.

In [None]:
df[['Position', 'State', 'RecruitmentSource', 'ManagerName', 'Sex', 
    'MaritalDesc', 'CitizenDesc', 'HispanicLatino','RaceDesc', 'Department']]

#### Position

In [None]:
df['Position'].plot.hist()

#### State

In [None]:
df['State'].plot.hist()

Let's transform the feature into "MA" and "non-MA" categories.

In [None]:
# Collapse every state other than 'MA' into a single 'non-MA' category.
# Only the 'State' column is rewritten: a frame-wide df.replace(state, ...)
# would also change any other column that happens to hold the same value
# as one of the state codes.
df['State'] = df['State'].where(df['State'] == 'MA', 'non-MA')

In [None]:
df['State'].plot.hist()

#### RecruitmentSource

In [None]:
df['RecruitmentSource'].plot.hist()

Adding 'Pay Per Click', 'On-line Web application', 'Careerbuilder' and 'Company Intranet - Partner' into 'Other'

In [None]:
# Merge the four listed recruitment sources into the 'Other' category.
# The result is assigned back to the column rather than calling
# replace(inplace=True) on df['RecruitmentSource']: an in-place call on a
# selected column is chained assignment, which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is enabled.
merged_sources = [
    'Pay Per Click',
    'On-line Web application',
    'Careerbuilder',
    'Company Intranet - Partner',
]
df['RecruitmentSource'] = df['RecruitmentSource'].replace(merged_sources, 'Other')

In [None]:
df['RecruitmentSource'].plot.hist()

#### ManagerName

In [None]:
df['ManagerName'].plot.hist()

#### Sex

In [None]:
df['Sex'].plot.hist()

#### MaritalDesc

In [None]:
df['MaritalDesc'].plot.hist()

#### CitizenDesc

In [None]:
df['CitizenDesc'].plot.hist()

Let's transform the feature into "US Citizen" and "non-US Citizen" categories.

In [None]:
# Collapse every citizenship status other than 'US Citizen' into a single
# 'non-US Citizen' category.
# Only the 'CitizenDesc' column is rewritten: a frame-wide
# df.replace(value, ...) would also change any other column that happens
# to hold the same value.
df['CitizenDesc'] = df['CitizenDesc'].where(
    df['CitizenDesc'] == 'US Citizen', 'non-US Citizen'
)

In [None]:
df['CitizenDesc'].plot.hist()

#### HispanicLatino

In [None]:
df['HispanicLatino'].plot.hist()

Correcting the capital letters problem.

In [None]:
# Normalise the lower-case spellings so 'yes'/'Yes' and 'no'/'No' are
# counted as the same category.
# Assigned back to the column instead of replace(inplace=True) on the
# selected column, which is chained assignment and does not update df when
# pandas Copy-on-Write is enabled.
df['HispanicLatino'] = df['HispanicLatino'].replace({'yes': 'Yes', 'no': 'No'})

In [None]:
df['HispanicLatino'].plot.hist()

#### RaceDesc

In [None]:
df['RaceDesc'].plot.hist()

Let's merge the "American Indian or Alaska Native", "Two or more races" and "Hispanic" categories into a single class "Others".

In [None]:
# Merge the three listed race categories into a single 'Others' class.
# Assigned back to the column instead of replace(inplace=True) on the
# selected column, which is chained assignment and does not update df when
# pandas Copy-on-Write is enabled.
merged_races = [
    'American Indian or Alaska Native',
    'Two or more races',
    'Hispanic',
]
df['RaceDesc'] = df['RaceDesc'].replace(merged_races, 'Others')

In [None]:
df['RaceDesc'].plot.hist()

There is no need to use the "HispanicLatino" feature. It is redundant.

In [None]:
# Remove the 'HispanicLatino' column from the frame.
df = df.drop(columns=['HispanicLatino'])

#### Department

In [None]:
df['Department'].plot.hist()

Let's remove the single "Executive Office" sample from the dataset. It is an outlier.

In [None]:
# Show the frame's shape before and after removing the rows whose
# department is 'Executive Office'.
print(df.shape)
executive_office_rows = df.index[df['Department'] == 'Executive Office']
df = df.drop(index=executive_office_rows)
print(df.shape)

In [None]:
df['Department'].plot.hist()

### Target

I'm going to create a model that predicts the Performance Score.

In [None]:
df['PerformanceScore'].plot.hist()

Let's merge the "PIP" and "Needs Improvement" categories into a single class "Bad", and likewise the "Fully Meets" and "Exceeds" categories into a single class "Good".

In [None]:
# Binarise the target: 'PIP' and 'Needs Improvement' become 'Bad',
# 'Fully Meets' and 'Exceeds' become 'Good'.
# Assigned back to the column instead of replace(inplace=True) on the
# selected column, which is chained assignment and does not update df when
# pandas Copy-on-Write is enabled.
score_classes = {
    'PIP': 'Bad',
    'Needs Improvement': 'Bad',
    'Fully Meets': 'Good',
    'Exceeds': 'Good',
}
df['PerformanceScore'] = df['PerformanceScore'].replace(score_classes)

In [None]:
df['PerformanceScore'].plot.hist()

## Encoding data

In [None]:
# Feature matrix: the four numeric columns first, followed by the nine
# categorical columns. The target is the binarised performance score.
numeric_features = ['PayRate', 'EngagementSurvey', 'EmpSatisfaction',
                    'SpecialProjectsCount']
categorical_features = ['Position', 'State', 'RecruitmentSource',
                        'ManagerName', 'Sex', 'MaritalDesc', 'CitizenDesc',
                        'RaceDesc', 'Department']

X = df[numeric_features + categorical_features]

y = df['PerformanceScore']

In [None]:
# One-hot encode the columns from position 4 to the end of X; the columns
# before position 4 are passed through unchanged.
encoded_columns = slice(4, X.shape[1])
one_hot_step = ('encoder', OneHotEncoder(), encoded_columns)
ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
X = ct.fit_transform(X)

# Random Forest

In [None]:
# Label for the model built in this section (not referenced by the cells
# visible in this notebook).
model_name = 'random_forest'

In [None]:
# Number of columns in the encoded feature matrix.
input_shape = X.shape[1]
print(f'input_shape = {input_shape}')

# Number of target columns: a one-dimensional target counts as a single
# output. The target variable is `y` (lower case); the check is explicit
# so that a mistyped name raises instead of being swallowed by `except`.
output_shape = y.shape[1] if getattr(y, 'ndim', 1) > 1 else 1
print(f'output_shape = {output_shape}')

# Number of rows in the encoded feature matrix.
n_samples = X.shape[0]
print(f'n_samples = {n_samples}')

In [None]:
# 20 random 80/20 splits for the final evaluation (cv) and for the
# hyperparameter search (cv_opt).
# Integer seeds make the splits reproducible. For cv_opt this also means
# every call of the objective is scored on the same folds, so candidates
# are compared on identical data instead of on fresh random splits.
# Different seeds keep the evaluation folds distinct from the search folds.
cv = ShuffleSplit(n_splits=20, test_size=0.20, random_state=0)
cv_opt = ShuffleSplit(n_splits=20, test_size=0.20, random_state=1)

## Hyperparameters Optimization

In [None]:
# Names of the Random Forest hyperparameters to tune; the search space is
# built by indexing into this list, so the order matters.
hyperparams_names = [
    'n_estimators',
    'max_depth',
    'max_features',
    'min_samples_split',
    'min_samples_leaf',
]

In [None]:
# Inclusive integer bounds for each hyperparameter, listed in the same
# order as hyperparams_names. max_features is capped at the number of
# encoded input columns.
hyperparams_bounds = [
    (64, 1024),
    (2, 256),
    (2, input_shape),
    (2, 16),
    (1, 16),
]
space = [Integer(low, high, name=name)
         for (low, high), name in zip(hyperparams_bounds, hyperparams_names)]

In [None]:
@use_named_args(space)
def objective(**hyperparams):
    """Score one hyperparameter candidate for the optimizer.

    Prints the candidate, cross-validates a RandomForestClassifier built
    from it on (X, y) with the cv_opt splitter, and returns the negated
    mean score so that minimising this function maximises the score.
    """
    print(hyperparams)

    candidate = RandomForestClassifier(**hyperparams)
    fold_scores = cross_val_score(candidate, X, y, cv=cv_opt)

    return -np.mean(fold_scores)

In [None]:
# Run the optimization over `space`: 100 evaluations of the objective,
# the first 20 of them at random points.
search_options = dict(n_calls=100, n_random_starts=20,
                      random_state=0, verbose=1)
res_gp = gp_minimize(objective, space, **search_options)

In [None]:
plot_convergence(res_gp)
plt.show()

In [None]:
# Pair each hyperparameter name with the value at the same position in
# the optimizer's best point.
best_hyperparams = dict(zip(hyperparams_names, res_gp.x))
print(f'best_hyperparams = {best_hyperparams}')

## Cross-validation

In [None]:
# Use the hyperparameters selected by the optimization for the classifiers
# built in the cross-validation and prediction cells below.
hyperparams = best_hyperparams

In [None]:
# Cross-validate the tuned classifier with the cv splitter and report the
# mean and standard deviation of the per-split scores.
clf = RandomForestClassifier(**hyperparams)
fold_scores = cross_val_score(clf, X, y, cv=cv)
cross_val_scores = pd.DataFrame(fold_scores, columns=['Random Forest'])

mean_score = cross_val_scores.mean().values[0]
std_score = cross_val_scores.std().values[0]
print(f'cross validation score (accuracy): {round(mean_score, 2)} +/- {round(std_score, 2)}')

In [None]:
cross_val_scores.plot.box()

## Prediction

In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# target so that the train and test sets keep the same Good/Bad
# proportions; with an imbalanced target an unstratified split can leave
# the test set with very few samples of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)

# Fit the tuned classifier on the training part only.
clf = RandomForestClassifier(**hyperparams)

clf.fit(X_train, y_train)

# Mean accuracy on the held-out samples.
score = clf.score(X_test, y_test)
print(round(score, 2))

In [None]:
# Draw the confusion matrix of the fitted classifier on the held-out set
# and put the rounded test accuracy in the title.
matrix_title = f'Confusion Matrix  -  accuracy: {round(score, 2)}'
disp = plot_confusion_matrix(clf, X_test, y_test, cmap=plt.cm.Blues)
disp.ax_.set_title(matrix_title)

# Observations

* The target is quite imbalanced. This may have biased the training. To handle this better, it may be preferable to change the metric to ["balanced_accuracy_score"](https://scikit-learn.org/stable/modules/model_evaluation.html#balanced-accuracy-score).