In [32]:
import numpy
import pandas as pd
import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
import seaborn
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder


print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

recruitment = pd.read_csv('recruitment_data.csv', sep=',', decimal='.')


# Instanciando o objeto One-Hot encoder, pois vai ser utilizada na coluna Recruitment Strategy
ohe = OneHotEncoder(handle_unknown='ignore')
# Fazendo a transformação
ohe_df = pd.DataFrame(ohe.fit_transform(recruitment[['RecruitmentStrategy']]).toarray(), columns=['RecruitmentStrategy_1','RecruitmentStrategy_2','RecruitmentStrategy_3'])

# Agrupando a nova coluna com o df original
df = recruitment.join(ohe_df)
recruitment = df.drop(['RecruitmentStrategy'], axis=1)

features = list(recruitment.columns.values)

classificacao = 'HiringDecision'

corrKendall = recruitment.corr('kendall')

corrPearson = recruitment.corr('pearson')

corrSpearman = recruitment.corr('spearman')

tabela_correlacoes = pd.DataFrame()

tabela_correlacoes['kendal']= corrKendall['HiringDecision'].sort_values(ascending=False)
tabela_correlacoes['pearson']= corrPearson['HiringDecision'].sort_values(ascending=False)
tabela_correlacoes['spearman']= corrSpearman['HiringDecision'].sort_values(ascending=False)

tabela_correlacoes.drop(tabela_correlacoes[tabela_correlacoes['kendal']==1].index, inplace=True)
#Verifica-se que a estrategia de recrutamento  =1 que eh a agressiva, vai impactar muito na decisao final de contratacao
tabela_correlacoes

Num GPUs Available:  1


Unnamed: 0,kendal,pearson,spearman
RecruitmentStrategy_1,0.57133,0.57133,0.57133
EducationLevel,0.214417,0.23671,0.230218
SkillScore,0.167029,0.203668,0.203484
PersonalityScore,0.137927,0.169177,0.168043
InterviewScore,0.123372,0.146064,0.150309
ExperienceYears,0.102202,0.122494,0.121406
PreviousCompanies,0.039412,0.044025,0.044063
Age,0.002075,0.00185,0.0025
Gender,-0.002249,-0.002249,-0.002249
DistanceFromCompany,-0.013621,-0.016791,-0.016676


In [30]:

#removendo o sexo pois de acordo com o plot de distruicao , a chance de recrutamento eh equivalente para os dois sexos
#removendo distancia da compania pq pela correlacao usando todos metodos, o resultado foi muito abaixo.
features.remove('Gender')
features.remove('DistanceFromCompany')

#Remove o output
features.remove(classificacao)

X = recruitment[features].to_numpy() 
Y = recruitment[classificacao]

# Normalizando as features X
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X)
X_normalizado_standard = X_scaler.transform(X)


scaler = MinMaxScaler()
scaler = scaler.fit(X)
X_normalizado_min_max = scaler.transform(X)
#seaborn.scatterplot(X_normalizado_min_max) #melhor pois tem features que tem escala diferente 



# PARAMETROS DA REDE 
maximo_neuronios = 5
funcao_ativacao = 'tanh' 
metrica = 'Accuracy' 
#quantidade de loops q vai esperar ate q o erro de teste comece a aumentar 
paciencia = 300
max_epocas = 5*paciencia
min_improvement = 0.01
adam_initial_lr = 0.01
adam_lr_decay = 0.99
rollback_on_no_lower_bound_gain = True #se comecar o erro a subir, volta pra onde tava bom

# SEED que controla a aleatoriedade 
random_seed = 22
n_k_folds = 5 #quantas partes o dataset vai ser dividido
keras.utils.set_random_seed(random_seed)
# DIVIDE EM K PEDACOS
sk_folds = StratifiedKFold(n_splits=n_k_folds)
sk_folds.get_n_splits(X_normalizado_min_max, Y)

5

In [31]:
# TREINANDO
metric_lower_bound = 0.0
metric_median = 0.0
resultados = []
for n_neuron in numpy.arange(1, maximo_neuronios+1):
    ix_fold = 1
    #divisao os grupos de treino e teste, ele faz pelos indices
    for train_index, test_index in sk_folds.split(X_normalizado_min_max, Y):
        # pegar o dado do teste  e treino   a partir dos indices da divisao feita pelo skfolds
        X_train, X_test = X_normalizado_min_max[train_index], X_normalizado_min_max[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        #configurando o otimizador 
        optimizer = keras.optimizers.Adam( learning_rate=adam_initial_lr, 
                                                  beta_1=adam_lr_decay ) 
        
        # Configurando a rede
        model_name = f"ADAM-N{n_neuron}F{ix_fold}_{classificacao}"
        val_metric_name = f'val_{metrica}'
        
        inputs = keras.Input(shape=(len(features), ))
        hidden = keras.layers.Dense(n_neuron, activation=funcao_ativacao)(inputs)
        outputs = keras.layers.Dense(1, activation=funcao_ativacao)(hidden) #penas 1 classificacao = 1 saida
        rede = keras.Model(inputs=inputs, outputs=outputs, name=model_name)

        #Usar o otimizador com erro medio quadratico 
        rede.compile(optimizer=optimizer, loss='mse', metrics=[metrica])
        
        #early stop com foco no erro do grupo de validacao
        es_loss = keras.callbacks.EarlyStopping(monitor='val_loss', patience=paciencia, restore_best_weights=True)
        #early stop com foco na acuracia dos testes
        es_metric = keras.callbacks.EarlyStopping(monitor=metrica, mode='max', patience=paciencia, min_delta=min_improvement, restore_best_weights=True)


        #iniciando a rede para tentar encontrar o modelo
        resultado = rede.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=X_train.shape[0], 
                                epochs=max_epocas, verbose=0, callbacks=[es_loss, es_metric])

    

        # RESULTADOS DA REDE
        model_loss = resultado.history['loss'][-1]
        model_val_loss = resultado.history['val_loss'][-1]
        metric_val = resultado.history[metrica][-1]
        val_metric_val = resultado.history[val_metric_name][-1]
        n_epochs = len(resultado.history['val_loss'])

        resultado_dict = {'model_name': model_name,
                        'optimizer': optimizer,
                        'neurons': n_neuron,
                        'fold': ix_fold,
                        'loss': model_loss,
                        'val_loss': model_val_loss,
                        metrica: metric_val,
                        val_metric_name: val_metric_val,
                        'epochs': n_epochs,
                        'net': rede,
                        'history': resultado}
        resultados.append(resultado_dict)

        print(f"{model_name} > epochs: {n_epochs} loss: {model_loss} val_loss: {model_val_loss} {metrica}: {100.0*metric_val}% {val_metric_name}: {100.0*val_metric_val}%")
              

        # SE CHEGAR NO 100% para de aumentar 
        maximized_metrics = (val_metric_val == 1.0) & (metric_val == 1.0)

        if maximized_metrics:
            break

        print('')
        ix_fold += 1
        
    # GET LOWER BOUND OF WINNING ALGORITHM
    df_resultados = pd.DataFrame(resultados)


ADAM-N1F1_RecruitmentStrategy_3 > epochs: 566 loss: 0.0017717911396175623 val_loss: 0.0016852682456374168 Accuracy: 100.0% val_Accuracy: 100.0%
ADAM-N2F1_RecruitmentStrategy_3 > epochs: 528 loss: 0.0021910679060965776 val_loss: 0.002174674067646265 Accuracy: 100.0% val_Accuracy: 100.0%
ADAM-N3F1_RecruitmentStrategy_3 > epochs: 364 loss: 0.000555908540263772 val_loss: 0.0005406267009675503 Accuracy: 100.0% val_Accuracy: 100.0%
ADAM-N4F1_RecruitmentStrategy_3 > epochs: 343 loss: 0.0007782704196870327 val_loss: 0.0008650257368572056 Accuracy: 100.0% val_Accuracy: 100.0%
ADAM-N5F1_RecruitmentStrategy_3 > epochs: 316 loss: 0.0004115051415283233 val_loss: 0.0003974048886448145 Accuracy: 100.0% val_Accuracy: 100.0%
