In [1]:
import numpy
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
import seaborn
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

recrutamento = pd.read_csv('recruitment_data.csv', sep=',', decimal='.')


# Instanciando o objeto One-Hot encoder, pois vai ser utilizada na coluna recrutamento Strategy
ohe = OneHotEncoder(handle_unknown='ignore')
# Fazendo a transformação
ohe_df = pd.DataFrame(ohe.fit_transform(recrutamento[['RecruitmentStrategy']]).toarray(), columns=['RecruitmentStrategy_1','RecruitmentStrategy_2','RecruitmentStrategy_3'])

# Agrupando a nova coluna com o df original
df = recrutamento.join(ohe_df)

recrutamento = df.drop(['RecruitmentStrategy'], axis=1)

features = list(recrutamento.columns.values)

classificacao = 'HiringDecision'

corrKendall = recrutamento.corr('kendall')

corrPearson = recrutamento.corr('pearson')

corrSpearman = recrutamento.corr('spearman')

tabela_correlacoes = pd.DataFrame()

tabela_correlacoes['kendal']= corrKendall['HiringDecision'].sort_values(ascending=False)
tabela_correlacoes['pearson']= corrPearson['HiringDecision'].sort_values(ascending=False)
tabela_correlacoes['spearman']= corrSpearman['HiringDecision'].sort_values(ascending=False)

tabela_correlacoes.drop(tabela_correlacoes[tabela_correlacoes['kendal']==1].index, inplace=True)
#Verifica-se que a estrategia de recrutamento  =1 que eh a agressiva, vai impactar muito na decisao final de contratacao

#removendo as features que tiveram indice de correlacao menor que 0.1
#tabela_correlacoes[tabela_correlacoes['kendal']<0.1].index

#relacoes_excluidas = tabela_correlacoes[tabela_correlacoes['kendal']<0.1].index

#features = [i for i in features if i not in relacoes_excluidas]

features.remove('PreviousCompanies')
features.remove('Age')
features.remove('Gender')
features.remove('DistanceFromCompany')
#Remove o output
features.remove(classificacao)

X = recrutamento[features].to_numpy() 
Y = recrutamento[classificacao].to_numpy() 


# Normalizando as features X
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X)
X_normalizado_standard = X_scaler.transform(X)


scaler = MinMaxScaler()
scaler = scaler.fit(X)
X_normalizado_min_max = scaler.transform(X)


X_transformado= X_normalizado_standard

#Testando usando PCA
# pca = PCA(n_components=3)
# pca.fit(X)

# print(pca.explained_variance_ratio_)
# print(pca.singular_values_)

# X_transformado = pca.fit_transform(X= X_transformado)


#seaborn.scatterplot(X_normalizado_min_max) #melhor pois tem features que tem escala diferente 



Num GPUs Available:  1


In [2]:
# PARAMETROS DA REDE 
maximo_neuronios = 5
funcoes_ativacao = ['tanh', 'relu', 'sigmoid'] 
metrica = 'Accuracy' 
#quantidade de loops q vai esperar ate q o erro de teste comece a aumentar 
paciencia = 150
max_epocas =3*paciencia
min_improvement = 0.01
adam_initial_lr = 0.01
adam_lr_decay = 0.99
rollback_on_no_lower_bound_gain = True #se comecar o erro a subir, volta pra onde tava bom

# SEED que controla a aleatoriedade 
random_seed = 22
n_k_folds = 5 #quantas partes o dataset vai ser dividido

tf.random.set_seed(random_seed)
#keras.utils.set_random_seed(random_seed)
# DIVIDE EM K PEDACOS
sk_folds = StratifiedKFold(n_splits=n_k_folds)
sk_folds.get_n_splits(X_transformado, Y)

5

In [6]:
#Testando com KerasClassifier
def create_model(neurons, init_mode='uniform'):
    # create model
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(neurons, input_shape=(X_transformado.shape[1],), kernel_initializer=init_mode, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
    return model

tf.random.set_seed(random_seed)

# create model
model = KerasClassifier(model=create_model,epochs=100, batch_size=10, verbose=1)
# define the grid search parameters
init_mode = ['uniform', 'lecun_uniform', 'normal']
neurons = [1, 3, 5]
param_grid = dict(model__neurons=neurons, model__init_mode=init_mode)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_transformado, Y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [3]:
# TREINANDO
metric_lower_bound = 0.0
metric_median = 0.0
resultados = []
for n_neuron in numpy.arange(1, maximo_neuronios+1):
    ix_fold = 1
    #divisao os grupos de treino e teste, ele faz pelos indices
    for train_index, test_index in sk_folds.split(X_transformado, Y):
        # pegar o dado do teste  e treino   a partir dos indices da divisao feita pelo skfolds
        X_train, X_test = X_transformado[train_index], X_transformado[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        for funcao_ativacao in funcoes_ativacao : 
            #configurando o otimizador 
            optimizer = tf.keras.optimizers.Adam( learning_rate=adam_initial_lr, 
                                                    beta_1=adam_lr_decay ) 
            
            # Configurando a rede
            model_name = f"ADAM-N{n_neuron}F{ix_fold}_{classificacao}_{funcao_ativacao}"
            val_metric_name = f'val_{metrica}'
            
            inputs = tf.keras.Input(shape=(X_transformado.shape[1], ))        
            hidden = tf.keras.layers.Dense(n_neuron, activation=funcao_ativacao)(inputs)
            outputs = tf.keras.layers.Dense(1, activation='sigmoid')(hidden) #penas 1 classificacao = 1 saida
            rede = tf.keras.Model(inputs=inputs, outputs=outputs, name=model_name)

            #Usar o otimizador com erro medio quadratico 
            rede.compile(optimizer=optimizer, loss='mse', metrics=[metrica])
            
            #early stop com foco no erro do grupo de validacao
            es_loss = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=paciencia, restore_best_weights=True)
            #early stop com foco na acuracia dos testes
            es_metric = tf.keras.callbacks.EarlyStopping(monitor=metrica, mode='max', patience=paciencia, min_delta=min_improvement, restore_best_weights=True)


            #iniciando a rede para tentar encontrar o modelo
            resultado = rede.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=X_train.shape[0], 
                                    epochs=max_epocas, verbose=0, callbacks=[es_loss, es_metric], )

        

            # RESULTADOS DA REDE
            model_loss = resultado.history['loss'][-1]
            model_val_loss = resultado.history['val_loss'][-1]
            metric_val = resultado.history[metrica][-1]
            val_metric_val = resultado.history[val_metric_name][-1]
            n_epochs = len(resultado.history['val_loss'])

            resultado_dict = {'model_name': model_name,
                            'optimizer': optimizer,
                            'neurons': n_neuron,
                            'fold': ix_fold,
                            'loss': model_loss,
                            'val_loss': model_val_loss,
                            metrica: metric_val,
                            val_metric_name: val_metric_val,
                            'epochs': n_epochs,
                            'net': rede,
                            'history': resultado}
            resultados.append(resultado_dict)

            print(f"{model_name} > epochs: {n_epochs} loss: {model_loss} val_loss: {model_val_loss} {metrica}: {100.0*metric_val}% {val_metric_name}: {100.0*val_metric_val}%")
                

            # SE CHEGAR NO 100% para de aumentar 
            maximized_metrics = (val_metric_val == 1.0) & (metric_val == 1.0)

            if maximized_metrics:
                break

            print('')
        ix_fold += 1
        
    # GET LOWER BOUND OF WINNING ALGORITHM
    df_resultados = pd.DataFrame(resultados)


ADAM-N1F1_HiringDecision_tanh > epochs: 360 loss: 0.10653781145811081 val_loss: 0.07436573505401611 Accuracy: 86.2500011920929% val_Accuracy: 90.66666960716248%

ADAM-N1F1_HiringDecision_relu > epochs: 450 loss: 0.10493476688861847 val_loss: 0.07193761318922043 Accuracy: 87.08333373069763% val_Accuracy: 90.3333306312561%

ADAM-N1F1_HiringDecision_sigmoid > epochs: 271 loss: 0.10685708373785019 val_loss: 0.08038945496082306 Accuracy: 86.58333420753479% val_Accuracy: 89.99999761581421%

ADAM-N1F2_HiringDecision_tanh > epochs: 272 loss: 0.10407713800668716 val_loss: 0.07877912372350693 Accuracy: 86.75000071525574% val_Accuracy: 90.3333306312561%

ADAM-N1F2_HiringDecision_relu > epochs: 151 loss: 0.1427382528781891 val_loss: 0.11991934478282928 Accuracy: 68.99999976158142% val_Accuracy: 68.99999976158142%

ADAM-N1F2_HiringDecision_sigmoid > epochs: 197 loss: 0.1884968876838684 val_loss: 0.18404684960842133 Accuracy: 68.99999976158142% val_Accuracy: 68.99999976158142%

ADAM-N1F3_HiringDecis

In [4]:
resultado_resumido = pd.DataFrame()

lista_dict = []
for qtde_neuronio in numpy.arange(1, maximo_neuronios+1):
    for funcao_ativacao in funcoes_ativacao:
        filtrado = df_resultados[(df_resultados['neurons']==qtde_neuronio) & (df_resultados['model_name'].str.contains(funcao_ativacao))]
        temp_dict ={}
        temp_dict['neuronios'] = qtde_neuronio
        temp_dict['ativacao'] = funcao_ativacao
        temp_dict[f'{metrica}_media'] =  filtrado.loc[:, metrica].mean()
        stringVal = f'val_{metrica}'
        temp_dict[f'{stringVal}_media'] =  filtrado.loc[:, stringVal].mean()
        temp_dict[f'epocas_media'] =  filtrado.loc[:, 'epochs'].mean()
        temp_dict[f'val_loss_media'] =  filtrado.loc[:, 'val_loss'].mean()
        temp_dict[f'loss_media'] =  filtrado.loc[:, 'loss'].mean()
        lista_dict.append(temp_dict)
        

resultado_resumido = pd.DataFrame(lista_dict)
resultado_resumido.sort_values(by=f'{metrica}_media', ascending=False)

Unnamed: 0,neuronios,ativacao,Accuracy_media,val_Accuracy_media,epocas_media,val_loss_media,loss_media
13,5,relu,0.8855,0.878,274.8,0.098391,0.091646
12,5,tanh,0.883333,0.874667,247.6,0.098341,0.091673
9,4,tanh,0.882333,0.869333,257.4,0.102336,0.093124
10,4,relu,0.881667,0.879333,287.2,0.100213,0.092263
6,3,tanh,0.880833,0.871333,276.2,0.101993,0.09422
7,3,relu,0.88,0.865333,289.6,0.103185,0.094274
14,5,sigmoid,0.879833,0.872667,273.4,0.097922,0.094056
3,2,tanh,0.878667,0.868667,278.0,0.100804,0.095289
8,3,sigmoid,0.878667,0.872,324.6,0.097225,0.095233
0,1,tanh,0.875833,0.865333,331.2,0.101791,0.098174
