<a href="https://colab.research.google.com/github/samuelrudnicki/validacao_cruzada/blob/main/validacao_cruzada.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INF01017 - Aprendizado de Máquina

## Exercício sobre Validação Cruzada

### Samuel Rudnicki

In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
# Location of the raw CSV file (diabetes.csv) hosted on GitHub.
url = 'https://raw.githubusercontent.com/samuelrudnicki/cross-validation/master/diabetes.csv'

# Read the CSV into a DataFrame; later cells use `df` as the raw dataset.
df = pd.read_csv(url)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Pré-processamento: normalização

In [None]:
def normalize(dataset):
  """Min-max scale every column to [0, 1] and order the rows by class.

  Args:
    dataset: DataFrame of numeric attributes that includes an 'Outcome'
      column holding the class label.

  Returns:
    A new DataFrame in which each column is scaled as
    (value - min) / (max - min), the 'Outcome' column keeps its original
    values, and the rows are sorted by 'Outcome' in ascending order with a
    fresh 0..n-1 index. The input DataFrame is not modified.
  """
  column_min = dataset.min()
  column_range = dataset.max() - column_min
  # A constant column has a zero range; dividing by it would fill the column
  # with NaN. Using 1 as the divisor maps such columns to 0 instead.
  column_range = column_range.replace(0, 1)

  dataset_normalized = (dataset - column_min) / column_range
  # The class label must not be rescaled, so restore the original values.
  dataset_normalized['Outcome'] = dataset['Outcome']
  dataset_normalized = dataset_normalized.sort_values('Outcome').reset_index(drop = True)
  return dataset_normalized

# Normalised copy of the dataset; later cells read it through `df_normalized`.
df_normalized = normalize(df)
# Preview the first rows of the normalised data.
df_normalized.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.058824,0.452261,0.508197,0.181818,0.06974,0.374069,0.508113,0.066667,0
1,0.0,0.623116,0.459016,0.131313,0.124113,0.324888,0.159693,0.0,0
2,0.0,0.371859,0.42623,0.10101,0.042553,0.414307,0.081554,0.016667,0
3,0.0,0.487437,0.52459,0.363636,0.118203,0.548435,0.222886,0.066667,0
4,0.352941,0.773869,0.639344,0.414141,0.165485,0.687034,0.210504,0.1,0


## K-fold cross-validation



In [None]:
k = 10  # number of folds used for the cross-validation

instance_count = df_normalized['Outcome'].count()

# 'Outcome' is treated as a binary 0/1 label, so its sum is the positive count.
positive_outcome_count = df_normalized['Outcome'].sum()
negative_outcome_count = instance_count - positive_outcome_count

positive_outcome_ratio = positive_outcome_count / instance_count
negative_outcome_ratio = negative_outcome_count / instance_count

fold_size = instance_count / k

# Average (possibly fractional) number of instances of each class per fold.
# These are informational only; the fold boundaries below are computed with
# integer arithmetic so that float rounding cannot drop an instance.
positive_outcome_per_fold = positive_outcome_count / k
negative_outcome_per_fold = negative_outcome_count / k

# Select each class explicitly instead of relying on a hard-coded row offset
# for where the positive instances start.
negative_instances = df_normalized[df_normalized['Outcome'] == 0]
positive_instances = df_normalized[df_normalized['Outcome'] == 1]

# Each fold is stored in one position of the list folded_data. Fold i takes
# the i-th slice of every class, which keeps the class proportions of the
# whole dataset (stratification). The boundary i * count // k is exact, starts
# at 0 and ends at count, so every instance lands in exactly one fold.
folded_data = []
for fold_index in range(k):
  negative_start = fold_index * len(negative_instances) // k
  negative_stop = (fold_index + 1) * len(negative_instances) // k
  positive_start = fold_index * len(positive_instances) // k
  positive_stop = (fold_index + 1) * len(positive_instances) // k
  folded_data.append(pd.concat([
    negative_instances.iloc[negative_start:negative_stop],
    positive_instances.iloc[positive_start:positive_stop],
  ]))

# Sanity check: with a binary label, the folds partition the whole dataset.
assert sum(len(fold) for fold in folded_data) == instance_count

In [None]:
#for fold in folded_data:
#  print(fold['Outcome'].count(), fold['Outcome'].sum(), fold['Outcome'].sum()/fold['Outcome'].count())

75 26 0.3466666666666667
77 27 0.35064935064935066
77 27 0.35064935064935066
77 27 0.35064935064935066
77 27 0.35064935064935066
76 26 0.34210526315789475
77 27 0.35064935064935066
77 27 0.35064935064935066
77 27 0.35064935064935066
77 27 0.35064935064935066


## Algoritmo K-nearest neighbors (KNN)

In [None]:
def get_euclidean_distance(instances, test_instance):
  """Compute the Euclidean distance from every row of `instances` to `test_instance`.

  Args:
    instances: DataFrame with one instance per row and one attribute per column.
    test_instance: Series whose labels match the columns of `instances`.

  Returns:
    A float Series indexed like `instances` holding one distance per row.
    An attribute present on only one side makes the affected distances NaN,
    because the subtraction aligns on labels. An empty `instances` yields an
    empty Series.
  """
  # Label-aligned subtraction of the test instance from every row at once,
  # instead of calling a Python lambda per row.
  differences = instances.sub(test_instance, axis='columns')
  distances = np.linalg.norm(differences.to_numpy(dtype=float), axis=1)
  return pd.Series(distances, index=instances.index)

In [None]:
def knn(training_data, test_instance, k):
  """Classify `test_instance` by majority vote among its k nearest neighbours.

  Args:
    training_data: DataFrame of labelled instances with an 'Outcome' column
      (0 or 1) plus the attribute columns.
    test_instance: Series with the attributes of the instance to classify.
      An 'Outcome' entry, if present, is ignored for the distance.
    k: number of nearest neighbours that vote.

  Returns:
    1 if strictly more than half of the neighbours that voted are positive,
    otherwise 0 (so a tie is classified as 0).
  """
  features = training_data.drop('Outcome', axis=1)
  # errors='ignore' lets unlabelled instances be classified as well.
  query = test_instance.drop('Outcome', errors='ignore')
  distances = get_euclidean_distance(features, query)

  # Pick the k smallest distances by position, so the lookup stays correct
  # even if the training index contains repeated labels, and without copying
  # the whole training frame on every call.
  nearest_positions = np.argsort(distances.to_numpy(), kind='stable')[:k]
  nearest_outcomes = training_data['Outcome'].to_numpy()[nearest_positions]
  positive_outcome_count = nearest_outcomes.sum()

  # Compare against the number of neighbours actually found: when the
  # training set has fewer than k rows, k itself would bias the vote to 0.
  result = 1 if positive_outcome_count > len(nearest_outcomes) / 2 else 0

  return result


In [None]:
knn_k = 5  # value of K used by the KNN model

fold_evaluations = []

# Each iteration holds one fold out for testing and builds a confusion
# matrix for the Outcome attribute.
for i in range(len(folded_data)):
  confusion_matrix = [0, 0, 0, 0] # [TP, FP, FN, TN]

  # Join the other k-1 folds into the training set with a single concat
  # (DataFrame.append no longer exists in pandas 2.x).
  training_data = pd.concat([folded_data[j] for j in range(len(folded_data)) if j != i])

  # Fill the confusion matrix by testing the model on the held-out fold.
  for index, row in folded_data[i].iterrows():
    prediction = knn(training_data, row, knn_k)
    if row['Outcome'] == 1:
      confusion_matrix[0 if prediction == 1 else 2] += 1
    elif row['Outcome'] == 0:
      confusion_matrix[3 if prediction == 0 else 1] += 1

  true_positives, false_positives, false_negatives, true_negatives = confusion_matrix

  # Accuracy and F1-measure of the current model. Precision, recall and F1
  # are defined as 0 when their denominator is 0 (no positive predictions or
  # no positive instances in the fold) instead of raising ZeroDivisionError.
  predicted_positives = true_positives + false_positives
  actual_positives = true_positives + false_negatives
  prec = true_positives / predicted_positives if predicted_positives else 0.0
  rev = true_positives / actual_positives if actual_positives else 0.0
  iteration_evaluation = {
    'Acurácia': (true_positives + true_negatives) / len(folded_data[i]),
    'F1-measure': 2 * prec * rev / (prec + rev) if (prec + rev) else 0.0,
  }
  fold_evaluations.append(iteration_evaluation)

# Table with one row per iteration (numbered from 1) holding its accuracy and
# F1-measure, followed by the mean and standard deviation of both columns.
evaluation_table = pd.DataFrame(fold_evaluations, columns=['Acurácia', 'F1-measure'])
evaluation_table.index += 1
evaluation_table = pd.concat([evaluation_table, evaluation_table.describe().loc[['mean', 'std']]])

display(evaluation_table)





Unnamed: 0,Acurácia,F1-measure
1,0.826667,0.745098
2,0.857143,0.8
3,0.688312,0.478261
4,0.636364,0.44
5,0.792208,0.703704
6,0.723684,0.511628
7,0.727273,0.571429
8,0.805195,0.716981
9,0.675325,0.468085
10,0.675325,0.528302
