<a href="https://colab.research.google.com/github/stefanoridolfi/ML_From_scratch/blob/master/ch_1_14_LQV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
##############################################
import pandas as pd
from random import seed
from random import randrange
from csv import reader
from math import sqrt,exp

''' if no headers in csv
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,header=None)
  dataset=pdfile.values.tolist()
  return dataset
  '''
'''if header in csv
#CVS with headers ##################
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,sep=';',nrows=48)
  #pdfile=pd.read_csv(CSV_url,sep=';')
  headers=pdfile.head()
  dataset=pdfile.values.tolist()
  return dataset, list(headers)
  '''
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,header=None, nrows=400)
  #pdfile=pd.read_csv(CSV_url,header=None)
  dataset=pdfile.values.tolist()
  return dataset

##calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
  distance = 0.0
  for i in range(len(row1)-1):
    distance += (row1[i] - row2[i])**2
  return sqrt(distance)

# calculate the num_neighbors of train respect to test_row: is distance between test_row and each of row in train
# lambda è una funzione che ritorna il valore della tupla in posizione 1, tup[1], ovvero la posizione in cui ho messo la distanza, 
#perchè distances ha in posizione 0 il valore riga ed in posizione 1 il valore distance
def get_neighbors(train, test_row, num_neighbors):
  distances = list()
  neighbors=list()
  for riga in train:
    distance=euclidean_distance(test_row,riga)
    distances.append((riga,distance))
    distances.sort(key=lambda tup:tup[1])
  for k in range(0,num_neighbors):
    neighbors.append(distances[k][0])
  return neighbors

# Convert string column to integer
def str_column_to_int(dataset, column):
  class_values = [row[column] for row in dataset]
  unique = set(class_values)
  lookup = dict()
  for i, value in enumerate(unique):
    lookup[value] = i
  for row in dataset:
    row[column] = lookup[row[column]]
  return lookup

def predict_classification(train, test_row, num_neighbors):
  neighbors=get_neighbors(train, test_row, num_neighbors)
  output_values = [row[-1] for row in neighbors]
  insieme=set(output_values)
  prediction=max(insieme, key=output_values.count)
  return prediction          

# Split a dataset into a train and test set
def train_test_split(dataset, split):
  train = list()
  train_size = split * len(dataset)
  dataset_copy = list(dataset)
  while len(train) < train_size:
    index = randrange(len(dataset_copy))
    elem=dataset_copy[index]
    dataset_copy.pop(index)
    train.append(elem)
  return train, dataset_copy      

# Split a dataset into k folds
def cross_validation_split(dataset, folds):
  dataset_split=list()
  dataset_copy=list(dataset)
  #print("len dataset", len(dataset))
  fold_size=int(len(dataset)/folds)
  for i in range(folds):
    fold=[]
    while len(fold)<fold_size:
      index=randrange(len(dataset_copy))
      elem=dataset_copy.pop(index)
      fold.append(elem)
    dataset_split.append(fold)
  return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
  correct = 0
  for i in range(len(actual)):
    if actual[i] == predicted[i]:
      correct += 1
  return correct / float(len(actual)) * 100.0


# Find the min and max values for each column
def dataset_minmax(dataset):
  minmax = list()
  for i in range(len(dataset[0])):
    col_values = [row[i] for row in dataset]
    value_min = min(col_values)
    value_max = max(col_values)
    minmax.append([value_min, value_max])
  return minmax

# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
  for row in dataset:
    for i in range(len(row)):
      row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
    



#kNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
  predictions = list()
  for row in test:
    output = predict_classification(train, row, num_neighbors)
    predictions.append(output)
  return(predictions)

# Evaluate for case where Split a dataset into a train and test set
def evaluate_algorithm(dataset, train, test,num_neighbors):
  scores=list()
  actual = [row[-1] for row in test]
  predicted=k_nearest_neighbors(train, test, num_neighbors)
  accuracy=accuracy_metric(actual, predicted)
  scores.append(accuracy)
  return scores

# Evaluate for case where Split in folds
def evaluate_algorithm_folds(dataset,num_folds,num_neighbors):
  scores=list()
  folds=cross_validation_split(dataset, num_folds)
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set=list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = k_nearest_neighbors(train_set, test_set, num_neighbors)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual, predicted)
    scores.append(accuracy)
  return scores



##################

# Dataset are in https://github.com/jbrownlee/Datasets
seed(1)
CSV_url='https://raw.githubusercontent.com/jbrownlee/Datasets/master/abalone.csv'
dataset= load_csv(CSV_url)
str_column_to_int(dataset, 0)
train, test = train_test_split(dataset, 0.6)
score_split=evaluate_algorithm(dataset, train, test,5)
print("score split",score_split)

score_folds=evaluate_algorithm_folds(dataset,5,5)
print("score folds",score_folds)
mean_score_folds=sum(score_folds)/float(len(score_folds))
print("mean score folds",mean_score_folds)



score split [18.75]
score folds [16.25, 21.25, 16.25, 11.25, 16.25]
mean score folds 16.25
