<a href="https://colab.research.google.com/github/stefanoridolfi/ML_From_scratch/blob/master/ch_1_14_LQV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
##############################################
import pandas as pd
from random import seed
from random import randrange
from csv import reader
from math import sqrt,exp


''' if no headers in csv
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,header=None)
  dataset=pdfile.values.tolist()
  return dataset
  '''
'''if header in csv
#CVS with headers ##################
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,sep=';',nrows=48)
  #pdfile=pd.read_csv(CSV_url,sep=';')
  headers=pdfile.head()
  dataset=pdfile.values.tolist()
  return dataset, list(headers)
  '''
def load_csv(filename_url,n_row):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,header=None,sep=',', nrows=n_row)
  #pdfile=pd.read_csv(CSV_url,header=None)
  dataset=pdfile.values.tolist()
  return dataset

##calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
  distance = 0.0
  for i in range(len(row1)-1):
    distance += (row1[i] - row2[i])**2
  return sqrt(distance)

 

# Convert string column to integer
def str_column_to_int(dataset, column):
  class_values = [row[column] for row in dataset]
  unique = set(class_values)
  lookup = dict()
  for i, value in enumerate(unique):
    lookup[value] = i
  for row in dataset:
    row[column] = lookup[row[column]]
  return lookup



# Split a dataset into a train and test set
def train_test_split(dataset, split):
  train = list()
  train_size = split * len(dataset)
  dataset_copy = list(dataset)
  while len(train) < train_size:
    index = randrange(len(dataset_copy))
    elem=dataset_copy[index]
    dataset_copy.pop(index)
    train.append(elem)
  return train, dataset_copy      

# Split a dataset into k folds
def cross_validation_split(dataset, folds):
  dataset_split=list()
  dataset_copy=list(dataset)
  #print("len dataset", len(dataset))
  fold_size=int(len(dataset)/folds)
  for i in range(folds):
    fold=[]
    while len(fold)<fold_size:
      index=randrange(len(dataset_copy))
      elem=dataset_copy.pop(index)
      fold.append(elem)
    dataset_split.append(fold)
  return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
  correct = 0
  for i in range(len(actual)):
    if actual[i] == predicted[i]:
      correct += 1
  return correct / float(len(actual)) * 100.0


## Create a random codebook vector : genero un numero casuale per selezionare un record, k-esemio.di questo k-esemio record prendo la colonna i
# ovvero scorro le variabili, le colonne cioè, e per ogni colonna selzione un record e prendo il valore della colonna per quel record. In questo modo costruoisco un nuovo record con i valori
# casuali delle variabili, le colonne.
# questa funzione genera un solo codebook, se la uso pià volte genero un set di codebook, che chiamo codebooks. codebook è una lista, codebbiks è una lista di liste
def random_codebook(train):
  codebook=[]
  n_records = len(train)
  n_features = len(train[0])
  for i in range(n_features):
    k=randrange(n_records)
    elem=train[k][i]
    codebook.append(elem)
  return codebook


#LQV Algorithm

#Locate the best matching unit
def get_best_matching_unit(codebooks, test_row):
  distances = list()
  for codebook in codebooks:
    dist = euclidean_distance(codebook, test_row)
    distances.append((codebook, dist))
    distances.sort(key=lambda tup: tup[1])
  return distances[0][0]

# Make a prediction with codebook vectors
def predict(codebooks, test_row):
  bmu = get_best_matching_unit(codebooks, test_row)
  return bmu[-1]

# ogni riga di train è confrontata con tutti i record della lista di liste contenuti in codebooks
# cerco il codebook la cui  distanza dal record in train è minore.  get_best_matching_unit mi restituisce il record in codebooks con minima distanza dalla
# riga corrente in train e la distanza minima, la variabile bmu contiene il coodebook migliore  
def train_codebooks(train, n_codebooks, lrate, epochs):
  codebooks = [random_codebook(train) for i in range(n_codebooks)]
  print("codebook",codebooks)
  for epoch in range(epochs):
    rate = lrate * (1.0-(epoch/float(epochs)))
    sum_error=0
    for row in train:
      bmu=get_best_matching_unit(codebooks, row)
      for i in range(len(row)-1):
        error = row[i] - bmu[i]
        sum_error += error**2
        if bmu[-1] == row[-1]:
          bmu[i] += rate * error
        else:
          bmu[i] -= rate * error
    print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, rate, sum_error))
  return codebooks

# Evaluate for case where Split a dataset into a train and test set
def evaluate_algorithm(dataset, train, codebooks):
  scores=list()
  actual = [row[-1] for row in test]
  for row in test:
    output = predict(codebooks, row)
    predictions.append(output)
    accuracy=accuracy_metric(actual, predictions)
  scores.append(accuracy)
  return scores

# Evaluate for case where Split in folds
def evaluate_algorithm_folds(dataset,num_folds,num_neighbors):
  scores=list()
  folds=cross_validation_split(dataset, num_folds)
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set=list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = k_nearest_neighbors(train_set, test_set, num_neighbors)
    actual = [row[-1] for row in fold]
    accuracy = accuracy_metric(actual, predicted)
    scores.append(accuracy)
  return scores



##################
'''dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
'''
# Dataset are in https://github.com/jbrownlee/Datasets
seed(1)
CSV_url='https://github.com/jbrownlee/Datasets/blob/master/ionosphere.csv'
dataset= load_csv(CSV_url,10)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
learn_rate = 0.3
n_epochs = 10
n_codebooks = 2
train, test= train_test_split(dataset, 0.4)
print("dataset",dataset)
print("train",train)
print("test",test)
codebooks = train_codebooks(train, n_codebooks, learn_rate, n_epochs)
print("Codebooks:",codebooks)


dataset [[8], [1], [4], [9], [5], [0], [6], [3], [2], [7]]
train [[4], [1], [6], [8]]
test [[9], [5], [0], [3], [2], [7]]
codebook [[8], [8]]
>epoch=0, lrate=0.300, error=0.000
>epoch=1, lrate=0.270, error=0.000
>epoch=2, lrate=0.240, error=0.000
>epoch=3, lrate=0.210, error=0.000
>epoch=4, lrate=0.180, error=0.000
>epoch=5, lrate=0.150, error=0.000
>epoch=6, lrate=0.120, error=0.000
>epoch=7, lrate=0.090, error=0.000
>epoch=8, lrate=0.060, error=0.000
>epoch=9, lrate=0.030, error=0.000
Codebooks: [[8], [8]]
