In [7]:
from math import sqrt
from random import randrange
def euclidean_distance(row1,row2):
  """Return the Euclidean distance between two rows, ignoring the last column.

  Only indices ``0 .. len(row1) - 2`` contribute; the final element of
  ``row1`` (the class label in this notebook's datasets) is never read.
  """
  n_features = len(row1) - 1
  squared_sum = 0.0
  for idx in range(n_features):
    delta = row1[idx] - row2[idx]
    squared_sum += delta ** 2
  return sqrt(squared_sum)

In [2]:
# Toy dataset: each record is [x1, x2, class_label].
dataset = [
  [2.78, 2.55, 0],
  [1.46, 2.36, 0],
  [3.40, 4.40, 0],
  [1.39, 1.85, 0],
  [3.06, 3.00, 0],
  [7.63, 2.56, 1],
  [5.33, 2.09, 1],
  [6.92, 1.77, 1],
  [8.67, -0.24, 1],
  [7.67, 3.51, 1],
]


In [3]:
def get_best_matching_unit(codebooks,test_row):
  """Return the codebook vector closest to ``test_row``.

  Closeness is measured with ``euclidean_distance``. The candidates are
  ranked with a stable sort, so when several codebooks are equally close
  the one that appears first in ``codebooks`` is returned.
  """
  scored = [(candidate, euclidean_distance(candidate, test_row))
            for candidate in codebooks]
  ranked = sorted(scored, key=lambda pair: pair[1])
  return ranked[0][0]



In [5]:
# Sanity check: the first record is searched for among all records, so the
# closest match is expected to be that record itself (distance 0).
test_row = dataset[0]
bmu = get_best_matching_unit(dataset, test_row)
print(bmu)

[2.78, 2.55, 0]


In [8]:
def random_codebook(train):
  """Build one codebook vector by sampling each column independently.

  For every column index a training row is drawn at random with
  ``randrange`` and that row's value for the column is copied, so the
  result is a new list with the same length as ``train[0]``.
  """
  row_count = len(train)
  width = len(train[0])
  vector = []
  for col in range(width):
    donor = train[randrange(row_count)]
    vector.append(donor[col])
  return vector

In [9]:
def train_codebooks(train,n_codebooks,lrate,epochs):
  """Train a set of LVQ codebook vectors on ``train`` and return them.

  ``n_codebooks`` vectors are initialised with ``random_codebook``. In each
  of the ``epochs`` passes, every training row pulls its best matching
  codebook towards itself when their last elements (labels) are equal and
  pushes it away otherwise. The step size decays linearly from ``lrate``
  as the epoch index grows. One progress line with the epoch's summed
  squared error is printed per epoch.
  """
  codebooks = []
  for _ in range(n_codebooks):
    codebooks.append(random_codebook(train))
  for epoch in range(epochs):
    # Linear decay: full lrate at epoch 0, approaching 0 at the last epoch.
    rate = lrate*(1.0-epoch/float(epochs))
    squared_error = 0.0
    for sample in train:
      winner = get_best_matching_unit(codebooks, sample)
      # The last element is the label, so it is excluded from the update.
      for idx in range(len(sample) - 1):
        diff = sample[idx] - winner[idx]
        squared_error += diff ** 2
        step = rate * diff
        if winner[-1] == sample[-1]:
          winner[idx] += step
        else:
          winner[idx] -= step
    print('>epoch = %d, lrate = %.3f, error=%.3f'%(epoch,rate,squared_error))
  return codebooks


In [10]:
from random import seed
# Fix the RNG so the randomly initialised codebooks are reproducible.
seed(1)

# Hyper-parameters for the toy run.
learn_rate = 0.3
n_codebooks = 2
n_epochs = 10

codebooks = train_codebooks(dataset, n_codebooks, learn_rate, n_epochs)
print(codebooks)

>epoch = 0, lrate = 0.300, error=43.038
>epoch = 1, lrate = 0.270, error=30.085
>epoch = 2, lrate = 0.240, error=26.808
>epoch = 3, lrate = 0.210, error=25.966
>epoch = 4, lrate = 0.180, error=25.209
>epoch = 5, lrate = 0.150, error=24.467
>epoch = 6, lrate = 0.120, error=23.743
>epoch = 7, lrate = 0.090, error=23.039
>epoch = 8, lrate = 0.060, error=22.355
>epoch = 9, lrate = 0.030, error=21.691
[[2.4312717245324498, 2.837957149509272, 0], [7.316940960409853, 1.9387693697496915, 1]]


In [11]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt

def load_csv(filename):
  """Load a semicolon-delimited CSV file into a list of rows.

  The first line is treated as a header and discarded, and blank lines are
  skipped. Each remaining line becomes a list of string fields.

  The file is parsed with ``delimiter=';'`` directly. Parsing with the
  default comma delimiter and then splitting only the first field on ``;``
  would silently drop everything after the first comma in a line (for
  example decimal commas such as ``1,5;2,5``).

  Args:
    filename: path of the file to read.

  Returns:
    A list of rows, each a list of strings. Empty if the file has no data
    lines (including a completely empty file).
  """
  dataset = list()
  # newline='' lets the csv module handle line endings itself, as its
  # documentation recommends for file objects passed to reader().
  with open(filename, 'r', newline='') as csv_file:
    csv_reader = reader(csv_file, delimiter=';')
    # Skip the header; the default avoids StopIteration on an empty file.
    next(csv_reader, None)
    for row in csv_reader:
      if not row:
        continue
      dataset.append(row)
  return dataset
def accuracy_metric(actual,predicted):
  """Return the percentage (0.0-100.0) of positions where both lists agree.

  Positions are taken from ``actual``; ``predicted`` is indexed with the
  same indices.
  """
  total = len(actual)
  hits = sum(1 for idx in range(total) if actual[idx] == predicted[idx])
  return hits/float(total) * 100.0
def str_column_to_int(dataset,column):
  """Replace the values of one column with integer codes, in place.

  Codes are assigned in order of first appearance in ``dataset`` (the first
  distinct value becomes 0, the next 1, and so on). Assigning codes by
  iterating a ``set`` would make the mapping depend on hash ordering, which
  for strings can change from one interpreter run to the next; first-seen
  order keeps the encoding reproducible.

  Args:
    dataset: list of rows (lists); modified in place.
    column: index of the column to encode.

  Returns:
    The dict mapping each original value to its integer code.
  """
  lookup = dict()
  # First pass: build the mapping without touching the data, so a bad row
  # raises before anything has been modified.
  for row in dataset:
    value = row[column]
    if value not in lookup:
      lookup[value] = len(lookup)
  # Second pass: rewrite the column with the assigned codes.
  for row in dataset:
    row[column] = lookup[row[column]]
  return lookup
def str_column_to_float(dataset,column):
  """Convert one column from strings to floats, in place.

  Surrounding whitespace is stripped from each value before conversion.
  """
  for record in dataset:
    text = record[column]
    record[column] = float(text.strip())
def dataset_minmax(dataset):
  """Return ``[min, max]`` for every column of ``dataset``.

  The number of columns is taken from the first row. The result is a list
  with one two-element list per column, in column order.
  """
  n_columns = len(dataset[0])
  columns = ([record[col] for record in dataset] for col in range(n_columns))
  return [[min(values), max(values)] for values in columns]

def normalize_dataset(dataset, minmax):
  """Rescale every column of ``dataset`` to the range 0-1, in place.

  Each value becomes ``(value - min) / (max - min)`` using the per-column
  ``[min, max]`` pairs in ``minmax``. Every element of each row is rescaled,
  including the last one.

  A column whose min and max are equal has no spread to scale by; its
  values are set to 0.0 instead of raising ``ZeroDivisionError``.

  Args:
    dataset: list of rows (lists of numbers); modified in place.
    minmax: one ``[min, max]`` pair per column, e.g. from ``dataset_minmax``.
  """
  for row in dataset:
    for i in range(len(row)):
      low = minmax[i][0]
      span = minmax[i][1] - low
      if span == 0:
        # Constant column: avoid dividing by zero.
        row[i] = 0.0
      else:
        row[i] = (row[i] - low) / span
def cross_validation_split(dataset,n_folds):
  """Split ``dataset`` into ``n_folds`` random folds of equal size.

  Rows are drawn without replacement from a shallow copy, so ``dataset``
  itself is left untouched. Each fold holds ``int(len(dataset)/n_folds)``
  rows; any rows left over after filling the folds are not used.
  """
  remaining = list(dataset)
  fold_size = int(len(dataset)/n_folds)
  folds = []
  for _ in range(n_folds):
    picked = [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
    folds.append(picked)
  return folds
  
def evaluate_algorithm(dataset,algorithm,n_folds,*args):
  """Score ``algorithm`` with k-fold cross-validation.

  The data is split with ``cross_validation_split``. For each fold, the
  remaining folds are concatenated into the training set and the fold's
  rows are copied with their last element replaced by ``None`` to form the
  test set. ``algorithm(train_set, test_set, *args)`` must return one
  prediction per test row; these are compared with the fold's real last
  elements using ``accuracy_metric``.

  Returns:
    A list with one accuracy percentage per fold.
  """
  folds = cross_validation_split(dataset,n_folds)
  scores = []
  for held_out in folds:
    others = list(folds)
    # list.remove compares by equality and drops the first matching fold.
    others.remove(held_out)
    train_set = [row for part in others for row in part]
    test_set = []
    for row in held_out:
      # Copy before masking so the fold keeps its real label.
      masked = list(row)
      masked[-1] = None
      test_set.append(masked)
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in held_out]
    scores.append(accuracy_metric(actual, predicted))
  return scores

In [13]:
def predict(codebooks,test_row):
  """Return the last element (the label) of the codebook nearest to ``test_row``."""
  return get_best_matching_unit(codebooks, test_row)[-1]
def learning_vector_quantization(train,test,n_codebooks,lrate,epochs):
  """Train LVQ codebooks on ``train`` and return a prediction for each ``test`` row.

  Training is delegated to ``train_codebooks``; each test row is then
  labelled with ``predict``. The result is a list in the order of ``test``.
  """
  codebooks = train_codebooks(train, n_codebooks, lrate, epochs)
  return [predict(codebooks, row) for row in test]

In [14]:
from sklearn import datasets
import numpy as np
# Load iris and append the target as an extra last column, giving rows of
# [feature values..., class label] as plain Python lists.
iris = datasets.load_iris()
data = np.concatenate(
  [iris.data, np.expand_dims(iris.target, 1)],
  axis=1,
).tolist()

In [15]:
# Fix the RNG so fold assignment and codebook initialisation are reproducible.
seed(1)

# Experiment settings.
n_folds = 5
n_codebooks = 20
learn_rate = 0.3
n_epochs = 50

# Rescale to 0-1 in place. NOTE: normalize_dataset rescales every column,
# so the label column appended to `data` is rescaled as well.
minmax = dataset_minmax(data)
normalize_dataset(data, minmax)
scores = evaluate_algorithm(
  data, learning_vector_quantization, n_folds, n_codebooks, learn_rate, n_epochs
)

print("Scores = %s"%scores)
print("Mean Accuracy : %.3f"%(sum(scores)/float(len(scores))))

>epoch = 0, lrate = 0.300, error=8.494
>epoch = 1, lrate = 0.294, error=6.094
>epoch = 2, lrate = 0.288, error=6.101
>epoch = 3, lrate = 0.282, error=6.078
>epoch = 4, lrate = 0.276, error=6.293
>epoch = 5, lrate = 0.270, error=5.806
>epoch = 6, lrate = 0.264, error=5.776
>epoch = 7, lrate = 0.258, error=5.610
>epoch = 8, lrate = 0.252, error=5.550
>epoch = 9, lrate = 0.246, error=5.557
>epoch = 10, lrate = 0.240, error=5.513
>epoch = 11, lrate = 0.234, error=5.528
>epoch = 12, lrate = 0.228, error=5.563
>epoch = 13, lrate = 0.222, error=5.390
>epoch = 14, lrate = 0.216, error=5.449
>epoch = 15, lrate = 0.210, error=5.305
>epoch = 16, lrate = 0.204, error=5.386
>epoch = 17, lrate = 0.198, error=5.271
>epoch = 18, lrate = 0.192, error=5.353
>epoch = 19, lrate = 0.186, error=5.340
>epoch = 20, lrate = 0.180, error=5.309
>epoch = 21, lrate = 0.174, error=5.188
>epoch = 22, lrate = 0.168, error=5.268
>epoch = 23, lrate = 0.162, error=5.151
>epoch = 24, lrate = 0.156, error=5.127
>epoch = 2