In [173]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn import preprocessing
import math
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier

In [115]:
data = arff.loadarff('./Data/cm1.arff.txt')
df = pd.DataFrame(data[0])
df['defects'] = df['defects'].apply(lambda x: str(x)[1:]) #removing 'b' from classes
numberOfColumns = len(df.columns) #qtd of columns with defects column
justClass = df[['defects']].values #only the classes values
justData = df.drop(['defects'], axis=1) #removing the classes column for normalization
valuesOfColums = justData.columns.values #get attributes names
justDataValues = justData.values #get data as numpy array

In [116]:
min_max_scaler = preprocessing.MinMaxScaler()
normalized_data = min_max_scaler.fit_transform(justDataValues)
df = pd.DataFrame(normalized_data)
df.columns = valuesOfColums
df.insert(loc=numberOfColumns-1, column='defects', value=justClass)

In [117]:
def euclidian_distance(x, y):
    distance = 0
    for i in range(len(x)-1):
        distance += np.square(x[i] - y[i])
    return np.sqrt(distance)

def calc_dists(propotypes, testInstance):
    d = []
    for p in propotypes:
        d.append([p, euclidean_distance(testInstance, p)])
    distance.sort(key=lambda dist: dist[1])
    return d

In [126]:
def lvq1(trainSet, prototypes, lrate, epochs):
    for epoch in range(epochs):
        rate = lrate * (1.0 - (epoch / float(epochs)))
        sum_error = 0
        for example in trainSet:
            neighbor = calc_dists(prototypes, example)[0][0]
            error = np.subtract(example[:-1], neighbor[:-1])
            sum_error += np.sum(np.square(error))
            if(example[-1] == neighbor[-1]):
                neighbor[:-1] = np.add(neighbor[:-1], np.dot(error, rate))
            else:
                neighbor[:-1] = np.subtract(neighbor[:-1], np.dot(error, rate))
    return prototypes

In [119]:
def window(neighbor1, neighbor2, instance, w):
    di = euclidian_distance(neighbor1, instance)
    dj = euclidian_distance(neighbor2, instance)
    if di != 0 and dj != 0:
        mini = min(di/dj, dj/di)
    else:
        mini = 0
    s = ((1-w)/(1+w))
    return (mini > s)

In [127]:
def lvq2_1(trainSet, prototypes, lrate, epochs):
    prots = lvq1(trainSet, lrate, prototypes, epochs)
    for epoch in range(epochs):
        rate = lrate * (1.0 - (epoch / float(epochs)))
        sum_error = 0
        for example in trainSet:
            neighbors = calc_dists(prototypes, example)
            ei, ej = neighbors[0][0], neighbors[1][0]
            if window(ei, ej, example, 0.3):
                if(ei[-1] != ej[-1]):
                    if(ei[-1] == example[-1]):
                        error_i = np.subtract(example[:-1], ei[:-1])
                        error_j = np.subtract(example[:-1], ej[:-1])
                        sum_error += np.sum(np.square(error_i))
                        ei[:-1] = np.add(ei[:-1], np.dot(error_i, rate))
                        ej[:-1] = np.subtract(ej[:-1], np.dot(error_j, rate))
                    elif(ej[-1] == example[-1]):
                        error_i = np.subtract(example[:-1], ei[:-1])
                        error_j = np.subtract(example[:-1], ej[:-1])
                        sum_error += np.sum(np.square(error_j))
                        ej[:-1] = np.add(ej[:-1], np.dot(error_j, rate))
                        ei[:-1] = np.subtract(ei[:-1], np.dot(error_i, rate))
    return prototypes

In [128]:
def lvq3(trainSet, prototypes, lrate, epochs):
    prots = lvq1(trainSet, lrate, prototypes, epochs)
    for epoch in range(epochs):
        rate = lrate * (1.0 - (epoch / float(epochs)))
        sum_error = 0
        for example in trainSet:
            neighbors = calc_dists(prototypes, example)
            ei, ej = neighbors[0][0], neighbors[1][0]
            if window(ei, ej, example, 0.3):
                if(ei[-1] != ej[-1]):
                    if(ei[-1] == example[-1]):
                        error_i = np.subtract(example[:-1], ei[:-1])
                        error_j = np.subtract(example[:-1], ej[:-1])
                        sum_error += np.sum(np.square(error_i))
                        ei[:-1] = np.add(ei[:-1], np.dot(error_i, rate))
                        ej[:-1] = np.subtract(ej[:-1], np.dot(error_j, rate))
                    elif(ej[-1] == example[-1]):
                        error_i = np.subtract(example[:-1], ei[:-1])
                        error_j = np.subtract(example[:-1], ej[:-1])
                        sum_error += np.sum(np.square(error_j))
                        ej[:-1] = np.add(ej[:-1], np.dot(error_j, rate))
                        ei[:-1] = np.subtract(ei[:-1], np.dot(error_i, rate))
                elif (ei[-1] == ej[-1]) and (ei[-1] == example[-1]):
                    error_i = np.subtract(example[:-1], ei[:-1])
                    error_j = np.subtract(example[:-1], ej[:-1])
                    sum_error += np.sum(np.square(np.add(error_i, error_j)))
                    ei[:-1] = np.add(ei[:-1], np.dot(error_i, (rate * 0.3)))
                    ej[:-1] = np.add(ej[:-1], np.dot(error_j, (rate * 0.3)))
    return prototypes

In [187]:
def splitData(train, test):
    trainSet = []
    resultTrainSet = []
    testSet = []
    resultTestSet = []

    for item in train:
        trainSet.append(item[:-1])
        resultTrainSet.append(item[-1])
    for item in test:
        testSet.append(item[:-1])
        resultTestSet.append(item[-1])
    return trainSet,resultTrainSet,testSet,resultTestSet

In [193]:
#CONSTANTS
NUMBER_OF_PROTOTYPES = 20 #number of prototypes
EPOCHS = 10 #number of epochs training
kfoldNumber = 5
LVQS = [lvq1, lvq2_1, lvq3]

In [192]:
dataset = np.array(df.values)
Y = df.iloc[:, -1]
numberOfClasses = len(set(Y))
classesValues = list(set(Y))
count = 1
for lvq in LVQS:
    print("LVQ ", str(count))
    skf = StratifiedKFold(n_splits=kfoldNumber, shuffle=True, random_state=1)
    totalAccuracy = []
    for train_index, test_index in skf.split(dataset, Y):
        X_train, X_test = dataset[train_index], dataset[test_index]
        #prototypeSet = []
        start = time.time()
        LVQ_Prototypes = lvq(X_train, prototypeSet, 0.3, EPOCHS)
        final = time.time() - start
        print("Training time = %.2f" % (final))
        train_data, train_classes, test_data, test_classes = splitData(LVQ_Prototypes, X_test)
        for k in [1, 3]:
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(train_data, train_classes)
            predictions = knn.predict(test_data)
            acc = np.sum([1 for i, x in enumerate(predictions) if x == test_classes[i]])
            totalAccuracy.append([k, (acc / len(predictions))])
    count += 1
    print('KNN = 1, Accuracy = %.2f' % (sum([x[1] for x in totalAccuracy if x[0] == 1]) / kfoldNumber))
    print('KNN = 3, Accuracy = %.2f' % (sum([x[1] for x in totalAccuracy if x[0] == 3]) / kfoldNumber))

LVQ  1


NameError: name 'prototypeSet' is not defined