In [11]:
import csv
import math
import operator
import random
import pandas as pd

pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

class Ecoli:
    def __init__(self, mcg, gvh, lip, chg, aac, alm1, alm2, classes):
        self.mcg = mcg
        self.gvh = gvh
        self.lip = lip
        self.chg = chg
        self.aac = aac
        self.alm1 = alm1
        self.alm2 = alm2
        self.classes = classes
        self.validity = 0
        self.weight = 0

    def __str__(self):
        return f"mcg: {self.mcg} gvh: {self.gvh} lip: {self.lip} chg: {self.chg} aac: {self.aac} alm1: {self.alm1} alm2: {self.alm2} validity: {self.validity} weight: {self.weight} classes: {self.classes}"

    def get_features(self):
        return [self.mcg, self.gvh, self.lip, self.chg, self.aac, self.alm1, self.alm2]
    
    def get_classes(self):
        return self.classes
    
    def get_validity(self):
        return self.validity
    
    def get_weight(self):
        return self.weight

    @classmethod
    def get_features_index(cls):
        return ["mcg", "gvh", "lip", "chg", "aac", "alm1", "alm2", "classes"]


class Utils:
    @classmethod
    def display_prediction_dataframe(cls, test_set, prediction):
        dataframe = []

        for i in range(len(test_set)):
            dataframe.append(pd.Series([*test_set[i].get_features(), test_set[i].get_classes(), prediction[i], test_set[i].get_classes() == prediction[i]],
                                       index=[*Ecoli.get_features_index(), 'prediction', 'result']))

        return pd.DataFrame(dataframe)
    
    @classmethod
    def display_dataframe(cls, dataset):
        dataframe = []

        for data in dataset:
            dataframe.append(pd.Series([*data.get_features(), data.get_classes()],
                                       index=[*Ecoli.get_features_index()]))

        return pd.DataFrame(dataframe)

    @classmethod
    def load_dataset(cls, filename):
        parsed_dataset = []

        with open(filename, 'r') as csv_file:
            dataset = list(csv.reader(csv_file))
            
            for i in range(len(dataset)):
                data = list(filter(None,dataset[i][0].split(' ')))
                mcg = float(data[1])
                gvh = float(data[2])
                lip = float(data[3])
                chg = float(data[4])
                aac = float(data[5])
                alm1 = float(data[6])
                alm2 = float(data[7])
                classes = data[8]

                parsed_dataset.append(Ecoli(
                    mcg,
                    gvh,
                    lip,
                    chg,
                    aac,
                    alm1,
                    alm2,
                    classes,
                ))
        return parsed_dataset

    @classmethod
    def accuracy_score(cls, test_set, list_predictions):
        correct = 0
        for i in range(len(test_set)):
            if test_set[i].get_classes() == list_predictions[i]:
                correct += 1
        print(f"akurasi: {(correct/float(len(test_set))) * 100.0}%")

    @classmethod
    def split_dataset(cls, dataset, test_size_percentage):
        shuffled_dataset = random.sample(dataset, len(dataset))
        test_set = []

        divider = int(len(shuffled_dataset)*test_size_percentage)
        for _ in range(divider):
            test_set.append(shuffled_dataset.pop())

        return shuffled_dataset, test_set


class KNN:
    def __init__(self, train_set, k):
        self.train_set = list(train_set)
        self.k = k

    def get_neighbors(self, test_instance):
        neighbors = []
        distances = []

        for i in range(len(self.train_set)):
            current_distance = self.euclidean_distance(
                test_instance,
                self.train_set[i],
                len(test_instance.get_features())
            )
            distances.append((self.train_set[i], current_distance))

        distances.sort(key=operator.itemgetter(1))

        for i in range(self.k):
            neighbors.append(distances[i][0])

        return neighbors

    def euclidean_distance(self, test_instance, train_instance, features_count):
        distance = 0

        for i in range(features_count):
            distance += pow((test_instance.get_features()
                             [i] - train_instance.get_features()[i]), 2)
        return math.sqrt(distance)

    def predict(self, test_instance):
        classVotes = {}

        neighbors = self.get_neighbors(test_instance)

        for i in range(len(neighbors)):
            response = neighbors[i].get_classes()

            if response in classVotes:
                classVotes[response] += 1
            else:
                classVotes[response] = 1

        sortedVotes = sorted(classVotes.items(),
                             key=operator.itemgetter(1), reverse=True)
        return sortedVotes[0][0]
    
class MKNN():
    def __init__(self, train_set, k):
        self.train_set = list(train_set)
        self.k = k
    
    def euclidean_distance(self, test_instance, train_instance, features_count):
        distance = 0

        for i in range(features_count):
            distance += pow((test_instance.get_features()[i] - train_instance.get_features()[i]), 2)
        return math.sqrt(distance)
    
    def calculate_validity(self):
        for i in range(len(train_set)):
            neighbors = []
            distances = []
            copy_train_set = list(self.train_set)
                                    
            for j in range(len(copy_train_set)):
                current_distance = self.euclidean_distance(
                    self.train_set[i],
                    copy_train_set[j],
                    len(test_instance.get_features())
                )
                distances.append((copy_train_set[j], current_distance))

            distances.sort(key=operator.itemgetter(1))
            
            for m in range(self.k):
                neighbors.append(distances[m][0])
            
            class_votes = {}

            for n in range(len(neighbors)):
                response = neighbors[n].get_classes()

                if response in class_votes:
                    class_votes[response] += 1
                else:
                    class_votes[response] = 1
            
            sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
            self.train_set[i].validity = sorted_votes[0][1]/self.k
            
        return self.train_set
            
    def calculate_weight(self, test_instance):
        distances = []

        for i in range(len(self.train_set)):
            current_distance = self.euclidean_distance(
                test_instance,
                self.train_set[i],
                len(test_instance.get_features())
            )
            distances.append((self.train_set[i], current_distance))

        for j in range(len(self.train_set)):
            self.train_set[j].weight = self.train_set[j].validity * (1/(distances[j][1] + 0.5))

        return self.train_set
    
    def predict(self, test_instance):
        
        self.calculate_validity()
        datas = self.calculate_weight(test_instance)
                
        sorted_result = sorted(self.train_set, key=operator.attrgetter('weight'), reverse=True)
        return sorted_result[0].get_classes()

            
            
        
    

In [12]:
dataset = Utils.load_dataset('ecoli.data')
print(f"Total dataset:{len(dataset)}")

train_set, test_set = Utils.split_dataset(dataset, 0.3)
Utils.display_dataframe(dataset)

Total dataset:336


Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,classes
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
5,0.67,0.39,0.48,0.5,0.36,0.38,0.46,cp
6,0.29,0.28,0.48,0.5,0.44,0.23,0.34,cp
7,0.21,0.34,0.48,0.5,0.51,0.28,0.39,cp
8,0.2,0.44,0.48,0.5,0.46,0.51,0.57,cp
9,0.42,0.4,0.48,0.5,0.56,0.18,0.3,cp


In [13]:
print(f"Train set:{len(train_set)}")
Utils.display_dataframe(train_set)

Train set:236


Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,classes
0,0.56,0.54,0.48,0.5,0.43,0.37,0.3,pp
1,0.64,0.58,0.48,0.5,0.48,0.78,0.73,im
2,0.36,0.39,0.48,0.5,0.48,0.22,0.23,cp
3,0.06,0.61,0.48,0.5,0.49,0.92,0.37,im
4,0.73,0.78,0.48,0.5,0.58,0.51,0.31,pp
5,0.64,0.45,0.48,0.5,0.67,0.61,0.66,imU
6,0.42,0.24,0.48,0.5,0.57,0.27,0.37,cp
7,0.34,0.49,0.48,0.5,0.58,0.85,0.8,im
8,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
9,0.38,0.46,0.48,0.5,0.48,0.22,0.29,cp


In [14]:
print(f"Test set:{len(test_set)}")
Utils.display_dataframe(test_set)

Test set:100


Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,classes
0,0.66,0.74,0.48,0.5,0.31,0.38,0.43,pp
1,0.48,0.45,0.48,0.5,0.6,0.78,0.8,im
2,0.63,0.47,0.48,0.5,0.51,0.82,0.84,im
3,0.37,0.5,0.48,0.5,0.42,0.36,0.45,cp
4,0.23,0.33,0.48,0.5,0.43,0.33,0.43,cp
5,0.64,0.78,0.48,0.5,0.5,0.36,0.38,pp
6,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
7,0.29,0.39,0.48,0.5,0.52,0.4,0.48,pp
8,0.69,0.8,0.48,0.5,0.46,0.57,0.26,pp
9,0.7,0.61,0.48,0.5,0.56,0.52,0.43,pp


In [15]:
knn = KNN(train_set, 5)
list_prediction = []
for test_instance in test_set:
    prediction = knn.predict(test_instance)
    list_prediction.append(prediction)
    
Utils.accuracy_score(test_set, list_prediction)
Utils.display_prediction_dataframe(test_set, list_prediction)

akurasi: 83.0%


Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,classes,prediction,result
0,0.66,0.74,0.48,0.5,0.31,0.38,0.43,pp,pp,True
1,0.48,0.45,0.48,0.5,0.6,0.78,0.8,im,im,True
2,0.63,0.47,0.48,0.5,0.51,0.82,0.84,im,im,True
3,0.37,0.5,0.48,0.5,0.42,0.36,0.45,cp,cp,True
4,0.23,0.33,0.48,0.5,0.43,0.33,0.43,cp,cp,True
5,0.64,0.78,0.48,0.5,0.5,0.36,0.38,pp,pp,True
6,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp,cp,True
7,0.29,0.39,0.48,0.5,0.52,0.4,0.48,pp,cp,False
8,0.69,0.8,0.48,0.5,0.46,0.57,0.26,pp,pp,True
9,0.7,0.61,0.48,0.5,0.56,0.52,0.43,pp,pp,True


In [19]:
mknn = MKNN(train_set, 50)

list_prediction = []
for test_instance in test_set:
    prediction = mknn.predict(test_instance)
    list_prediction.append(prediction)
    
Utils.accuracy_score(test_set, list_prediction)
Utils.display_prediction_dataframe(test_set, list_prediction)

akurasi: 68.0%


Unnamed: 0,mcg,gvh,lip,chg,aac,alm1,alm2,classes,prediction,result
0,0.66,0.74,0.48,0.5,0.31,0.38,0.43,pp,cp,False
1,0.48,0.45,0.48,0.5,0.6,0.78,0.8,im,imU,False
2,0.63,0.47,0.48,0.5,0.51,0.82,0.84,im,im,True
3,0.37,0.5,0.48,0.5,0.42,0.36,0.45,cp,cp,True
4,0.23,0.33,0.48,0.5,0.43,0.33,0.43,cp,cp,True
5,0.64,0.78,0.48,0.5,0.5,0.36,0.38,pp,cp,False
6,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp,cp,True
7,0.29,0.39,0.48,0.5,0.52,0.4,0.48,pp,cp,False
8,0.69,0.8,0.48,0.5,0.46,0.57,0.26,pp,pp,True
9,0.7,0.61,0.48,0.5,0.56,0.52,0.43,pp,cp,False
