In [1]:
import pandas as pd
import numpy as np

In [2]:
#Check the categorical data
def check_categorical(data):
    """Encode the non-numeric columns of a DataFrame as integer codes.

    Every non-numeric column (object, string, category, ...) is replaced by
    the position of each value among that column's sorted unique values: the
    smallest value becomes 0, the next one 1, and so on. Numeric columns are
    copied through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified.

    Returns
    -------
    tuple of (numpy.ndarray, list of int)
        A float array with the same shape as ``data`` and the positional
        indexes of the columns that were encoded.
    """
    # Test for "not numeric" instead of ``dtype == 'object'`` so that pandas'
    # dedicated string and categorical dtypes are encoded as well rather than
    # failing in the float conversion.
    categorical_index = [
        position
        for position, dtype in enumerate(data.dtypes)
        if not pd.api.types.is_numeric_dtype(dtype)
    ]
    categorical_positions = set(categorical_index)

    # Fill a fresh float array column by column. Writing into the result of
    # ``data.to_numpy()`` is unsafe: for a frame whose columns share one dtype
    # it may be a view of the caller's data (or a read-only view).
    encoded = np.empty(data.shape, dtype=float)
    for position in range(data.shape[1]):
        column = data.iloc[:, position]
        if position in categorical_positions:
            # ``return_inverse`` gives, per row, the index of its value in the
            # sorted unique values -- exactly the code we want.
            _, codes = np.unique(column.to_numpy(dtype=object), return_inverse=True)
            encoded[:, position] = np.reshape(codes, -1)
        else:
            encoded[:, position] = column.to_numpy(dtype=float)

    return encoded, categorical_index
        

In [3]:
#Euclidean distance
def calc_distance(a,b):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Parameters
    ----------
    a, b : array-like of numbers
        Vectors of identical shape.

    Returns
    -------
    float
        ``sqrt(sum((a - b) ** 2))``; ``0.0`` for two empty vectors.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same shape. A distance between
        vectors of different dimension is undefined, so the mismatch is
        reported instead of silently ignoring the extra components.
    """
    first = np.asarray(a, dtype=float)
    second = np.asarray(b, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            'calc_distance needs vectors of equal shape, got {} and {}'.format(
                first.shape, second.shape))
    delta = first - second
    # Squaring already removes the sign, so no abs() is needed.
    return float(np.sqrt(np.sum(delta * delta)))

In [4]:
#Normalization of data
def normalize(data):
    """Standardize each column to zero mean and unit standard deviation.

    Parameters
    ----------
    data : array-like
        Numeric data; statistics are taken along axis 0.

    Returns
    -------
    Same kind of object that ``data - mean`` produces
        ``(data - mean) / std`` computed per column. A column whose standard
        deviation is zero (all values equal) is only centered, so it becomes
        all zeros instead of NaN.
    """
    center = np.mean(data,axis = 0)
    spread = np.std(data,axis = 0)
    # A constant column has std == 0; dividing by it would yield NaN (0/0)
    # and make every distance computed from the result NaN as well.
    spread = np.where(spread == 0, 1.0, spread)
    normalized_data  = (data - center)/spread
    return normalized_data

In [5]:
#Knn - Classification
def knn_classify(k, train_p,test_p):
    """Predict a binary label for one point by majority vote of its k nearest neighbors.

    Parameters
    ----------
    k : int
        Number of neighbors that vote; must be at least 1.
    train_p : 2-D array-like
        Training rows. The last column holds the label (0 or 1); all other
        columns are features.
    test_p : 1-D array-like
        The query row, laid out like a training row. Its last element is
        ignored when computing distances.

    Returns
    -------
    int
        ``1`` if strictly more of the k nearest training rows are labelled 1
        than 0, otherwise ``0`` (ties therefore resolve to 0). Neighbors with
        any other label do not vote.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A zero or negative k would slice the ranking meaninglessly
        # (``[:0]`` is empty, ``[:-n]`` drops the farthest rows).
        raise ValueError('k must be at least 1, got {}'.format(k))

    train_rows = np.asarray(train_p, dtype=float)
    query = np.asarray(test_p, dtype=float)

    # Euclidean distance from the query to every training row in one shot;
    # the last column/element is the label and is excluded.
    offsets = train_rows[:, :-1] - query[:-1]
    distances = np.sqrt(np.sum(offsets * offsets, axis=1))

    # A stable sort keeps equidistant rows in their original order, so the
    # selected neighbors are deterministic.
    nearest = np.argsort(distances, kind='stable')[:k]
    neighbor_labels = train_rows[nearest, -1]

    ones = int(np.count_nonzero(neighbor_labels == 1))
    zeroes = int(np.count_nonzero(neighbor_labels == 0))
    return 1 if ones > zeroes else 0

In [6]:
#Calculation of Confusion Matrix
def confusion_matrix(original, predicted):
    """Compute accuracy, precision, recall and F1 for binary labels.

    Label 1 is the positive class. A pair counts as a true negative whenever
    it is not one of (1, 1), (1, 0) or (0, 1).

    Parameters
    ----------
    original : sequence
        Ground-truth labels.
    predicted : sequence
        Predicted labels, indexed in step with ``original``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f_measure)`` as fractions in [0, 1].
        Any metric whose denominator is zero is reported as 0 instead of
        raising ``ZeroDivisionError`` (for example F1 when there are no
        positives at all, or every metric for empty input).
    """
    accuracy = precision = recall = f_measure = 0
    TP = FN = FP = TN = 0
    for i in range(len(original)):
        if original[i] == 1 and predicted[i] == 1:
            TP += 1
        elif original[i] == 1 and predicted[i] == 0:
            FN += 1
        elif original[i] == 0 and predicted[i] == 1:
            FP += 1
        else:
            TN += 1

    total = TP + FN + FP + TN
    if total != 0:
        accuracy = (float(TP + TN)/total)
    if(TP + FP) != 0:
        precision = (float(TP)/(TP + FP))
    if(TP + FN) != 0:
        recall = (float(TP)/(TP + FN))
    # Guarded like precision and recall: the denominator is zero when the
    # sample contains neither actual nor predicted positives.
    f_denominator = (2 * TP) + FN + FP
    if f_denominator != 0:
        f_measure = (float(2 * TP) / f_denominator)

    return accuracy, precision, recall, f_measure

In [7]:
#Main Function
def __main__(file='project3_dataset1.txt', k=None, n_folds=10):
    """Evaluate the KNN classifier with cross-validation and print the metrics.

    The file is read as tab-separated data without a header, non-numeric
    columns are encoded with ``check_categorical``, and the rows are split in
    order into ``n_folds`` consecutive folds. Each fold is used once as the
    test set; accuracy, precision, recall and F1 are averaged over the folds
    and printed as percentages.

    Parameters
    ----------
    file : str, optional
        Path of the tab-separated data file; the last column is the label.
    k : int, optional
        Number of neighbors. When omitted it is read from standard input.
    n_folds : int, optional
        Number of cross-validation folds (at least 2).

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2 or ``k`` is smaller than 1.
    """
    if n_folds < 2:
        raise ValueError('n_folds must be at least 2, got {}'.format(n_folds))

    accuracy = precision = recall = f_measure = 0

    #Load the file in a Data Frame
    data = pd.read_csv(file,sep="\t", header=None)
    data, _ = check_categorical(data)

    #Number of Folds
    folds = np.array_split(data,n_folds)

    #Number of neighbors
    if k is None:
        k = int(input('Enter the Number of Neighbors : '))
    if k < 1:
        raise ValueError('Number of neighbors must be at least 1, got {}'.format(k))

    #Looping through the folds
    evaluated_folds = 0
    for i, fold in enumerate(folds):
        test = np.asarray(fold)
        if len(test) == 0:
            # Fewer rows than folds leaves some folds empty; nothing to score.
            continue

        train = np.vstack([other for index, other in enumerate(folds) if index != i])
        train_labels = train[:,-1].reshape(-1,1)
        predicted_labels = []

        for test_point in test:
            #Normalization
            # NOTE(review): the test point is normalized together with the
            # training rows, so its features influence the mean/std, and the
            # statistics are recomputed for every test point. Kept as is to
            # leave the reported numbers unchanged; consider fitting the
            # statistics on the training rows once per fold.
            data_slice = np.vstack((train[:,0:-1],test_point[0:-1].reshape(1,-1)))
            n_data = normalize(data_slice)
            n_train = np.hstack((n_data[0:-1,:],train_labels))
            n_test = np.hstack((n_data[-1,:],test_point[-1]))

            #Classification and Prediction of the points
            predicted_labels.append(knn_classify(k, n_train,n_test))

        #Accuracy, Precision, Recall, F-1 Measure calculations
        acc, prec, rec, f_meas = confusion_matrix(test[:,-1], np.asarray(predicted_labels))
        accuracy += acc
        precision += prec
        recall += rec
        f_measure += f_meas
        evaluated_folds += 1

    # Mean over the evaluated folds expressed as a percentage. For 10 folds
    # the factor is exactly 10.0, matching the previous hard-coded scaling.
    scale = 100.0 / evaluated_folds if evaluated_folds else 0.0

    #Outputs
    print('K Nearest Neighbors Results for the file {} --------------'.format(file))
    print('Accuracy : {}'.format(str(accuracy * scale)))
    print('Precision: {}'.format(str(precision * scale)))
    print('Recall : {}'.format(str(recall * scale)))
    print('F-1 Measure: {}'.format(str(f_measure * scale)))
        

        
# Run the cross-validated evaluation when this cell executes; it prompts for
# the number of neighbors on standard input.
__main__()   

Enter the Number of Neighbors : 9
K Nearest Neighbors Results for the file project3_dataset1.txt --------------
Accuracy : 96.66353383458647
Precision: 98.54219948849105
Recall : 92.58877591865794
F-1 Measure: 95.29281749376221
