In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt

In [2]:
colnames = ['class', 'a1', 'a2', 'a3','a4','a5','a6','id']
dataset1 = pd.read_csv("../input_data/RobotDataset/Robot1", names=colnames, header=None,delim_whitespace=True)
dataset2 = pd.read_csv("../input_data/RobotDataset/Robot2", names=colnames, header=None,delim_whitespace=True)
frames = [dataset1,dataset2]
dataset = pd.concat(frames)

In [3]:
dataset.head()

Unnamed: 0,class,a1,a2,a3,a4,a5,a6,id
0,1,1,1,1,1,3,1,data_5
1,1,1,1,1,1,3,2,data_6
2,1,1,1,1,3,2,1,data_19
3,1,1,1,1,3,3,2,data_22
4,1,1,1,2,1,2,1,data_27


In [91]:
def safe_div(x,y):
    if y == 0:
        return 0
    return x / y

In [92]:
def euclidean(v1,v2):
    ary = spatial.distance.cdist(v1,v2, metric='minkowski')
    return ary[0,0]

In [93]:
def minkowski(v1,v2):
    ary = spatial.distance.cdist(v1,v2, metric='minkowski')
    return ary[0,0]

In [94]:
def cosine(v1,v2):
    ary = spatial.distance.cdist(v1,v2, metric='cosine')
    return ary[0,0]

In [95]:
def distances(dataset,sample,metric):
    dist = []
    l = len(dataset)
    for i in range(l):
        dist.append(metric(dataset.iloc[[i]],sample))
    return np.asarray(dist)

In [96]:
def knn(dataset,sample,y,classes,k,metric):
    dist = distances(dataset,sample,metric)
    indices = dist.argsort()[:3]
    counts = np.zeros(len(classes))
    for i in indices:
        counts[classes.index(y.iloc[i])] += 1
    return classes[np.argmax(counts)]

In [97]:
def train_validate_test_split(dataset):
    size = len(dataset)
    tsize = int(size*0.6)
    vsize = int(size*0.8)
    training_data = dataset.iloc[:tsize].reset_index(drop=True)
    validation_data = dataset.iloc[tsize:vsize].reset_index(drop=True)
    testing_data = dataset.iloc[vsize:].reset_index(drop=True)
    return training_data,validation_data,testing_data

In [98]:
def knn_algorithm(training_data,test_data,classes,k,metric):
    ttrain = training_data[['a1', 'a2', 'a3','a4','a5','a6']]
    ttest = test_data[['a1', 'a2', 'a3','a4','a5','a6']]
    y = training_data['class']
    pred = []
    for i in range(len(ttest)):
        pred.append(knn(ttrain,ttest.iloc[[i]],y,classes,k,metric))
    return pred

In [99]:
def confusionmatrix(preds,y,classes):
    n = len(preds)
    noc = len(classes)
    matrix = np.zeros((noc,noc))
    for i in range(n):
        r = classes.index(preds[i])
        c = classes.index(y[i])
        matrix[r][c] += 1
    return matrix

In [100]:
def stats2(confusionmatrix,classes): 
    n = len(classes)
    precision = np.zeros(n)
    recall = np.zeros(n)
    f1 = np.zeros(n)
    colsums = confusionmatrix.sum(axis=0)
    rowsums = confusionmatrix.sum(axis=1)
    dval = 0
    for i in range(n):
        precision[i] = confusionmatrix[i,i]/colsums[i]
        recall[i] = confusionmatrix[i,i]/rowsums[i]
        f1[i] = safe_div(2,(safe_div(1,precision[i]))+safe_div(1,recall[i]))
        dval += confusionmatrix[i,i]
    return dval/np.sum(confusionmatrix)

In [101]:
def stats(confusionmatrix,classes): 
    n = len(classes)
    precision = np.zeros(n)
    recall = np.zeros(n)
    f1 = np.zeros(n)
    colsums = confusionmatrix.sum(axis=0)
    rowsums = confusionmatrix.sum(axis=1)
    dval = 0
    for i in range(n):
        precision[i] = confusionmatrix[i,i]/colsums[i]
        recall[i] = confusionmatrix[i,i]/rowsums[i]
        f1[i] = safe_div(2,(safe_div(1,precision[i]))+safe_div(1,recall[i]))
        dval += confusionmatrix[i,i]
    for i in range(n):
        print("Recall of class",classes[i],":",recall[i])
        print("Precision of class",classes[i],":",precision[i])
        print("F1 Score of class",classes[i],":",f1[i])
        print('\n')
    print("Accuracy:",dval/np.sum(confusionmatrix))
    print("Classification error:",1-(dval/np.sum(confusionmatrix)))
    print("Overall Precision:",np.mean(precision))
    print("Overall Recall:",np.average(recall))
    print("Overall F1 Score:",np.mean(f1))

In [102]:
def sklearnstats(data):
    datanames = ['training data','validation data','testing data']
    from sklearn.metrics import confusion_matrix
    from sklearn.neighbors import KNeighborsClassifier
    neigh = KNeighborsClassifier(n_neighbors=3)
    X = data[0].drop('class',axis=1)
    y = data[0]['class']
    neigh.fit(X, y)
    for i in range(len(data)):
        print("Accuracy on:",datanames[i])
        preds = neigh.predict(data[i].drop('class',axis=1))
        con_mat = confusion_matrix(data[i]['class'], preds)
        stats(con_mat,data[i]['class'].unique())
        print("\n\n\n")

In [103]:
print("Robot Dataset Statistics")

Robot Dataset Statistics


In [104]:
training_data,validation_data,testing_data = train_validate_test_split(dataset.drop('id',axis=1))
classes = list(training_data['class'].unique())
preds = knn_algorithm(training_data,training_data,classes,3,euclidean)
cm = confusionmatrix(preds,list(training_data['class']),list(training_data['class'].unique()))
print("Training Data Stats")
stats(cm,training_data['class'].unique())

Training Data Stats
Recall of class 1 : 0.8795180722891566
Precision of class 1 : 0.9358974358974359
F1 Score of class 1 : 0.9068322981366458


Recall of class 0 : 0.921875
Precision of class 0 : 0.855072463768116
F1 Score of class 0 : 0.8872180451127819


Accuracy: 0.8979591836734694
Classification error: 0.10204081632653061
Overall Precision: 0.8954849498327759
Overall Recall: 0.9006965361445782
Overall F1 Score: 0.8970251716247138


In [105]:
preds = knn_algorithm(training_data,validation_data,classes,3,euclidean)
cm = confusionmatrix(preds,list(validation_data['class']),list(training_data['class'].unique()))
print("Validation Data Stats")
stats(cm,training_data['class'].unique())

Validation Data Stats
Recall of class 1 : 0.6956521739130435
Precision of class 1 : 0.64
F1 Score of class 1 : 0.6666666666666666


Recall of class 0 : 0.6538461538461539
Precision of class 0 : 0.7083333333333334
F1 Score of class 0 : 0.68


Accuracy: 0.673469387755102
Classification error: 0.326530612244898
Overall Precision: 0.6741666666666667
Overall Recall: 0.6747491638795986
Overall F1 Score: 0.6733333333333333


In [106]:
preds = knn_algorithm(training_data,testing_data,classes,3,euclidean)
cm = confusionmatrix(preds,list(testing_data['class']),list(training_data['class'].unique()))
print("Testing Data Stats")
stats(cm,training_data['class'].unique())

Testing Data Stats
Recall of class 1 : 0.3939393939393939
Precision of class 1 : 0.6842105263157895
F1 Score of class 1 : 0.5


Recall of class 0 : 0.6470588235294118
Precision of class 0 : 0.3548387096774194
F1 Score of class 0 : 0.45833333333333337


Accuracy: 0.48
Classification error: 0.52
Overall Precision: 0.5195246179966044
Overall Recall: 0.5204991087344029
Overall F1 Score: 0.4791666666666667


In [107]:
# SkLearn library results

In [108]:
sklearnstats([training_data,validation_data,testing_data])

Accuracy on: training data
Recall of class 1 : 0.8840579710144928
Precision of class 1 : 0.9384615384615385
F1 Score of class 1 : 0.9104477611940298


Recall of class 0 : 0.9487179487179487
Precision of class 0 : 0.9024390243902439
F1 Score of class 0 : 0.9249999999999999


Accuracy: 0.9183673469387755
Classification error: 0.08163265306122447
Overall Precision: 0.9204502814258912
Overall Recall: 0.9163879598662208
Overall F1 Score: 0.9177238805970149




Accuracy on: validation data
Recall of class 1 : 0.8333333333333334
Precision of class 1 : 0.6896551724137931
F1 Score of class 1 : 0.7547169811320755


Recall of class 0 : 0.64
Precision of class 0 : 0.8
F1 Score of class 0 : 0.7111111111111111


Accuracy: 0.7346938775510204
Classification error: 0.26530612244897955
Overall Precision: 0.7448275862068966
Overall Recall: 0.7366666666666667
Overall F1 Score: 0.7329140461215933




Accuracy on: testing data
Recall of class 1 : 0.45161290322580644
Precision of class 1 : 0.7368421052631579

In [59]:
print("Iris Dataset Results")

Iris Dataset Results


In [60]:
colnames = ['class', 'a1', 'a2', 'a3','a4','a5','a6','id']
dataset1 = pd.read_csv("../input_data/RobotDataset/Robot1", names=colnames, header=None,delim_whitespace=True)
dataset2 = pd.read_csv("../input_data/RobotDataset/Robot2", names=colnames, header=None,delim_whitespace=True)
frames = [dataset1,dataset2]
dataset = pd.concat(frames)

In [61]:
dataset.head()

Unnamed: 0,class,a1,a2,a3,a4,a5,a6,id
0,1,1,1,1,1,3,1,data_5
1,1,1,1,1,1,3,2,data_6
2,1,1,1,1,3,2,1,data_19
3,1,1,1,1,3,3,2,data_22
4,1,1,1,2,1,2,1,data_27


In [62]:
training_data,validation_data,testing_data = train_validate_test_split(dataset.drop('id',axis=1))
classes = list(training_data['class'].unique())
preds = knn_algorithm(training_data,training_data,classes,3,euclidean)
cm = confusionmatrix(preds,list(training_data['class']),list(training_data['class'].unique()))
print("Training Data Stats")
stats(cm,training_data['class'].unique())

Training Data Stats
Recall of class 1 : 0.8795180722891566
Precision of class 1 : 0.9358974358974359
F1 Score of class 1 : 0.9068322981366458


Recall of class 0 : 0.921875
Precision of class 0 : 0.855072463768116
F1 Score of class 0 : 0.8872180451127819


Accuracy: 0.8979591836734694
Classification error: 0.10204081632653061
Overall Precision: 0.8954849498327759
Overall Recall: 0.9006965361445782
Overall F1 Score: 0.8970251716247138


In [63]:
preds = knn_algorithm(training_data,validation_data,classes,3,euclidean)
cm = confusionmatrix(preds,list(validation_data['class']),list(training_data['class'].unique()))
print("Validation Data Stats")
stats(cm,training_data['class'].unique())

Validation Data Stats
Recall of class 1 : 0.6956521739130435
Precision of class 1 : 0.64
F1 Score of class 1 : 0.6666666666666666


Recall of class 0 : 0.6538461538461539
Precision of class 0 : 0.7083333333333334
F1 Score of class 0 : 0.68


Accuracy: 0.673469387755102
Classification error: 0.326530612244898
Overall Precision: 0.6741666666666667
Overall Recall: 0.6747491638795986
Overall F1 Score: 0.6733333333333333


In [64]:
preds = knn_algorithm(training_data,testing_data,classes,3,euclidean)
cm = confusionmatrix(preds,list(testing_data['class']),list(training_data['class'].unique()))
print("Testing Data Stats")
stats(cm,training_data['class'].unique())

Testing Data Stats
Recall of class 1 : 0.3939393939393939
Precision of class 1 : 0.6842105263157895
F1 Score of class 1 : 0.5


Recall of class 0 : 0.6470588235294118
Precision of class 0 : 0.3548387096774194
F1 Score of class 0 : 0.45833333333333337


Accuracy: 0.48
Classification error: 0.52
Overall Precision: 0.5195246179966044
Overall Recall: 0.5204991087344029
Overall F1 Score: 0.4791666666666667


In [65]:
# sklearn library results

In [66]:
sklearnstats([training_data,validation_data,testing_data])

Accuracy on: training data
Recall of class 1 : 0.8840579710144928
Precision of class 1 : 0.9384615384615385
F1 Score of class 1 : 0.9104477611940298


Recall of class 0 : 0.9487179487179487
Precision of class 0 : 0.9024390243902439
F1 Score of class 0 : 0.9249999999999999


Accuracy: 0.9183673469387755
Classification error: 0.08163265306122447
Overall Precision: 0.9204502814258912
Overall Recall: 0.9163879598662208
Overall F1 Score: 0.9177238805970149




Accuracy on: validation data
Recall of class 1 : 0.8333333333333334
Precision of class 1 : 0.6896551724137931
F1 Score of class 1 : 0.7547169811320755


Recall of class 0 : 0.64
Precision of class 0 : 0.8
F1 Score of class 0 : 0.7111111111111111


Accuracy: 0.7346938775510204
Classification error: 0.26530612244897955
Overall Precision: 0.7448275862068966
Overall Recall: 0.7366666666666667
Overall F1 Score: 0.7329140461215933




Accuracy on: testing data
Recall of class 1 : 0.45161290322580644
Precision of class 1 : 0.7368421052631579

In [None]:
# Here noth the dataset Robot and Iris perform lower than scikit learn library, but both the model performs 
# lower than 55% which means KNN is not suited for these datasets.
# # Results can be imroved if the dataset is used with other methods. 