In [149]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()

loading the all the basic packages and datasets

In [122]:
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [123]:
import math
def one_nn_index(test_sample, train_data):
    
    # test_sample the sample for which we need to predict the label
    # train_data is the sample set. 
    # Note : train_data should be or shape (n,m) where n != 1 or else it causes issues, m is number of features
    
    min_dist, min_index = math.inf, 0
    num_of_samples = train_data.shape[0]
    for i in range(num_of_samples) :        
        cal = math.sqrt(sum((train_data[i] - test_sample) ** 2))
        if min_dist >= cal :
            min_dist = cal
            min_index = i
    
    return min_index 

Above function returns the index of the samples closes to test_sample. Here we use euclidean distance to find which is the nearest sample to the test_sample. We loop through the array and calculate the distance while simultaneously keep track of the minimum value and the minimum index.

In [124]:
def knn(sample, data, label):
    ind = one_nn_index(sample, data)
    return label[ind]

Above function acts as a driver (similar to knn.predict) to one_knn_index. It calls the one_knn_index function, gets the index and maps it to the corresponding label value.

In [150]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 412)

Splitting the iris dataset into default 3:1 split. We will now make predictions for the entire X_test set. 

In [126]:
y_pred = []
for i in X_test :
    y_pred.append(knn(i, X_train, y_train))

The predictions are stored in y_pred and we have the true labels in y_test. We can calculate the accuracy by using in np.mean function as shown below.

In [127]:
np.mean(y_pred == y_test)

0.9210526315789473

Above is the accuracy of the one nearest neighbour.

Below is the implementation of the conformal predictor. For each postulated label for every test sample pvalue is calculated and the value is returned. The conformity measure used is distance to nearest sample of different class divided by distance to nearest sample of same class. 

Step 1 : We first update the postulated label since the test_sample is part of the sample set

Step 2 : We use nested for loops to get the conformity scores of the entire sample set. The outer for loop loops through the sample set and the inner for loop calculates the 2 distances (different class and same class). The calculated conformity score is stored in the 'conformity_scores' array


Step 3 : We  calculate the rank for  our postulated label and further calculate the p value. Here we divide rank by number of samples because n+1 in our case is equal to number of samples.

In [128]:
import math
def calculate_pvalue(index, postulated_label, data, labels):
    
    # index is the index of the test sample in the entire sample
    # postulated label is the assumed label for the test sample
    # data is the sample set
    # label is the actual labels of the sample set
    
    updated_label = labels
    updated_label[index] = postulated_label # making the label for test sample to the postulated label
    
    
    num_of_samples = data.shape[0]
    conformity_scores = np.empty(num_of_samples)
    
    for i in range(num_of_samples) :
        
        min_dist_same, min_dist_diff = math.inf, math.inf #nearest distance to same class and difference class
        
        for j in range(num_of_samples) :
            
            if i != j : # Do not include the same sample in calculating the distances
                # calculates the nearest distance to same and different class
                cal = math.sqrt(sum((data[i] - data[j]) ** 2))
                if  updated_label[i]  == updated_label[j] and min_dist_same > cal:
                    min_dist_same = cal
                elif updated_label[i]  != updated_label[j] and min_dist_diff > cal :
                    min_dist_diff = cal
                    
        #Taking care of division problems
        if min_dist_diff == 0 and min_dist_same == 0: 
            conformity_scores[i] = 0
        elif min_dist_same == 0:
            conformity_scores[i] = math.inf
        elif min_dist_diff == 0:
            conformity_scores[i] = 0 
        else:
            conformity_scores[i] = min_dist_diff / min_dist_same     
        
    # end of both the loops, at this point we have the coformity scores of all the samples
    # we can calculate the rank of the test sample (we have it's index) followed by the p value
    
    rank = sum(conformity_scores[index] >= conformity_scores)
    p = rank / num_of_samples # test sample is included in all samples, hence n+1 in our case is equal to num_of_samples 
    return p

In [156]:
calculate_pvalue(15,0,X_test,y_test)

0.8947368421052632

In [157]:
calculate_pvalue(15, 1, X_test, y_test)

0.02631578947368421

In [158]:
calculate_pvalue(15, 2, X_test, y_test)

0.02631578947368421

In [159]:
y_test[15]

2

Below we calculate the average false p value. We calculate the p values for all samples but conditionally exclude true p values in the if condition. We store all the false p values in the array and then compute the average.

In [160]:
# Calculating the average false p value

false_pvalues = []

size = X_test.shape[0]

all_labels = list(set(y_test))
for i in range(size) :
    for j in range(len(all_labels)):
        if y_test[i] != y_test[j]:
            false_pvalues.append(calculate_pvalue(i, y_test[j], X_test, y_test))
    
average_pvalue = sum(false_pvalues) / size

In [161]:
average_pvalue

0.32340720221606645

IONOSPHERE DATASET


Below we run the code on the ionoshphere dataset.

In [173]:
X = np.genfromtxt("ionosphere.txt",delimiter=",",usecols=np.arange(34))

In [174]:
y = np.genfromtxt("ionosphere.txt", delimiter=",", usecols=34, dtype = 'int')

In [175]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 412)

In [176]:
y_pred = []
for i in X_test :
    y_pred.append(knn(i, X_train, y_train))

In [177]:
np.mean(y_pred == y_test)

0.9090909090909091

For the ionosphere we can an accuray of 90% as shown above. 

In [178]:
false_pvalues = []

size = X_test.shape[0]

all_labels = list(set(y_test))
for i in range(size) :
    for j in range(len(all_labels)):
        if y_test[i] != y_test[j]:
            false_pvalues.append(calculate_pvalue(i, y_test[j], X_test, y_test))
    
average_pvalue = sum(false_pvalues) / size

In [179]:
average_pvalue

0.2920971074380166