In [1]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
#read training data
train_set = pd.read_csv('train.csv', header=0)
train_set

Unnamed: 0,x,y,z,class
0,8.599291,9.729418,6.432371,1
1,6.592955,0.082556,1.969544,1
2,5.596471,9.815682,0.027295,1
3,2.743639,8.783177,4.041946,0
4,4.458362,5.750222,0.099070,0
...,...,...,...,...
995,4.617314,7.700236,5.907128,0
996,5.453472,1.798360,1.992616,0
997,2.553853,8.122934,3.970146,0
998,3.210456,3.342092,7.831479,0


In [3]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       1000 non-null   float64
 1   y       1000 non-null   float64
 2   z       1000 non-null   float64
 3   class   1000 non-null   int64  
dtypes: float64(3), int64(1)
memory usage: 31.4 KB


In [4]:
#separate features from labels
features = train_set.loc[:,['x','y','z']]
labels_train = train_set.loc[:,'class']

#normalize features
features_train = StandardScaler().fit_transform(features)
print('feature shape: ',features_train.shape)
features_train #display features

feature shape:  (1000, 3)


array([[ 1.2909373 ,  1.64214887,  0.49931501],
       [ 0.58765781, -1.74022951, -1.04276611],
       [ 0.23836075,  1.6723947 , -1.71388907],
       ...,
       [-0.82816586,  1.07888402, -0.35147993],
       [-0.59800699, -0.59737263,  0.98276143],
       [ 0.70588507, -0.80765324, -0.10670655]])

In [5]:
#train 3-nearest neighbor classifier from the training dataset

threeNeighbors = KNeighborsClassifier(n_neighbors=3)
threeClassifier = threeNeighbors.fit(features_train,labels_train)

In [6]:
#import testing data
test_set = pd.read_csv('test.csv')
test_set

Unnamed: 0,ID,x,y,z,actual-class
0,1,8.074807,5.988044,3.844979,1
1,2,4.952249,5.823205,1.612045,0
2,3,4.773178,0.078757,4.209442,0
3,4,9.845919,2.055448,3.525702,1
4,5,1.612492,1.320515,8.200455,0
5,6,7.987555,9.188111,7.222228,1
6,7,0.311558,3.97468,7.897371,0
7,8,1.219113,0.266045,2.741136,0
8,9,0.63634,1.831257,6.767459,0
9,10,0.890168,8.613714,2.884227,0


In [7]:
#separate features from labels in testing set
features_test = test_set.loc[:,['x','y','z']]
features_test = StandardScaler().fit_transform(features_test)
labels_test = test_set.loc[:,'actual-class']

In [8]:
#classify classes for the testing data
labels_pred = threeClassifier.predict(features_test)
labels_pred

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0])

In [9]:
#estimate the probability for the testing data
labels_prob = threeClassifier.predict_proba(features_test)
labels_prob

array([[0.        , 1.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [10]:
#create confusion metrix and classification report
confusion_metrix_1 = confusion_matrix(labels_test,labels_pred)
confusion_metrix_1

array([[12,  2],
       [ 0,  6]])

In [11]:
print(classification_report(labels_test,labels_pred))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92        14
           1       0.75      1.00      0.86         6

    accuracy                           0.90        20
   macro avg       0.88      0.93      0.89        20
weighted avg       0.93      0.90      0.90        20



In [12]:
#Question(2): create a classifier using Euclidean distance with weighted 3-nearest neighbors(1/d^2)
kpoints = threeClassifier.kneighbors()
distance = kpoints[0]
distance #display the distance

array([[0.21388347, 0.29744062, 0.30803045],
       [0.14763209, 0.28948863, 0.3027134 ],
       [0.27955823, 0.33040975, 0.4966061 ],
       ...,
       [0.23978173, 0.2421378 , 0.28354917],
       [0.08039089, 0.26069554, 0.29814222],
       [0.13211408, 0.15813885, 0.28982672]])

In [13]:
#calculate the weight from a user-defined function
def weighted_dist(distance):
    #weight = np.zeros((distance.shape[0],1))
    weight = np.zeros(distance.shape)
    for i in range(0,len(distance)):
        d_square = np.power(distance[i][0] - distance[i][1],2) + np.power(distance[i][0] - distance[i][2],2) + np.power(distance[i][1] - distance[i][2],2)
        weight[i] = 1/d_square 
            
    return weight

In [14]:
#this section is intended to display the weight values. However, the callable function is used as the argument.
weight = weighted_dist(distance)
weight #display the weights

array([[ 62.66608884,  62.66608884,  62.66608884],
       [ 22.54873608,  22.54873608,  22.54873608],
       [ 12.9337869 ,  12.9337869 ,  12.9337869 ],
       ...,
       [275.02437082, 275.02437082, 275.02437082],
       [ 12.29593932,  12.29593932,  12.29593932],
       [ 23.31423077,  23.31423077,  23.31423077]])

In [15]:
#classifier for 3-nearest neighbors with weight (1/d^2)
threeNeighborsWeighted = KNeighborsClassifier(n_neighbors=3, weights=weighted_dist)
threeClassifierWeighted = threeNeighborsWeighted.fit(features_train,labels_train)

In [16]:
#classify classes for the testing data
labels_pred_weighted = threeClassifierWeighted.predict(features_test)
labels_pred_weighted

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0])

In [17]:
#estimate the weighted probability for the testing data
labels_prob_weighted = threeClassifierWeighted.predict_proba(features_test)
labels_prob_weighted

array([[0.        , 1.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [18]:
#create confusion metrix and classification report
confusion_metrix_2 = confusion_matrix(labels_test,labels_pred_weighted)
confusion_metrix_2

array([[12,  2],
       [ 0,  6]])

In [19]:
print(classification_report(labels_test,labels_pred_weighted))

              precision    recall  f1-score   support

           0       1.00      0.86      0.92        14
           1       0.75      1.00      0.86         6

    accuracy                           0.90        20
   macro avg       0.88      0.93      0.89        20
weighted avg       0.93      0.90      0.90        20

