In [72]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score

In [9]:
def splitdf(df):
    """Separate a DataFrame into a feature matrix and a target vector.

    The last column of ``df`` is taken as the target; every column before it
    is treated as a feature.

    Returns:
        A ``(x, y)`` tuple of NumPy arrays, where ``x`` holds the values of
        all columns but the last and ``y`` holds the values of the last one.
    """
    columns = df.columns
    target_col = columns[-1]
    feature_cols = columns[:-1]
    feature_values = df[feature_cols].to_numpy()
    target_values = df[target_col].to_numpy()
    return feature_values, target_values

In [10]:
def euclid_dist(x,y):
    """Return the Euclidean (L2) distance between array-likes ``x`` and ``y``."""
    delta = np.array(x) - np.array(y)
    squared = delta ** 2
    return np.sqrt(np.sum(squared))

In [67]:
def KNN(k,x,y,xnew):
    """Return the labels of the ``k`` training points nearest to ``xnew``.

    Args:
        k: Number of neighbours to return.
        x: Indexable collection of training points.
        y: Labels, indexed in parallel with ``x``.
        xnew: Query point.

    Returns:
        A list of ``k`` labels ordered from nearest to farthest (Euclidean
        distance). The sort is stable, so points at equal distance keep
        their original training order.
    """
    # Pair every training label with that point's distance to the query.
    labelled = [(y[idx], euclid_dist(x[idx], xnew)) for idx in range(len(x))]
    ranked = sorted(labelled, key=lambda pair: pair[1])
    return [ranked[pos][0] for pos in range(k)]

In [68]:
def KNN_predict(k,X_train,y_train,xtest):
    """Predict a label for every point in ``xtest`` by k-NN majority vote.

    For each test point the ``k`` nearest training labels are fetched with
    ``KNN`` and the most frequent one is chosen. On a tie, ``max`` keeps the
    first label that reaches the top count, i.e. the one whose first
    occurrence is closest to the query.

    Returns:
        A list with one predicted label per element of ``xtest``.
    """
    def majority(labels):
        # Most frequent label; the earliest one wins a tie.
        return max(labels, key=labels.count)

    return [majority(KNN(k, X_train, y_train, sample)) for sample in xtest]

In [13]:
# Read the ARFF file from the working directory; only the first element of
# the returned value (the records) is used to build the DataFrame.
data = arff.loadarff('veh-prime.arff')
df = pd.DataFrame(data[0])

In [14]:
def pearson(x,y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Values are paired by position, so pandas Series are accepted regardless
    of their index labels. The computation centres both inputs before
    summing (two-pass formula), which avoids the catastrophic cancellation
    of ``E[x^2] - E[x]^2`` when the data carry a large offset.

    Args:
        x: 1-D array-like of numbers.
        y: 1-D array-like of numbers with the same length as ``x``.

    Returns:
        The correlation as a float in ``[-1, 1]`` (up to rounding), or
        ``nan`` when either input has zero variance.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    if xs.shape != ys.shape:
        raise ValueError("x and y must have the same shape")
    if xs.size == 0:
        raise ValueError("x and y must not be empty")

    # Deviations from the mean; the 1/N factors cancel in the final ratio.
    dev_x = xs - xs.mean()
    dev_y = ys - ys.mean()
    denom = np.sqrt(np.sum(dev_x * dev_x) * np.sum(dev_y * dev_y))
    if denom == 0:
        # Correlation is undefined for a constant input.
        return np.nan
    return np.sum(dev_x * dev_y) / denom


In [15]:
# Encode the raw CLASS labels as integers: "car" -> 1, "noncar" -> 0.
# The dtype guard keeps this cell safe to re-run: once the column is numeric
# there is nothing left to decode and `.str.decode` would raise.
if not pd.api.types.is_numeric_dtype(df['CLASS']):
    df['CLASS'] = df['CLASS'].str.decode('ascii').map({"car": 1, "noncar": 0})

In [16]:
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f27,f28,f29,f30,f31,f32,f33,f34,f35,CLASS
0,0.063,0.160,0.509,-0.967,0.058,0.000,0.874,0.271,1.307,-0.011,...,-0.924,-0.077,0.108,-0.003,0.381,-0.314,0.929,0.184,-0.001,0
1,-0.037,-0.325,-0.626,-0.029,0.121,-0.409,-0.002,-0.835,-0.595,-0.253,...,0.270,0.533,0.152,-0.978,0.157,0.011,-0.254,0.453,-0.621,0
2,-0.000,1.253,0.833,-0.970,1.516,0.014,-0.378,1.197,0.546,-0.402,...,-0.408,1.550,0.010,-0.652,-0.403,-0.151,0.000,0.049,-0.113,1
3,-0.743,-0.082,-0.626,0.723,-0.006,-0.000,-0.080,-0.297,0.166,0.311,...,0.819,-0.077,-0.099,-0.001,-0.291,1.633,0.686,1.528,-0.000,0
4,-0.939,-1.054,-0.140,0.036,-0.766,0.000,-0.272,1.077,5.236,-0.366,...,0.676,0.533,-0.003,0.122,-0.179,-1.449,0.024,-1.698,0.083,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,0.072,-0.082,-0.950,0.359,0.311,0.368,-0.259,0.420,0.292,-0.070,...,0.005,0.127,0.380,-0.001,1.388,-0.151,0.291,-0.085,0.000,1
842,-0.555,-0.568,0.184,-0.920,0.121,0.883,-0.006,-0.177,0.546,0.282,...,0.039,-1.093,0.066,0.002,0.829,-0.476,0.924,0.184,0.170,0
843,-0.703,1.496,1.481,-0.529,1.199,-0.085,-0.980,1.585,0.673,-0.553,...,-0.704,-0.687,0.059,-0.294,-0.963,-0.314,-0.204,0.722,-0.386,1
844,0.296,-0.932,-1.436,0.113,-0.259,0.905,0.409,-0.685,-0.468,-0.053,...,-0.156,-1.297,-0.238,-0.000,1.388,0.173,0.016,-0.085,0.873,1


In [17]:
# Names of the 36 feature columns, 'f0' through 'f35'.
features = ['f' + str(idx) for idx in range(36)]

In [18]:
# Score each feature by the absolute value of its Pearson correlation with CLASS.
featdict = {name: np.abs(pearson(df[name], df['CLASS'])) for name in features}

In [19]:
# Build a one-row frame labelled 'R', then transpose so that each feature
# becomes a row and 'R' the single column.
featdf = pd.DataFrame(featdict, index=['R']).T

In [20]:
# Show the features ordered from largest to smallest 'R' value.
featdf.sort_values(by='R',ascending = False)

Unnamed: 0,R
f4,0.436922
f13,0.368269
f14,0.368224
f16,0.366025
f7,0.352141
f22,0.35135
f26,0.341043
f1,0.308811
f20,0.299049
f31,0.290783


In [28]:
# Feature names ordered from highest to lowest |r| with CLASS. Derived from
# `featdf` instead of being copied by hand from a cell output, so the ranking
# cannot drift out of sync with the data. A stable sort keeps tied features
# in their original column order.
feats = featdf.sort_values(by='R', ascending=False, kind='mergesort').index.tolist()

In [75]:
# Filter method: for every m, keep the m top-ranked features and measure the
# leave-one-out accuracy of 7-NN on them.
results = {}
loo = LeaveOneOut()  # the splitter holds no per-run state, so build it once
# `+ 1` so that the full feature set (m == len(feats)) is evaluated as well.
for m in range(1, len(feats) + 1):
  tempdf  = df[feats[:m]+['CLASS']]
  X,y = splitdf(tempdf)
  ytrue = []
  ypred = []
  for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # extend (not append) keeps both lists flat: one label per held-out row.
    ytrue.extend(y_test)
    ypred.extend(KNN_predict(7,X_train,y_train,X_test))
  results[m] = accuracy_score(ytrue,ypred)
resultdf = pd.DataFrame(results,index=['accuracy'])

In [80]:
# Show the feature counts m ordered from highest to lowest accuracy.
resultdf.T.sort_values(by='accuracy',ascending = False)

Unnamed: 0,accuracy
20,0.950355
18,0.946809
31,0.945626
29,0.945626
23,0.945626
24,0.944444
34,0.943262
30,0.943262
22,0.943262
33,0.94208


In [119]:
def SFS(df,features,selection,accuracy=0):



In [127]:
# Sequential forward selection: start with no features and, each round, add
# the single remaining feature that yields the best leave-one-out accuracy
# with 7-NN. Stop as soon as no candidate beats the current accuracy.
features = ['f' + str(idx) for idx in range(36)]
selection = []
accuracy = 0
loo = LeaveOneOut()  # the splitter holds no per-run state, so build it once
while True:
  # Fresh scores every round, so results only ever describes the current
  # candidates and never carries entries for already-selected features.
  results = {}
  for feat in features:
    print(feat,selection)
    tempselect = selection + [feat]
    tempdf  = df[tempselect+['CLASS']]
    X,y = splitdf(tempdf)
    ytrue = []
    ypred = []
    for train_index, test_index in loo.split(X):
      X_train, X_test = X[train_index], X[test_index]
      y_train, y_test = y[train_index], y[test_index]
      # extend (not append) keeps both lists flat: one label per held-out row.
      ytrue.extend(y_test)
      ypred.extend(KNN_predict(7,X_train,y_train,X_test))
    results[feat] = accuracy_score(ytrue,ypred)
    print(results[feat])
  if not results:
    # Every feature has already been selected; nothing left to try.
    break
  bestfeat = max(results, key=results.get)
  bestacc = results[bestfeat]
  if bestacc <= accuracy:
    # No strict improvement: keep the current selection and stop.
    break
  features.remove(bestfeat)
  selection.append(bestfeat)
  print(selection, bestacc)
  accuracy = bestacc

f0 []
0.508274231678487
f1 []
0.6335697399527187
f2 []
0.6749408983451537
f3 []
0.48817966903073284
f4 []
0.7222222222222222
f5 []
0.5130023640661938
f6 []
0.46099290780141844
f7 []
0.66548463356974
f8 []
0.5721040189125296
f9 []
0.5070921985815603
f10 []
0.5697399527186762
f11 []
0.46335697399527187
f12 []
0.5153664302600472
f13 []
0.7387706855791962
f14 []
0.7635933806146572
f15 []
0.5236406619385343
f16 []
0.710401891252955
f17 []
0.5094562647754137
f18 []
0.5047281323877069
f19 []
0.6442080378250591
f20 []
0.7434988179669031
f21 []
0.5023640661938534
f22 []
0.7411347517730497
f23 []
0.5106382978723404
f24 []
0.5118203309692672
f25 []
0.6382978723404256
f26 []
0.599290780141844
f27 []
0.5130023640661938
f28 []
0.5319148936170213
f29 []
0.5307328605200946
f30 []
0.5295508274231678
f31 []
0.6134751773049646
f32 []
0.5543735224586288
f33 []
0.5070921985815603
f34 []
0.5295508274231678
f35 []
0.5023640661938534
['f14'] 0.7635933806146572
f0 ['f14']
0.7411347517730497
f1 ['f14']
0.770685