In [1]:
# importing pandas
import pandas as pd
#fetch data and add headers
filepath = 'promise_CM1.csv'
column = ["loc","v(g)","ev(g)","n","v","l","d","i","e","b","t",
          "loCode","loComment","loBlank","locCodeAndComment",
          "uniq_Opnd","total_Op","branchCount","defects"]
df = pd.read_csv(filepath, names=column)

In [2]:
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,loc,v(g),ev(g),n,v,l,d,i,e,b,t,loCode,loComment,loBlank,locCodeAndComment,uniq_Opnd,total_Op,branchCount,defects
1.1,1.4,1.4,1.4,1.3,1.30,1.30,1.30,1.30,1.30,1.30,1.30,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1.0,1.0,1.0,1.0,1.0,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1,1,1,1,1.0,1.0,1.0,1.0,1.0,True
24.0,5.0,1.0,3.0,63.0,309.13,0.11,9.50,32.54,2936.77,0.10,163.15,1,0,6,0,15.0,15.0,44.0,19.0,9.0,False
20.0,4.0,4.0,2.0,47.0,215.49,0.06,16.00,13.47,3447.89,0.07,191.55,0,0,3,0,16.0,8.0,31.0,16.0,7.0,False
24.0,6.0,6.0,2.0,72.0,346.13,0.06,17.33,19.97,5999.58,0.12,333.31,0,0,3,0,16.0,12.0,46.0,26.0,11.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47.0,3.0,1.0,3.0,256.0,1563.78,0.04,28.00,55.85,43785.90,0.52,2432.55,2,13,2,0,23.0,46.0,144.0,112.0,5.0,True
24.0,4.0,3.0,3.0,107.0,587.63,0.05,19.13,30.72,11241.58,0.20,624.53,1,7,4,0,22.0,23.0,67.0,40.0,7.0,True
82.0,11.0,3.0,10.0,475.0,3155.83,0.02,44.71,70.59,141084.24,1.05,7838.01,9,59,35,0,32.0,68.0,285.0,190.0,21.0,True
10.0,2.0,1.0,1.0,32.0,150.41,0.15,6.50,23.14,977.69,0.05,54.32,1,12,4,0,13.0,13.0,19.0,13.0,3.0,True


In [5]:
X = df.iloc[:,0:-1] #features
y = df.iloc[:,-1] #labels

In [9]:
#exhaustive filtering using K nearest neighbour
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn, 
           min_features=1,
           max_features=5,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 12615/12615

Best accuracy score: 0.90
Best subset (indices): (0, 3, 9, 17)
Best subset (corresponding names): ('loc', 'n', 'b', 'branchCount')


In [7]:
df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1274,"(0, 3, 9, 17)","[0.87, 0.9, 0.9, 0.92, 0.9183673469387755]",0.901673,"(loc, n, b, branchCount)",0.0231592,0.0180186,0.00900932
12,"(12,)","[0.9, 0.9, 0.9, 0.9, 0.9081632653061225]",0.901633,"(loComment,)",0.00419687,0.00326531,0.00163265
3,"(3,)","[0.9, 0.9, 0.9, 0.9, 0.9081632653061225]",0.901633,"(n,)",0.00419687,0.00326531,0.00163265
74,"(3, 12)","[0.9, 0.9, 0.9, 0.9, 0.9081632653061225]",0.901633,"(n, loComment)",0.00419687,0.00326531,0.00163265
1518,"(0, 7, 9, 17)","[0.88, 0.9, 0.9, 0.89, 0.9183673469387755]",0.897673,"(loc, i, b, branchCount)",0.0163621,0.0127302,0.00636512
...,...,...,...,...,...,...,...
296,"(0, 12, 17)","[0.9, 0.52, 0.62, 0.69, 0.5408163265306123]",0.654163,"(loc, loComment, branchCount)",0.175981,0.136919,0.0684596
34,"(0, 17)","[0.9, 0.52, 0.62, 0.69, 0.5408163265306123]",0.654163,"(loc, branchCount)",0.175981,0.136919,0.0684596
17,"(17,)","[0.9, 0.51, 0.63, 0.55, 0.46938775510204084]",0.611878,"(branchCount,)",0.197328,0.153528,0.076764
160,"(12, 17)","[0.9, 0.51, 0.63, 0.55, 0.46938775510204084]",0.611878,"(loComment, branchCount)",0.197328,0.153528,0.076764


In [8]:
import matplotlib.pyplot as plt

metric_dict = efs1.get_metric_dict()

fig = plt.figure()
k_feat = sorted(metric_dict.keys())
avg = [metric_dict[k]['avg_score'] for k in k_feat]

upper, lower = [], []
for k in k_feat:
    upper.append(metric_dict[k]['avg_score'] +
                 metric_dict[k]['std_dev'])
    lower.append(metric_dict[k]['avg_score'] -
                 metric_dict[k]['std_dev'])

plt.fill_between(k_feat,
                 upper,
                 lower,
                 alpha=0.2,
                 color='blue',
                 lw=1)

plt.plot(k_feat, avg, color='blue', marker='o')
plt.ylabel('Accuracy +/- Standard Deviation')
plt.xlabel('Number of Features')
feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
plt.xticks(k_feat, 
           [str(metric_dict[k]['feature_names']) for k in k_feat], 
           rotation=90)
plt.show()

<matplotlib.figure.Figure at 0x11efb0828>