In [1]:
# importing pandas
import pandas as pd
from sklearn.model_selection import train_test_split

#fetch data and add headers
filepath = 'promise_CM1.csv'
column = ["loc","v(g)","ev(g)","n","v","l","d","i","e","b","t",
          "loCode","loComment","loBlank","locCodeAndComment",
          "uniq_Opnd","total_Op","branchCount","defects"]
df = pd.read_csv(filepath, names=column)

In [3]:
X = df.iloc[:,0:-1] #features
y = df.iloc[:,-1] #labels

#divide dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

In [4]:
#exhaustive filtering using K nearest neighbour
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn, 
           min_features=3,
           max_features=5,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

#print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 12341/12444

Best subset (indices): (0, 3, 9, 17)
Best subset (corresponding names): ('loc', 'n', 'b', 'branchCount')


In [5]:
# Generate the new subsets based on the selected features
# Note that the transform call is equivalent to
# X_train[:, efs1.k_feature_idx_]

X_train_efs = efs1.transform(X_train)
X_test_efs = efs1.transform(X_test)

# Fit the estimator using the new feature subset
# and make a prediction on the test data
knn.fit(X_train_efs, y_train)
y_pred = knn.predict(X_test_efs)

# Compute the accuracy of the prediction
acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc*100))

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

accuracy_score = accuracy_score(y_test,y_pred)
precision_score = precision_score(y_test,y_pred)
recall_score = recall_score(y_test,y_pred)

print("Accuracy: ", accuracy_score)
print("Precision:",precision_score)
print("Recall:",recall_score)

Test set accuracy: 96.00 %
Accuracy:  0.96
Precision: 1.0
Recall: 0.5


In [6]:
df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
5188,"(0, 3, 9, 12, 17)","[0.87, 0.9, 0.9, 0.92, 0.9183673469387755]",0.901673,"(loc, n, b, loComment, branchCount)",0.0231592,0.0180186,0.00900932
1103,"(0, 3, 9, 17)","[0.87, 0.9, 0.9, 0.92, 0.9183673469387755]",0.901673,"(loc, n, b, branchCount)",0.0231592,0.0180186,0.00900932
5106,"(0, 3, 7, 9, 17)","[0.87, 0.9, 0.9, 0.91, 0.9183673469387755]",0.899673,"(loc, n, i, b, branchCount)",0.0210162,0.0163513,0.00817565
6236,"(0, 11, 13, 16, 17)","[0.89, 0.89, 0.9, 0.91, 0.9081632653061225]",0.899633,"(loc, loCode, loBlank, total_Op, branchCount)",0.010996,0.00855528,0.00427764
5979,"(0, 7, 9, 12, 17)","[0.88, 0.9, 0.9, 0.89, 0.9183673469387755]",0.897673,"(loc, i, b, loComment, branchCount)",0.0163621,0.0127302,0.00636512
...,...,...,...,...,...,...,...
9868,"(3, 5, 9, 10, 16)","[0.79, 0.83, 0.8, 0.86, 0.9081632653061225]",0.837633,"(n, l, b, t, total_Op)",0.0551874,0.0429377,0.0214688
11535,"(5, 9, 10, 12, 16)","[0.79, 0.83, 0.8, 0.86, 0.9081632653061225]",0.837633,"(l, b, t, loComment, total_Op)",0.0551874,0.0429377,0.0214688
5682,"(0, 5, 9, 10, 16)","[0.79, 0.83, 0.79, 0.86, 0.9081632653061225]",0.835633,"(loc, l, b, t, total_Op)",0.0576259,0.0448349,0.0224175
11546,"(5, 9, 10, 16, 17)","[0.79, 0.83, 0.8, 0.85, 0.8979591836734694]",0.833592,"(l, b, t, total_Op, branchCount)",0.049626,0.0386107,0.0193054
