In [11]:
# importing pandas
import pandas as pd
from sklearn.model_selection import train_test_split

#fetch data and add headers
filepath = 'promise_CM1.csv'
column = ["loc","v(g)","ev(g)","n","v","l","d","i","e","b","t",
          "loCode","loComment","loBlank","locCodeAndComment",
          "uniq_Opnd","total_Op","branchCount","defects"]
df = pd.read_csv(filepath, names=column)

In [12]:
X = df.iloc[:,0:-1] #features
y = df.iloc[:,-1] #labels

#divide dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [13]:
#exhaustive filtering using K nearest neighbour
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn, 
           min_features=1,
           max_features=4,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

#print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 4047/4047

Best subset (indices): (0, 3, 9, 17)
Best subset (corresponding names): ('loc', 'n', 'b', 'branchCount')


In [14]:
# Generate the new subsets based on the selected features
# Note that the transform call is equivalent to
# X_train[:, efs1.k_feature_idx_]

X_train_efs = efs1.transform(X_train)
X_test_efs = efs1.transform(X_test)

# Fit the estimator using the new feature subset
# and make a prediction on the test data
knn.fit(X_train_efs, y_train)
y_pred = knn.predict(X_test_efs)

# Compute the accuracy of the prediction
acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc*100))

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

accuracy_score = accuracy_score(y_test,y_pred)
precision_score = precision_score(y_test,y_pred)
recall_score = recall_score(y_test,y_pred)

print("Accuracy: ", accuracy_score)
print("Precision:",precision_score)
print("Recall:",recall_score)

Test set accuracy: 88.00 %
Accuracy:  0.88
Precision: 0.5
Recall: 0.1111111111111111


In [5]:
df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1274,"(0, 3, 9, 17)","[0.87, 0.9, 0.9, 0.92, 0.9183673469387755]",0.901673,"(loc, n, b, branchCount)",0.0231592,0.0180186,0.00900932
12,"(12,)","[0.9, 0.9, 0.9, 0.9, 0.9081632653061225]",0.901633,"(loComment,)",0.00419687,0.00326531,0.00163265
3,"(3,)","[0.9, 0.9, 0.9, 0.9, 0.9081632653061225]",0.901633,"(n,)",0.00419687,0.00326531,0.00163265
74,"(3, 12)","[0.9, 0.9, 0.9, 0.9, 0.9081632653061225]",0.901633,"(n, loComment)",0.00419687,0.00326531,0.00163265
1518,"(0, 7, 9, 17)","[0.88, 0.9, 0.9, 0.89, 0.9183673469387755]",0.897673,"(loc, i, b, branchCount)",0.0163621,0.0127302,0.00636512
...,...,...,...,...,...,...,...
296,"(0, 12, 17)","[0.9, 0.52, 0.62, 0.69, 0.5408163265306123]",0.654163,"(loc, loComment, branchCount)",0.175981,0.136919,0.0684596
34,"(0, 17)","[0.9, 0.52, 0.62, 0.69, 0.5408163265306123]",0.654163,"(loc, branchCount)",0.175981,0.136919,0.0684596
17,"(17,)","[0.9, 0.51, 0.63, 0.55, 0.46938775510204084]",0.611878,"(branchCount,)",0.197328,0.153528,0.076764
160,"(12, 17)","[0.9, 0.51, 0.63, 0.55, 0.46938775510204084]",0.611878,"(loComment, branchCount)",0.197328,0.153528,0.076764


In [1]:
# import matplotlib.pyplot as plt

# metric_dict = efs1.get_metric_dict()

# fig = plt.figure()
# k_feat = sorted(metric_dict.keys())
# avg = [metric_dict[k]['avg_score'] for k in k_feat]

# upper, lower = [], []
# for k in k_feat:
#     upper.append(metric_dict[k]['avg_score'] +
#                  metric_dict[k]['std_dev'])
#     lower.append(metric_dict[k]['avg_score'] -
#                  metric_dict[k]['std_dev'])

# plt.fill_between(k_feat,
#                  upper,
#                  lower,
#                  alpha=0.2,
#                  color='blue',
#                  lw=1)

# plt.plot(k_feat, avg, color='blue', marker='o')
# plt.ylabel('Accuracy +/- Standard Deviation')
# plt.xlabel('Number of Features')
# feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
# feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
# plt.xticks(k_feat, 
#            [str(metric_dict[k]['feature_names']) for k in k_feat], 
#            rotation=90)
# plt.show()