In [1]:
# importing pandas
import pandas as pd
#fetch data and add headers
filepath = 'promise_CM1.csv'
column = ["loc","v(g)","ev(g)","n","v","l","d","i","e","b","t",
          "loCode","loComment","loBlank","locCodeAndComment",
          "uniq_Opnd","total_Op","branchCount","defects"]
df = pd.read_csv(filepath, names=column)

In [2]:
X = df.iloc[:,0:-1] #features
y = df.iloc[:,-1] #labels

In [3]:
#divide dataset into train and test
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
# model for Gausian NB

gnb = GaussianNB().fit(X_train, y_train)

#predict model on test data
#predicted = gnb.predict(X_test)

In [4]:
#exhaustive filtering using K nearest neighbour
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS



efs1 = EFS(gnb, 
           min_features=3,
           max_features=5,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

#print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 12444/12444

Best subset (indices): (3, 6, 10)
Best subset (corresponding names): ('n', 'd', 't')


In [5]:
X_train_efs = efs1.transform(X_train)
X_test_efs = efs1.transform(X_test)

gnb.fit(X_train_efs,y_train)
y_pred = gnb.predict(X_test_efs)

acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc*100))

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

precision_score = precision_score(y_test,y_pred)
recall_score = recall_score(y_test,y_pred)

print("Precision:",precision_score)
print("Recall:",recall_score)

Test set accuracy: 92.00 %
Precision: 1.0
Recall: 0.5


In [6]:
df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score',inplace=True, ascending = False)
df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
705,"(8, 10, 12)","[0.9, 0.92, 0.91, 0.88, 0.9081632653061225]",0.903633,"(e, t, loComment)",0.0172503,0.0134213,0.00671066
9965,"(3, 6, 7, 10, 12)","[0.9, 0.92, 0.91, 0.88, 0.9081632653061225]",0.903633,"(n, d, i, t, loComment)",0.0172503,0.0134213,0.00671066
2657,"(3, 6, 7, 10)","[0.9, 0.92, 0.91, 0.88, 0.9081632653061225]",0.903633,"(n, d, i, t)",0.0172503,0.0134213,0.00671066
389,"(3, 6, 10)","[0.9, 0.92, 0.91, 0.88, 0.9081632653061225]",0.903633,"(n, d, t)",0.0172503,0.0134213,0.00671066
598,"(6, 7, 10)","[0.9, 0.92, 0.91, 0.88, 0.9081632653061225]",0.903633,"(d, i, t)",0.0172503,0.0134213,0.00671066
...,...,...,...,...,...,...,...
9457,"(3, 4, 5, 7, 13)","[0.82, 0.85, 0.85, 0.79, 0.8673469387755102]",0.835469,"(n, v, l, i, loBlank)",0.0351767,0.0273687,0.0136843
8083,"(2, 3, 4, 5, 13)","[0.82, 0.85, 0.85, 0.79, 0.8673469387755102]",0.835469,"(ev(g), n, v, l, loBlank)",0.0351767,0.0273687,0.0136843
9500,"(3, 4, 5, 13, 17)","[0.83, 0.85, 0.85, 0.78, 0.8673469387755102]",0.835469,"(n, v, l, loBlank, branchCount)",0.038752,0.0301504,0.0150752
6718,"(1, 3, 4, 5, 13)","[0.83, 0.85, 0.85, 0.78, 0.8673469387755102]",0.835469,"(v(g), n, v, l, loBlank)",0.038752,0.0301504,0.0150752
