In [2]:
# importing pandas
import pandas as pd
#fetch data and add headers
filepath = 'promise_KC3.csv'
column = ["loc_blank","branch_count","call_pairs","loc_code_and_comment",
          "loc_comments","condition_count","cyclomatic_complexity",
          "cyclomatic_design","decision_count","decision_density","design_complexity",
          "design_density","edge_count","essential_complexity","essential_density",
          "loc_executable","parameter_count","global_data_complexity","global_data_density",
          "halstead_content","halstead_difficulty","halstead_effort","halstead_error_est",
         "halstead_length","halstead_level","halstead_prog_time","halstead_volumn",
         "maintenance_severity","modified_condition_count","multiple_condition_count",
          "node_count","normalized_cyclomatic_complexity","num_operands","num_operators",
          "num_unique_operands","num_unique_operators","number_of_lines","percent_comments",
          "loc_total","defects"]
    
df = pd.read_csv(filepath, names=column)

In [3]:
X = df.iloc[:,0:-1] #features
y = df.iloc[:,-1] #labels

In [4]:
#divide dataset into train and test
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
# model for Gausian NB

gnb = GaussianNB().fit(X_train, y_train)

#predict model on test data
#predicted = gnb.predict(X_test)

In [5]:
#exhaustive filtering using K nearest neighbour
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

efs1 = EFS(gnb, 
           min_features=3,
           max_features=5,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

#print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 666561/667147

Best subset (indices): (1, 3, 7, 9)
Best subset (corresponding names): ('branch_count', 'loc_code_and_comment', 'cyclomatic_design', 'decision_density')


In [7]:
X_train_efs = efs1.transform(X_train)
X_test_efs = efs1.transform(X_test)

gnb.fit(X_train_efs,y_train)
y_pred = gnb.predict(X_test_efs)

acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc*100))

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

precision_score = precision_score(y_test,y_pred)
recall_score = recall_score(y_test,y_pred)

print("Precision:",precision_score)
print("Recall:",recall_score)

Test set accuracy: 100.00 %
Precision: 1.0
Recall: 1.0


In [8]:
df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score',inplace=True, ascending = False)
df

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
18305,"(1, 3, 7, 9)","[0.9021739130434783, 0.9347826086956522, 0.923...",0.912613,"(branch_count, loc_code_and_comment, cyclomati...",0.0201985,0.0157151,0.00785755
18386,"(1, 3, 9, 31)","[0.9130434782608695, 0.9347826086956522, 0.913...",0.912613,"(branch_count, loc_code_and_comment, decision_...",0.0181634,0.0141317,0.00706586
307048,"(3, 7, 9, 12, 16)","[0.9130434782608695, 0.9347826086956522, 0.923...",0.91259,"(loc_code_and_comment, cyclomatic_design, deci...",0.0203898,0.015864,0.00793199
315757,"(3, 9, 11, 25, 35)","[0.9130434782608695, 0.9347826086956522, 0.923...",0.91259,"(loc_code_and_comment, decision_density, desig...",0.0239862,0.0186621,0.00933103
317031,"(3, 9, 16, 18, 36)","[0.9130434782608695, 0.9347826086956522, 0.923...",0.91259,"(loc_code_and_comment, decision_density, param...",0.0203898,0.015864,0.00793199
...,...,...,...,...,...,...,...
56901,"(7, 11, 24, 31)","[0.5869565217391305, 0.6630434782608695, 0.684...",0.628715,"(cyclomatic_design, design_density, halstead_l...",0.0497465,0.0387045,0.0193522
58286,"(7, 16, 24, 31)","[0.5652173913043478, 0.6847826086956522, 0.673...",0.626541,"(cyclomatic_design, parameter_count, halstead_...",0.0585552,0.0455579,0.022779
4545,"(7, 24, 31)","[0.5760869565217391, 0.6847826086956522, 0.673...",0.626517,"(cyclomatic_design, halstead_level, normalized...",0.0568154,0.0442043,0.0221022
489534,"(7, 16, 24, 31, 37)","[0.5543478260869565, 0.6739130434782609, 0.663...",0.624415,"(cyclomatic_design, parameter_count, halstead_...",0.0545749,0.0424611,0.0212305
