In [1]:
# importing pandas
import pandas as pd
from sklearn.model_selection import train_test_split

#fetch data and add headers
filepath = 'promise_KC3.csv'
column = ["loc_blank","branch_count","call_pairs","loc_code_and_comment",
          "loc_comments","condition_count","cyclomatic_complexity",
          "cyclomatic_design","decision_count","decision_density","design_complexity",
          "design_density","edge_count","essential_complexity","essential_density",
          "loc_executable","parameter_count","global_data_complexity","global_data_density",
          "halstead_content","halstead_difficulty","halstead_effort","halstead_error_est",
         "halstead_length","halstead_level","halstead_prog_time","halstead_volumn",
         "maintenance_severity","modified_condition_count","multiple_condition_count",
          "node_count","normalized_cyclomatic_complexity","num_operands","num_operators",
          "num_unique_operands","num_unique_operators","number_of_lines","percent_comments",
          "loc_total","defects"]
    
df = pd.read_csv(filepath, names=column)

In [2]:
X = df.iloc[:,0:-1] #features
y = df.iloc[:,-1] #labels

#divide dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

In [3]:
#exhaustive filtering using K nearest neighbour
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn, 
           min_features=3,
           max_features=5,
           scoring='accuracy',
           print_progress=True,
           cv=5)

efs1 = efs1.fit(X, y)

#print('Best accuracy score: %.2f' % efs1.best_score_)
print('Best subset (indices):', efs1.best_idx_)
print('Best subset (corresponding names):', efs1.best_feature_names_)

Features: 667147/667147

Best subset (indices): (0, 3, 4, 8, 17)
Best subset (corresponding names): ('loc_blank', 'loc_code_and_comment', 'loc_comments', 'decision_count', 'global_data_complexity')


In [6]:
# Generate the new subsets based on the selected features
# Note that the transform call is equivalent to
# X_train[:, efs1.k_feature_idx_]

X_train_efs = efs1.transform(X_train)
X_test_efs = efs1.transform(X_test)

# Fit the estimator using the new feature subset
# and make a prediction on the test data
knn.fit(X_train_efs, y_train)
y_pred = knn.predict(X_test_efs)

# Compute the accuracy of the prediction
acc = float((y_test == y_pred).sum()) / y_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc*100))

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

accuracy_score = accuracy_score(y_test,y_pred)
precision_score = precision_score(y_test,y_pred)
recall_score = recall_score(y_test,y_pred)

print("Accuracy: ", accuracy_score)
print("Precision:",precision_score)
print("Recall:",recall_score)

Test set accuracy: 95.65 %
Accuracy:  0.9565217391304348
Precision: 0.0
Recall: 0.0


In [5]:
df = pd.DataFrame.from_dict(efs1.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df

KeyboardInterrupt: 