In [1]:
import pandas as pd
from scipy.io import arff
import matplotlib.pyplot as plt 
import numpy as np 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import tree
from sklearn.metrics import confusion_matrix
import seaborn as sn
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.xmeans import xmeans, splitting_type
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from ensemble_models import adaBoost
from ensemble_models import bagging
from implicitModel import Implicit_ME
from explicitModel import Explicit_ME
import warnings
warnings.filterwarnings('ignore')

In [2]:
def DecisionTree(X_train,y_train,X_test,y_test):
    """Fit an entropy decision tree with random splits and score it on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"F1_score"`` (macro average), ``"AUC_score"``,
        ``"Prediction"`` (predicted labels for ``X_test``) and ``"MCC"``.
    """
    clf = DecisionTreeClassifier(criterion="entropy", splitter="random").fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)

    # metrics
    # NOTE: the AUC is computed from the hard class predictions, not from
    # probability scores, so it only reflects a single operating point.
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": metrics.roc_auc_score(y_test, y_pred_test),
      "Prediction": y_pred_test,
      "MCC" : metrics.matthews_corrcoef(y_test, y_pred_test)
    }
    return result

In [3]:
def SVM(X_train,y_train,X_test,y_test):
    """Fit a default ``svm.SVC`` and score it on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"F1_score"`` (macro average), ``"AUC_score"``,
        ``"Prediction"`` (predicted labels for ``X_test``) and ``"MCC"``.
    """
    clf = svm.SVC().fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)

    # metrics
    # NOTE: the AUC is computed from the hard class predictions, not from
    # decision scores, so it only reflects a single operating point.
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": metrics.roc_auc_score(y_test, y_pred_test),
      "Prediction": y_pred_test,
      "MCC" : metrics.matthews_corrcoef(y_test, y_pred_test)
    }
    return result

In [4]:
def KNN(X_train,y_train,X_test,y_test):
    """Fit a 2-nearest-neighbours classifier and score it on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"F1_score"`` (macro average), ``"AUC_score"``,
        ``"Prediction"`` (predicted labels for ``X_test``) and ``"MCC"``.
    """
    clf = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)

    # metrics
    # NOTE: the AUC is computed from the hard class predictions, not from
    # probability scores, so it only reflects a single operating point.
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": metrics.roc_auc_score(y_test, y_pred_test),
      "Prediction": y_pred_test,
      "MCC" : metrics.matthews_corrcoef(y_test, y_pred_test)
    }
    return result

In [5]:
def Logistic(X_train,y_train,X_test,y_test):
    """Fit an unpenalised logistic regression (100 iterations max) and score it.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"F1_score"`` (macro average), ``"AUC_score"``,
        ``"Prediction"`` (predicted labels for ``X_test``) and ``"MCC"``.
    """
    import re
    import sklearn

    # scikit-learn 1.2 introduced ``penalty=None`` and deprecated the string
    # 'none' (removed in 1.4). Pick the spelling the installed release accepts
    # so the model is unpenalised on old and new versions alike.
    version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
    accepts_none = (
        version is not None
        and (int(version.group(1)), int(version.group(2))) >= (1, 2)
    )
    no_penalty = None if accepts_none else 'none'

    clf = LogisticRegression(max_iter=100, penalty=no_penalty).fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)

    # metrics
    # NOTE: the AUC is computed from the hard class predictions, not from
    # probability scores, so it only reflects a single operating point.
    result = {
      "Accuracy": metrics.accuracy_score(y_test, y_pred_test),
      "F1_score": f1_score(y_test, y_pred_test, average='macro'),
      "AUC_score": metrics.roc_auc_score(y_test, y_pred_test),
      "Prediction": y_pred_test,
      "MCC" : metrics.matthews_corrcoef(y_test, y_pred_test)
    }
    return result

In [6]:
def gate_ntw(X_train,y_train,X_test,y_test):
    """Gate network: soft-voting ensemble of a tree, an SVM, a KNN and a logistic model.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the ensemble.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"F1_score"`` (macro average), ``"AUC_score"``,
        ``"Prediction"`` (ensemble labels for ``X_test``) and ``"MCC"``.
    """
    # VotingClassifier.fit fits its own clones of the estimators, so the base
    # models are passed unfitted; fitting them beforehand only doubled the
    # training cost (notably for the probability-calibrated SVC).
    members = [
        ('DecisionTree', DecisionTreeClassifier(criterion="entropy", splitter="best")),
        ('SVM', svm.SVC(probability=True)),  # probability=True is required for soft voting
        ('KNN', KNeighborsClassifier(n_neighbors=2)),
        ('Logistic', LogisticRegression(max_iter=10000)),
    ]
    voting_clf = VotingClassifier(estimators=members, voting='soft')
    voting_clf.fit(X_train, y_train)
    final_predictions = voting_clf.predict(X_test)

    # NOTE: the AUC is computed from the hard ensemble predictions, matching
    # how the individual experts in this notebook are scored.
    result = {
      "Accuracy": metrics.accuracy_score(y_test, final_predictions),
      "F1_score": f1_score(y_test, final_predictions, average='macro'),
      "AUC_score": metrics.roc_auc_score(y_test, final_predictions),
      "Prediction": final_predictions,
      "MCC" : metrics.matthews_corrcoef(y_test, final_predictions)
    }
    return result
    

In [7]:
def main(data,name):
    """Run the full preprocessing + model comparison for one dataset.

    Parameters
    ----------
    data
        Object whose first element (``data[0]``) can be turned into a
        DataFrame; the caller passes the result of ``arff.loadarff``.
    name
        Dataset identifier. ``"AEEEM"`` / ``"JIRA"`` in the name select the
        label column (``class`` / ``RealBugCount``); anything else uses
        ``defects``.

    Returns
    -------
    dict
        ``"acc"``, ``"f1"``, ``"auc"`` and ``"mcc"``, each a list of eight
        scores in the order: decision tree, SVM, KNN, logistic regression,
        bagging, AdaBoost, implicit mixture of experts, explicit mixture of
        experts.
    """
    # Data preprocessing: derive a binary label (1 = defective, 0 = clean).
    df = pd.DataFrame(data[0])
    if "AEEEM" in name:
        label_col = 'class'
        # The column holds byte strings, hence the decode before mapping.
        y = df[label_col].str.decode("utf-8").map({'buggy': 1, 'clean': 0})
    elif "JIRA" in name:
        label_col = 'RealBugCount'
        y = df[label_col].gt(0).astype(int)
    else:
        label_col = 'defects'
        y = df[label_col].gt(0).astype(int)

    # Drop the label by name rather than by position so it can never end up
    # among the features; identical to ``iloc[:, :-1]`` when the label is the
    # last column.
    X = df.drop(columns=[label_col]).values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Scaler and feature selector are fitted on the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    selector = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
    selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    # Oversample the training split only; the test split keeps its original balance.
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    # Models
    experts = [DecisionTree,KNN,SVM,Logistic]
    impME = Implicit_ME(X_train,y_train,X_test,y_test,5,experts,gate_ntw)
    expME = Explicit_ME(X_train,y_train,X_test,y_test,experts,gate_ntw)
    # Result names carry a suffix so they do not shadow the ``svm`` module.
    dt_res = DecisionTree(X_train,y_train,X_test,y_test)
    svm_res = SVM(X_train,y_train,X_test,y_test)
    knn_res = KNN(X_train,y_train,X_test,y_test)
    log_res = Logistic(X_train,y_train,X_test,y_test)
    bag_res = bagging(X_train,y_train,X_test,y_test)
    ada_res = adaBoost(X_train,y_train,X_test,y_test)

    # Results: one list per metric, always in the same model order.
    ordered = [dt_res, svm_res, knn_res, log_res, bag_res, ada_res, impME, expME]
    res = {
        "acc" : [r["Accuracy"] for r in ordered],
        "f1" : [r["F1_score"] for r in ordered],
        "auc" : [r["AUC_score"] for r in ordered],
        "mcc" : [r["MCC"] for r in ordered]
    }
    return res

In [8]:
import os

# Output folder for all CSV files and number of repeated executions to average.
RESULTS_DIR = "Results"
N_RUNS = 10

datasets = [
    {"path": '../dataSet/AEEEM/EQ.arff', "name" : 'AEEEM-EQ' },
    {"path": '../dataSet/AEEEM/JDT.arff', "name" : 'AEEEM-JDT' },
    {"path": '../dataSet/AEEEM/Lucene.arff', "name" : 'AEEEM-LUCENE' },
    {"path": '../dataSet/AEEEM/Mylyn.arff', "name" : 'AEEEM-MYLYN' },
    {"path": '../dataSet/AEEEM/PDE.arff', "name" : 'AEEEM-PDE' },

    {"path": '../dataSet/JIRA/activemq-5.0.0.arff', "name" : 'JIRA-ACTIVEMQ' },
    {"path": '../dataSet/JIRA/derby-10.5.1.1.arff', "name" : 'JIRA-DERBY' },
    {"path": '../dataSet/JIRA/groovy-1_6_BETA_1.arff', "name" : 'JIRA-GROOVY' },
    {"path": '../dataSet/JIRA/hbase-0.94.0.arff', "name" : 'JIRA-HBASE' },
    {"path": '../dataSet/JIRA/hive-0.9.0.arff', "name" : 'JIRA-HIVE' },
    {"path": '../dataSet/JIRA/jruby-1.1.arff', "name" : 'JIRA-JRUBY' },
    {"path": '../dataSet/JIRA/wicket-1.3.0-beta2.arff', "name" : 'JIRA-WICKET' },

    {"path": '../dataSet/PROMISE/ant-1.7.arff', "name" : 'PROMISE-ANT' },
    {"path": '../dataSet/PROMISE/camel-1.4.arff', "name" : 'PROMISE-CAMEL' },
    {"path": '../dataSet/PROMISE/ivy-2.0.arff', "name" : 'PROMISE-IVY' },
    {"path": '../dataSet/PROMISE/jedit-4.0.arff', "name" : 'PROMISE-JEDIT' },
    {"path": '../dataSet/PROMISE/log4j-1.0.arff', "name" : 'PROMISE-LOG' },
    {"path": '../dataSet/PROMISE/poi-2.0.arff', "name" : 'PROMISE-POI' },
    {"path": '../dataSet/PROMISE/tomcat.arff', "name" : 'PROMISE-TOMCAT' },
    {"path": '../dataSet/PROMISE/velocity-1.6.arff', "name" : 'PROMISE-VELOCITY' },
    {"path": '../dataSet/PROMISE/xalan-2.4.arff', "name" : 'PROMISE-XALAN' },
    {"path": '../dataSet/PROMISE/xerces-1.3.arff', "name" : 'PROMISE-XERCES' },
]

# Create the output folder up front so a long run cannot die on the first to_csv.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Parse every ARFF file once instead of once per run; main() only reads the
# parsed data, so the same objects can be reused by all runs.
loaded = [(entry["name"], arff.loadarff(entry["path"])) for entry in datasets]

# One entry per run; each entry is a list with one row of scores per dataset.
mean_acc_all = []
mean_f1_all = []
mean_auc_all = []
mean_mcc_all = []
for i in range(N_RUNS):
    acc_all = []
    f1_all = []
    auc_all = []
    mcc_all = []
    for dataset_name, data in loaded:
        res = main(data, dataset_name)
        acc_all.append(res["acc"])
        f1_all.append(res["f1"])
        auc_all.append(res["auc"])
        mcc_all.append(res["mcc"])

    mean_acc_all.append(acc_all)
    mean_f1_all.append(f1_all)
    mean_auc_all.append(auc_all)
    mean_mcc_all.append(mcc_all)

    # Per-run tables: rows are datasets (in `datasets` order), columns are models.
    per_run = (
        ("accuracy", acc_all),
        ("f1_score", f1_all),
        ("auc_score", auc_all),
        ("mcc_score", mcc_all),
    )
    for stem, rows in per_run:
        pd.DataFrame(rows).to_csv(os.path.join(RESULTS_DIR, "{}{}.csv".format(stem, i + 1)))
    print("Execution "+str(i+1) +" done")

print(mean_acc_all)
# Average over the run axis (axis 0) to get one dataset x model table per metric.
DF_acc = pd.DataFrame(np.mean(np.array(mean_acc_all), axis=0))
DF_f1 = pd.DataFrame(np.mean(np.array(mean_f1_all), axis=0))
DF_auc = pd.DataFrame(np.mean(np.array(mean_auc_all), axis=0))
DF_mcc = pd.DataFrame(np.mean(np.array(mean_mcc_all), axis=0))
DF_acc.to_csv(os.path.join(RESULTS_DIR, "mean_accuracy.csv"))
DF_f1.to_csv(os.path.join(RESULTS_DIR, "mean_f1_score.csv"))
DF_auc.to_csv(os.path.join(RESULTS_DIR, "mean_auc_score.csv"))
DF_mcc.to_csv(os.path.join(RESULTS_DIR, "mean_mcc_score.csv"))
print("Execution done")

Execution 1 done
Execution 2 done
Execution 3 done
Execution 4 done
Execution 5 done
Execution 6 done
Execution 7 done
Execution 8 done
Execution 9 done
Execution 10 done
[[[0.7538461538461538, 0.7692307692307693, 0.6615384615384615, 0.8153846153846154, 0.8, 0.7846153846153846, 0.8, 0.7846153846153846], [0.765, 0.82, 0.83, 0.825, 0.82, 0.775, 0.815, 0.81], [0.8489208633093526, 0.7553956834532374, 0.8345323741007195, 0.7122302158273381, 0.9064748201438849, 0.7769784172661871, 0.8057553956834532, 0.7985611510791367], [0.7721179624664879, 0.7882037533512064, 0.8096514745308311, 0.7479892761394102, 0.8739946380697051, 0.7882037533512064, 0.8150134048257373, 0.8257372654155496], [0.7933333333333333, 0.8033333333333333, 0.8533333333333334, 0.7733333333333333, 0.8666666666666667, 0.7766666666666666, 0.8333333333333334, 0.8266666666666667], [0.896551724137931, 0.8647214854111406, 0.8806366047745358, 0.8620689655172413, 0.9124668435013262, 0.8594164456233422, 0.9071618037135278, 0.9045092838196