In [4]:
from ensemble_models import bagging
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from scipy.io import arff
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import pandas as pd
import numpy as np
def main(data,name):
    """Train and evaluate the bagging ensemble on a single defect dataset.

    Parameters
    ----------
    data : tuple
        Value returned by ``scipy.io.arff.loadarff``; only ``data[0]`` (the
        records) is used.
    name : str
        Dataset identifier. It selects the label column: names containing
        ``"AEEEM"`` use ``class`` (``buggy``/``clean``), names containing
        ``"JIRA"`` use ``RealBugCount``, anything else uses ``defects``.
        Count columns are binarized as ``count > 0``.

    Returns
    -------
    dict
        ``{"acc", "f1", "auc", "mcc"}`` copied from the ``"Accuracy"``,
        ``"F1_score"``, ``"AUC_score"`` and ``"MCC"`` entries of the
        ``bagging`` result.
    """
    # Data preprocessing
    df = pd.DataFrame(data[0])

    if "AEEEM" in name:
        label_col = 'class'
        # ARFF nominal values are loaded as bytes, hence the decode step.
        # NOTE(review): labels other than 'buggy'/'clean' map to NaN.
        y = df[label_col].str.decode("utf-8").map({'buggy': 1, 'clean': 0})
    else:
        label_col = 'RealBugCount' if "JIRA" in name else 'defects'
        y = (df[label_col] > 0).astype(int)

    # Remove the label by name rather than by position: slicing off "the last
    # column" leaks the label into the features whenever it is stored elsewhere.
    # NOTE(review): any other bug-derived columns in the file are still kept
    # as features -- confirm against the dataset schema.
    X = df.drop(columns=[label_col]).values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # Scaler and feature selector are fitted on the training split only and
    # then applied unchanged to the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # L1-penalized logistic regression drives the feature selection.
    selector = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
    selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)

    # Oversample the training split only; the test split is left as-is.
    # SMOTE is unseeded, so repeated calls give different resamples.
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    # Models
    bag = bagging(X_train, y_train, X_test, y_test)

    # Results
    return {
        "acc": bag["Accuracy"],
        "f1": bag["F1_score"],
        "auc": bag["AUC_score"],
        "mcc": bag["MCC"],
    }

In [5]:
# Benchmark registry: suite -> [(ARFF file name, short dataset label), ...].
# Each entry becomes {"path": "../dataSet/<suite>/<file>", "name": "<suite>-<label>"}.
_DATASET_FILES = {
    "AEEEM": [
        ("EQ.arff", "EQ"),
        ("JDT.arff", "JDT"),
        ("Lucene.arff", "LUCENE"),
        ("Mylyn.arff", "MYLYN"),
        ("PDE.arff", "PDE"),
    ],
    "JIRA": [
        ("activemq-5.0.0.arff", "ACTIVEMQ"),
        ("derby-10.5.1.1.arff", "DERBY"),
        ("groovy-1_6_BETA_1.arff", "GROOVY"),
        ("hbase-0.94.0.arff", "HBASE"),
        ("hive-0.9.0.arff", "HIVE"),
        ("jruby-1.1.arff", "JRUBY"),
        ("wicket-1.3.0-beta2.arff", "WICKET"),
    ],
    "PROMISE": [
        ("ant-1.7.arff", "ANT"),
        ("camel-1.4.arff", "CAMEL"),
        ("ivy-2.0.arff", "IVY"),
        ("jedit-4.0.arff", "JEDIT"),
        ("log4j-1.0.arff", "LOG"),
        ("poi-2.0.arff", "POI"),
        ("tomcat.arff", "TOMCAT"),
        ("velocity-1.6.arff", "VELOCITY"),
        ("xalan-2.4.arff", "XALAN"),
        ("xerces-1.3.arff", "XERCES"),
    ],
}

# Flatten in suite order so the list matches the evaluation order used downstream.
datasets = [
    {"path": "../dataSet/" + suite + "/" + filename, "name": suite + "-" + label}
    for suite, files in _DATASET_FILES.items()
    for filename, label in files
]

In [7]:
# Number of times the whole benchmark is repeated; per-dataset scores are
# averaged over these repetitions.
N_RUNS = 10

# Parse every ARFF file once up front. Reading the files is loop-invariant,
# so there is no need to redo it on each repetition.
loaded_datasets = [
    (entry["name"], arff.loadarff(entry["path"])) for entry in datasets
]

# One inner list per repetition; each inner list holds one score per dataset,
# in the same order as `datasets`.
acc_all = []
f1_all = []
auc_all = []
mcc_all = []
for _ in range(N_RUNS):
    acc, f1, auc, mcc = [], [], [], []
    for dataset_name, data in loaded_datasets:
        res = main(data, dataset_name)
        acc.append(res["acc"])
        f1.append(res["f1"])
        auc.append(res["auc"])
        mcc.append(res["mcc"])
    acc_all.append(acc)
    f1_all.append(f1)
    auc_all.append(auc)
    mcc_all.append(mcc)

# Mean accuracy per dataset across repetitions. zip(*acc_all) regroups the
# scores by dataset, so this follows len(datasets) instead of a hard-coded count.
# float() keeps the printed list as plain numbers regardless of how the
# installed NumPy version renders its scalar types inside a list.
finalacc = [float(np.mean(run_scores)) for run_scores in zip(*acc_all)]
print(finalacc)
# Overall mean of the per-dataset mean accuracies.
print(np.mean(finalacc))

[0.716923076923077, 0.8004999999999999, 0.8266187050359711, 0.8407506702412869, 0.8196666666666668, 0.8681697612732096, 0.8491682070240296, 0.8909090909090909, 0.8122641509433961, 0.8524647887323944, 0.8911564625850339, 0.875070821529745, 0.7691275167785235, 0.7205714285714285, 0.8140845070422535, 0.7532258064516129, 0.7666666666666667, 0.73015873015873, 0.8517441860465116, 0.6434782608695653, 0.8082758620689656, 0.7802197802197803]
0.8036915975789973
