In [1]:
!pip install tabulate xgboost 



In [2]:
import pandas as pd
import numpy as np
import math
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from tabulate import tabulate
from imblearn.over_sampling import ADASYN
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, matthews_corrcoef as mcc_score,  roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn import svm

## Data Processing

In [3]:
def process_aeeem_dataset(df):
    """Prepare a raw AEEEM frame for binary defect prediction.

    The caller's frame is never modified: all work happens on a copy, so the
    function can be re-run on the same input and give the same result.

    Steps:
      1. Strip surrounding whitespace from every column name.
      2. Rename the last column to 'temp' and drop it, together with
         'classname' and the alternative bug-count columns that are present.
         (Presumably the last column is the empty one produced by a trailing
         ';' in the AEEEM CSV files -- confirm against the data.)
      3. Binarise 'bugs': every positive count becomes 1.

    Args:
        df: DataFrame with string column names that contains a 'bugs' column
            once names are stripped.

    Returns:
        A new DataFrame holding the remaining columns and the 0/1 'bugs' label.

    Raises:
        KeyError: if no 'bugs' column is left after the clean-up.
    """
    # Work on a copy so neither the column labels nor the values of the
    # caller's frame change.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()
    cleaned = cleaned.rename(columns={cleaned.columns[-1]: 'temp'})

    columns_to_drop = ['temp', 'classname', 'nonTrivialBugs', 'majorBugs', 'criticalBugs', 'highPriorityBugs']
    # Only drop what actually exists; not every file has every column.
    columns_to_drop = [col for col in columns_to_drop if col in cleaned.columns]
    cleaned = cleaned.drop(columns=columns_to_drop)

    cleaned.loc[cleaned['bugs'] > 0, 'bugs'] = 1
    return cleaned

def process_jira_dataset(df):
    """Prepare a raw JIRA frame for binary defect prediction.

    Drops the bookkeeping columns 'File', 'RealBug', 'HeuBug' and
    'HeuBugCount' (whichever of them are present) and binarises
    'RealBugCount' so that every positive count becomes 1.

    Args:
        df: DataFrame containing a 'RealBugCount' column.

    Returns:
        The frame without the dropped columns and with a 0/1 'RealBugCount'.

    Raises:
        KeyError: if 'RealBugCount' is missing.
    """
    columns_to_drop = ['File', 'RealBug', 'HeuBug', 'HeuBugCount']
    # Tolerate files that lack some of these columns, in line with the
    # AEEEM and PROMISE loaders, instead of failing with a KeyError.
    columns_to_drop = [col for col in columns_to_drop if col in df.columns]
    df = df.drop(columns=columns_to_drop)
    df.loc[df['RealBugCount'] > 0, 'RealBugCount'] = 1
    return df

def process_promise_ck_dataset(df):
    """Prepare a raw PROMISE (CK metrics) frame for binary defect prediction.

    Removes the 'Name' and 'version' columns when they exist and turns the
    'bug' count into a 0/1 label (any positive count becomes 1).

    Args:
        df: DataFrame containing a 'bug' column.

    Returns:
        The frame without the identifier columns and with a 0/1 'bug' column.
    """
    present = [name for name in ('Name', 'version') if name in df.columns]
    trimmed = df.drop(columns=present)

    is_defective = trimmed['bug'] > 0
    trimmed.loc[is_defective, 'bug'] = 1
    return trimmed

In [4]:
def normalize_dataset(df, n):
    """Min-max scale every column onto the integer levels 0 .. n-1.

    Each column is scaled independently as
    ``round((x - min) / (max - min) * (n - 1))`` using numpy rounding
    (halves go to the nearest even level). The levels are returned as floats.

    A constant column has ``max == min``; the formula would yield 0/0 = NaN,
    which cannot be converted to an integer afterwards. Such a column is
    mapped to level 0 instead.

    Args:
        df: DataFrame of numeric columns.
        n: Number of discrete levels (for example ``1 << num_bits``).

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    normalized_columns = {}
    for column in df.columns:
        feature_values = df[column]
        min_value = feature_values.min()
        value_range = feature_values.max() - min_value
        if value_range == 0:
            # Constant column: every value sits on the lowest level.
            normalized_columns[column] = np.zeros(len(feature_values))
        else:
            scaled = np.round((feature_values - min_value) / value_range * (n - 1))
            normalized_columns[column] = scaled.to_numpy()
    # Plain arrays are used so that no index alignment happens on construction.
    return pd.DataFrame(normalized_columns, index=df.index, columns=df.columns)

In [5]:
def convert_binary(df, num_bits):
    """Expand every feature column into one 0/1 column per binary digit.

    Every column except the last is formatted as a zero-padded binary string
    of at least ``num_bits`` digits; digit ``k`` (most significant first)
    becomes the column ``'<feature>_bit_<k>'``. The last column of ``df`` is
    appended unchanged as the final column of the result.

    Args:
        df: DataFrame whose feature values can be passed to ``int()``.
        num_bits: Minimum number of binary digits per feature.

    Returns:
        DataFrame with the bit columns followed by the last column of ``df``.
    """
    def to_bit_strings(values):
        # Zero-padded binary text, e.g. 5 -> '0101' when num_bits is 4.
        return values.apply(lambda value: format(int(value), f'0{num_bits}b'))

    # Seed with an empty frame on the source index, then concatenate once.
    pieces = [pd.DataFrame(index=df.index)]

    for feature in df.columns[:-1]:
        bit_strings = to_bit_strings(df[feature])
        # The width is taken from the first row, never less than num_bits.
        width = max(len(bit_strings.iloc[0]), num_bits)

        bit_frame = pd.DataFrame(bit_strings.apply(lambda text: pd.Series(list(text)).astype(int)))
        bit_frame = bit_frame.iloc[:, :width]
        bit_frame.columns = [f'{feature}_bit_{k}' for k in range(width)]
        pieces.append(bit_frame)

    pieces.append(df[df.columns[-1]])
    return pd.concat(pieces, axis=1)

## Feature selection

In [6]:
def get_ATE_learner(df, col_name, random_state=42):
    """Estimate the effect of one binary column on a decision tree's output.

    A decision tree is fitted on all columns but the last (features) against
    the last column (label). The fitted tree then predicts twice, once with
    ``col_name`` forced to 1 and once with it forced to 0; the mean difference
    of the two prediction vectors is returned.

    Args:
        df: DataFrame whose last column is the label.
        col_name: Feature column to intervene on.
        random_state: Seed for the tree. Without a seed, ties between equally
            good splits are broken randomly, so repeated calls on the same
            data could rank features differently.

    Returns:
        Mean of ``predict(col=1) - predict(col=0)`` over all rows.
    """
    X = df.drop(columns=[df.columns[-1]])
    y = df[df.columns[-1]]

    model = tree.DecisionTreeClassifier(random_state=random_state)
    model.fit(X, y)

    # Two counterfactual copies of the features that differ only in col_name.
    treated = X.copy()
    treated[col_name] = 1
    control = X.copy()
    control[col_name] = 0

    ate_est = np.mean(model.predict(treated) - model.predict(control))
    return ate_est

def ate_based_feature_learner(df, num_of_bits, num_of_feat):
    """Rank features by summed per-bit effect and return the top positions.

    The frame is quantised to ``2 ** num_of_bits`` levels, each feature is
    expanded into ``num_of_bits`` binary columns, and the effect estimate of
    every bit column (``get_ATE_learner``) is summed per original feature.
    Features are ranked by that signed sum, largest first.

    Args:
        df: DataFrame whose last column is the label.
        num_of_bits: Number of binary columns generated per feature.
        num_of_feat: Number of top-ranked features to return.

    Returns:
        List of integer column positions in ``df`` of the selected features,
        best first.
    """
    norm_df = normalize_dataset(df, 1 << num_of_bits)
    binary_df = convert_binary(norm_df, num_of_bits)

    # Normalisation rescaled the label as well; bring it back to 0/1.
    label = binary_df.columns[-1]
    binary_df.loc[binary_df[label] > 0, label] = 1

    # Feature k owns the k-th run of num_of_bits columns in binary_df, so the
    # position is tracked directly rather than recovered from the generated
    # '<name>_bit_0' labels (str.replace would mangle a feature whose own
    # name contains '_bit_0').
    feature_scores = {}
    for position in range(len(df.columns) - 1):
        start = position * num_of_bits
        bit_columns = binary_df.columns[start:start + num_of_bits]
        feature_scores[position] = sum(
            get_ATE_learner(binary_df, bit_column) for bit_column in bit_columns
        )

    ranked = sorted(feature_scores.items(), key=lambda item: item[1], reverse=True)
    indices = [position for position, _ in ranked[:num_of_feat]]
    return indices

In [7]:
# Registry of feature-selection strategies: display name -> callable taking
# (train_df, num_of_bits, num_of_feat) and returning selected column positions.
feature_algo = {'CFS-Learner': ate_based_feature_learner}

## Models

In [8]:
# Classifiers evaluated by build_model, keyed by display name.
# Uncomment an entry to include that model in the run.
models_list = {
#     "KNN": KNeighborsClassifier(),
#     "Decision Tree": tree.DecisionTreeClassifier(),
#     "XgBoost": XGBClassifier(),
#     "Random Forest": RandomForestClassifier(),
#     "Logistic Regression": LogisticRegression(),
    "SVM":  svm.SVC(kernel='rbf')
}

# Hyper-parameter search spaces used by get_best_model_params, keyed like
# models_list. Every model that can be enabled above needs an entry here and
# in no_of_iter_for_cv, otherwise the tuning step fails with a KeyError.
models_search_params = {
    "KNN": {
        'n_neighbors' : [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31],
        'weights' : ['uniform','distance'],
        'metric' : ['minkowski','euclidean','manhattan']
     },
    "Logistic Regression": {
        'penalty' : ['l1', 'l2', 'elasticnet'],
        'C' : [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
        'solver' : ['lbfgs','newton-cg','liblinear', 'newton-cholesky', 'sag', 'saga'],
        'max_iter' : [100, 1500, 3000]
     },
    "Decision Tree": {
        'max_features': [1, 2, 3, 5, 7, 10, 'log2','sqrt', None],
        'max_depth': [2, 3, 5, 7, 10, 20, 30, 40, 50, 60, 70, None],
        'min_samples_split': [2, 3, 5, 7, 9, 10, 0.1, 0.2, 0.3],
        'min_samples_leaf': [1, 2, 3, 5, 7, 9, 10, 0.1, 0.2],
     },
    "Random Forest": {
        'max_depth': [2, 5, 10, None],
        'max_features': ['log2', 'sqrt', None],
    },
    "XgBoost": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 6, 10],
        'subsample': [0.5, 0.7, 1],
        'n_estimators': [100, 500]
    },
    # Search space for the RBF-kernel SVM enabled in models_list.
    "SVM": {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.01, 0.001],
    },
}

# Number of random-search candidates drawn per model.
no_of_iter_for_cv = {
    "KNN": 30,
    "Decision Tree": 30,
    "Logistic Regression": 15,
    "Random Forest": 5,
    "XgBoost": 10,
    "SVM": 10,
}

## Metrics

In [9]:
def g_measure_score(y_test, y_pred):
    """Harmonic mean of recall and (1 - false-positive rate).

    The confusion matrix is unpacked as TN, FP, FN, TP, so binary labels are
    expected. Recall is the macro-averaged recall returned by scikit-learn.

    NOTE(review): the usual G-measure definition pairs the recall of the
    positive class (PD) with 1 - PF; macro recall is kept here so the reported
    numbers do not change -- confirm that this is intended.

    Args:
        y_test: True binary labels.
        y_pred: Predicted binary labels.

    Returns:
        The G-measure, or 0.0 when recall and 1 - FPR are both zero (the
        harmonic mean would otherwise be 0/0).
    """
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred).ravel()
    FPR = FP/(FP+TN)
    recall = recall_score(y_test, y_pred, average='macro')

    true_negative_rate = 1 - FPR
    denominator = recall + true_negative_rate
    if denominator == 0:
        # Every prediction was wrong; score the fold as 0 rather than
        # dividing by zero.
        return 0.0
    g_measure = (2*recall*true_negative_rate)/denominator
    return g_measure

def bal_score(y_test, y_pred):
    """Balance: 1 minus the scaled distance from (PF, PD) to the ideal (0, 1).

    PF is the false-positive rate taken from the confusion matrix (unpacked
    as TN, FP, FN, TP, so binary labels are expected); PD is the
    macro-averaged recall. The Euclidean distance is divided by sqrt(2).

    Args:
        y_test: True binary labels.
        y_pred: Predicted binary labels.

    Returns:
        The balance score.
    """
    TN, FP, FN, TP = confusion_matrix(y_test, y_pred).ravel()
    PD = recall_score(y_test, y_pred, average='macro')
    PF = FP/(FP+TN)

    distance_to_ideal = math.sqrt((1-PD)*(1-PD)+(0-PF)*(0-PF))
    return 1 - (distance_to_ideal/math.sqrt(2))

In [10]:
def get_best_model_params(X_train, y_train, model):
    """Tune one classifier with a randomised search and return its best setup.

    The estimator, its search space and the number of sampled candidates are
    looked up by name in ``models_list``, ``models_search_params`` and
    ``no_of_iter_for_cv``. Candidates are compared by ROC-AUC over a
    stratified, shuffled 5-fold split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        model: Key identifying the classifier in the three lookup tables.

    Returns:
        Dict of the best parameter values found (``best_params_``).

    Raises:
        KeyError: if ``model`` is missing from any of the lookup tables.
    """
    classifier = models_list[model]
    cv = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Seed the candidate sampling as well: with only the splitter seeded, two
    # runs would still draw different parameter combinations.
    grid_obj = RandomizedSearchCV(
        classifier,
        models_search_params[model],
        n_iter=no_of_iter_for_cv[model],
        cv=cv,
        scoring='roc_auc',
        random_state=42,
    )
    grid_obj.fit(X_train, y_train)
    best_model_params = grid_obj.best_params_
    return best_model_params

In [11]:
def build_model(file_name, X, y, model, feat_algo, col_list, num_of_bits, num_of_feat):
    """Cross-validate one classifier on one dataset and return a result row.

    For each of 10 stratified, shuffled folds the training part is
    oversampled with ADASYN, optionally reduced to the selected features,
    used to fit ``models_list[model]``, and scored on the untouched test part.

    NOTE(review): features are selected once, on the first fold's oversampled
    training data, and reused for every later fold. That keeps the run fast
    but means later test folds overlap with the data used for selection.
    NOTE(review): ROC-AUC is computed from hard class predictions, not from
    scores or probabilities.
    Hyper-parameter tuning through ``get_best_model_params`` is not applied
    here; the classifier runs with the settings given in ``models_list``.

    Args:
        file_name: Dataset name, echoed in the result row.
        X: 2-D numpy array of features.
        y: 1-D numpy array of binary labels.
        model: Key into ``models_list``.
        feat_algo: Key into ``feature_algo``, or the string 'None' to skip
            feature selection.
        col_list: Column names for the training frame (features plus label).
        num_of_bits: Passed to the feature-selection algorithm.
        num_of_feat: Number of features to select.

    Returns:
        A 12-element list matching the results table header:
        [dataset, feat_algo, no_of_bits, no_of_feat, acc, prec, recall, f1,
        mcc, roc_auc, g-m, bal]. Metrics are fold means rounded to 2
        decimals, or '--' when feature selection returned nothing.
    """
    metric_names = ('acc', 'prec', 'recall', 'f1', 'mcc', 'roc_auc', 'g-m', 'bal')
    fold_scores = {name: [] for name in metric_names}
    features = []

    kf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Oversample the training fold only; seeded so the synthetic samples,
        # and therefore the reported scores, are reproducible.
        adasyn = ADASYN(random_state=42)
        X_train, y_train = adasyn.fit_resample(X_train, y_train)

        if feat_algo != 'None':
            if len(features) < 1:
                train_df = pd.concat([pd.DataFrame(X_train), pd.Series(y_train, name='bug')], axis=1)
                train_df.columns = col_list
                features = feature_algo[feat_algo](train_df, num_of_bits, num_of_feat)

            if len(features) < 1:
                # Nothing was selected: return a placeholder row with as many
                # cells as the table header so the table stays aligned.
                return [file_name, feat_algo, num_of_bits, num_of_feat] + ['--'] * len(metric_names)

            X_train = X_train[:, features]
            X_test = X_test[:, features]

        classifier = models_list[model]
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        fold_scores['acc'].append(accuracy_score(y_test, y_pred))
        fold_scores['prec'].append(precision_score(y_test, y_pred, average='macro'))
        fold_scores['recall'].append(recall_score(y_test, y_pred, average='macro'))
        fold_scores['f1'].append(f1_score(y_test, y_pred, average='macro'))
        fold_scores['mcc'].append(mcc_score(y_test, y_pred))
        fold_scores['roc_auc'].append(roc_auc_score(y_test, y_pred))
        fold_scores['g-m'].append(g_measure_score(y_test, y_pred))
        fold_scores['bal'].append(bal_score(y_test, y_pred))

    averaged = [round(np.mean(fold_scores[name]), 2) for name in metric_names]
    return [file_name, feat_algo, num_of_bits, num_of_feat] + averaged

In [12]:
def get_list_of_csv(folder_path):
    """Return the names (not full paths) of entries in ``folder_path`` that end in '.csv'.

    The order is whatever ``os.listdir`` yields.
    """
    csv_files = []
    for entry in os.listdir(folder_path):
        if entry.endswith('.csv'):
            csv_files.append(entry)
    return csv_files

## JIRA

In [13]:
# JIRA experiment: list the CSV files to evaluate in file_list.
# With an empty list this cell does nothing.
file_list = []
folder_path = '/kaggle/input/defect-prediction/JIRA-defect-dataset/'

for file_name in file_list:
    df = process_jira_dataset(pd.read_csv(f'{folder_path}{file_name}'))
    # Columns holding a single distinct value are removed.
    df = df.loc[:, df.nunique() != 1]

    # The last column is the label; everything before it is a feature.
    X = df.drop(columns=[df.columns[-1]]).values
    y = df[df.columns[-1]].values

    for model in models_list:
        print('----------------', model ,'---------------------------')
        table = [["dataset", "feat_algo", "no_of_bits", "no_of_feat", "acc", "prec", "recall", "f1", "mcc", "roc_auc", "g-m", "bal"]]

        # Sweep the quantisation width: 6, 8, 10, 12 and 14 bits.
        for num_of_bits in range(6, 15, 2):
            num_of_feat = 6
            for feat_algo in feature_algo:
                row = build_model(file_name, X, y, model, feat_algo, df.columns, num_of_bits, num_of_feat)
                table.append(row)

        print(tabulate(table))

## TERA-PROMISE-ck

In [14]:
# TERA-PROMISE (CK metrics) experiment on the files named in file_list.
file_list = ['prop-5.csv']
folder_path = '/kaggle/input/defect-prediction/TeraPromise-defect-dataset/ck/'

for file_name in file_list:
    df = process_promise_ck_dataset(pd.read_csv(f'{folder_path}{file_name}'))
    # Columns holding a single distinct value are removed.
    df = df.loc[:, df.nunique() != 1]

    # The last column is the label; everything before it is a feature.
    X = df.drop(columns=[df.columns[-1]]).values
    y = df[df.columns[-1]].values

    for model in models_list:
        print('----------------', model ,'---------------------------')
        table = [["dataset", "feat_algo", "no_of_bits", "no_of_feat", "acc", "prec", "recall", "f1", "mcc", "roc_auc", "g-m", "bal"]]

        # Sweep the quantisation width: 6, 8, 10, 12 and 14 bits.
        for num_of_bits in range(6, 15, 2):
            num_of_feat = 4
            for feat_algo in feature_algo:
                row = build_model(file_name, X, y, model, feat_algo, df.columns, num_of_bits, num_of_feat)
                table.append(row)

        print(tabulate(table))

---------------- SVM ---------------------------
----------  -----------  ----------  ----------  ----  ----  ------  ----  ----  -------  ----  ----
dataset     feat_algo    no_of_bits  no_of_feat  acc   prec  recall  f1    mcc   roc_auc  g-m   bal
prop-5.csv  CFS-Learner  6           4           0.61  0.57  0.62    0.53  0.18  0.62     0.61  0.61
prop-5.csv  CFS-Learner  8           4           0.52  0.56  0.61    0.48  0.16  0.61     0.54  0.54
prop-5.csv  CFS-Learner  10          4           0.62  0.57  0.63    0.54  0.19  0.63     0.62  0.62
prop-5.csv  CFS-Learner  12          4           0.59  0.56  0.62    0.52  0.17  0.62     0.6   0.6
prop-5.csv  CFS-Learner  14          4           0.58  0.56  0.62    0.51  0.18  0.62     0.59  0.59
----------  -----------  ----------  ----------  ----  ----  ------  ----  ----  -------  ----  ----


## AEEEM 

In [15]:
# AEEEM experiment: list the CSV files to evaluate in file_list.
# With an empty list this cell does nothing.
file_list = []
folder_path = '/kaggle/input/defect-prediction/AEEEM-defect-dataset/'

for file_name in file_list:
    # These files are read with ';' as the field delimiter.
    df = process_aeeem_dataset(pd.read_csv(f'{folder_path}{file_name}', delimiter=';'))
    # Columns holding a single distinct value are removed.
    df = df.loc[:, df.nunique() != 1]

    # The last column is the label; everything before it is a feature.
    X = df.drop(columns=[df.columns[-1]]).values
    y = df[df.columns[-1]].values

    for model in models_list:
        print('----------------', model ,'---------------------------')
        table = [["dataset", "feat_algo", "no_of_bits", "no_of_feat", "acc", "prec", "recall", "f1", "mcc", "roc_auc", "g-m", "bal"]]

        # Sweep the quantisation width: 6, 8, 10, 12 and 14 bits.
        for num_of_bits in range(6, 15, 2):
            num_of_feat = 4
            for feat_algo in feature_algo:
                row = build_model(file_name, X, y, model, feat_algo, df.columns, num_of_bits, num_of_feat)
                table.append(row)

        print(tabulate(table))