In [1]:
import os
import pandas as pd
import copy 
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

#metrics
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import geometric_mean_score
from hypopt import GridSearch
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
from sklearn import tree

from sklearn.model_selection import StratifiedKFold

import cleanlab
from cleanlab import Datalab
from cleanlab.classification import CleanLearning


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#Globals
# Experiment configuration: input/output paths, feature and target columns,
# the candidate models with their tuning grids, and the switches read by the
# main cell.

# --- Paths -------------------------------------------------------------------
DATA_PATH = '../data'
EXPERIMENT_NAME = 'Dataset'
EXPERIMENT_DATA_PATH = os.path.join(DATA_PATH, EXPERIMENT_NAME)
RESULTS_PATH = './results_hive_verif'
# Created eagerly so the final to_csv() cannot fail on a missing directory.
os.makedirs(RESULTS_PATH, exist_ok=True)

# --- Columns -----------------------------------------------------------------
# Predictor columns selected from every train/test CSV (order is preserved).
FEATURES = [
    'CountDeclMethodPrivate', 'AvgLineCode', 'CountLine',
    'MaxCyclomatic', 'CountDeclMethodDefault', 'AvgEssential',
    'CountDeclClassVariable', 'SumCyclomaticStrict', 'AvgCyclomatic',
    'AvgLine', 'CountDeclClassMethod', 'AvgLineComment',
    'AvgCyclomaticModified', 'CountDeclFunction', 'CountLineComment',
    'CountDeclClass', 'CountDeclMethod', 'SumCyclomaticModified',
    'CountLineCodeDecl', 'CountDeclMethodProtected',
    'CountDeclInstanceVariable', 'MaxCyclomaticStrict',
    'CountDeclMethodPublic', 'CountLineCodeExe', 'SumCyclomatic',
    'SumEssential', 'CountStmtDecl', 'CountLineCode', 'CountStmtExe',
    'RatioCommentToCode', 'CountLineBlank', 'CountStmt',
    'MaxCyclomaticModified', 'CountSemicolon', 'AvgLineBlank',
    'CountDeclInstanceMethod', 'AvgCyclomaticStrict',
    'PercentLackOfCohesion', 'MaxInheritanceTree', 'CountClassDerived',
    'CountClassCoupled', 'CountClassBase', 'CountInput_Max',
    'CountInput_Mean', 'CountInput_Min', 'CountOutput_Max',
    'CountOutput_Mean', 'CountOutput_Min', 'CountPath_Max',
    'CountPath_Mean', 'CountPath_Min', 'MaxNesting_Max', 'MaxNesting_Mean',
    'MaxNesting_Min', 'COMM', 'ADEV', 'DDEV', 'Added_lines', 'Del_lines',
    'OWN_LINE', 'OWN_COMMIT', 'MINOR_COMMIT', 'MINOR_LINE', 'MAJOR_COMMIT',
    'MAJOR_LINE'
]
# Label column.
TARGET = 'RealBug'

# --- Models ------------------------------------------------------------------
# Grid values shared by several models are declared once; each grid receives
# its own list copy so the grids stay independent of each other.
_CCP_ALPHAS = [0.0, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035]
_SPLIT_CRITERIA = ["gini", "entropy", "log_loss"]

# model name -> {'default': unfitted estimator, 'grid': GridSearchCV param_grid}
# An empty grid means the estimator is evaluated with its default parameters.
MODELS = {
    'RF' : {
        'default': RandomForestClassifier(n_jobs = -1),
        'grid': {
            'max_depth' : [3, 5, 10, None],
            'ccp_alpha' : list(_CCP_ALPHAS),
            'criterion' : list(_SPLIT_CRITERIA),
            'max_features' : ['sqrt', 'log2', None]
        }
    },
    'DT' : {
        'default': DecisionTreeClassifier(),
        'grid' : {
            'max_depth' : [5, 10, None],
            'ccp_alpha' : list(_CCP_ALPHAS),
            'criterion' : list(_SPLIT_CRITERIA)
        }
    },
    'NB' : {
        "default": GaussianNB(),
        'grid' : {}
    },
    'LR': {
        'default': LogisticRegression(n_jobs=-1),
        'grid': {
            'C' : [0.001, 0.01, 0.1, 1, 10, 100],
            'fit_intercept' : [True, False],
            'max_iter' :[100, 1000, 10000],
        }
    }
}

# Disabled candidate, kept for reference; move it into MODELS to re-enable.
# NOTE(review): the main cell calls predict_proba(), which SVC() only offers
# when built with probability=True -- presumably why it was parked; confirm.
#    'SVM' : {
#        'default': SVC(),
#        'grid': {
#            'C' :  [0.001, 0.01, 0.1, 1, 10, 100],
#            'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
#        }
#    },

# --- Switches ----------------------------------------------------------------
APPLY_SMOTE = True   # oversample the training split with SMOTE before fitting
TUNE=True            # grid-search each model's 'grid' instead of using defaults
# Name is misspelled ("SENSTIVE") but is referenced under this exact spelling
# by the main cell, so it is kept as is.
COST_SENSTIVE=False  # request class_weight='balanced' where the model supports it

In [7]:
#helpers 
def evaluate_model_predictions(y_true, y_pred, y_prob):
    """Compute binary-classification metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the confusion matrix is built for labels 0 and 1.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    y_prob : array-like
        Currently unused. Kept in the signature so existing callers that pass
        the predicted probabilities keep working.

    Returns
    -------
    dict
        Keys ``MCC``, ``G`` (geometric mean score), ``f1``, ``tpr``, ``tnr``,
        ``precision``, ``fpr``, ``fnr`` and the raw confusion-matrix counts
        ``tp``, ``tn``, ``fp``, ``fn``.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Each recall is computed once; the complementary rates are derived from
    # it instead of calling recall_score again for the same class.
    tpr = recall_score(y_true, y_pred, pos_label=1)
    tnr = recall_score(y_true, y_pred, pos_label=0)

    return {
        'MCC': matthews_corrcoef(y_true, y_pred),
        'G': geometric_mean_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'tpr': tpr,
        'tnr': tnr,
        'precision': precision_score(y_true, y_pred),
        'fpr': 1 - tnr,
        'fnr': 1 - tpr,
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
    }
def compute_complexity(clf):
    """Summarise the size of a fitted decision tree.

    Parameters
    ----------
    clf : object
        Anything exposing a ``tree_`` attribute with ``node_count``,
        ``children_left``, ``children_right`` and ``max_depth`` (e.g. a fitted
        ``DecisionTreeClassifier``).

    Returns
    -------
    dict
        ``total_node_number``: number of nodes in the tree,
        ``decision_nodes_count``: number of split (non-leaf) nodes,
        ``max_depth``: depth of the tree as reported by ``tree_.max_depth``.
    """
    fitted_tree = clf.tree_
    n_nodes = fitted_tree.node_count

    # A node whose left and right child ids are equal has no split, i.e. it is
    # a leaf. Comparing the two child arrays element-wise counts the leaves
    # without walking the tree node by node.
    left_ids = np.asarray(fitted_tree.children_left)
    right_ids = np.asarray(fitted_tree.children_right)
    num_leaves = int(np.count_nonzero(left_ids == right_ids))

    return {
        'total_node_number': n_nodes,
        'decision_nodes_count': n_nodes - num_leaves,
        'max_depth': fitted_tree.max_depth
    }
    

In [9]:
#main 
# For every run and every *train*.csv file: load the matching test file,
# optionally oversample the training split, fit (and optionally tune) each
# model from MODELS, evaluate on the test split and collect one result row.
# All rows are finally written to a single CSV under RESULTS_PATH.
n_runs = 3
results = []

# Loop-invariant tuning setup: 5-fold stratified CV, scored with MCC.
validation_schema = StratifiedKFold(n_splits=5)
scorer_mcc = make_scorer(matthews_corrcoef)

# os.listdir() returns entries in arbitrary order; sorting makes the
# processing order (and therefore the row order of the results) reproducible.
train_filenames = sorted(
    name for name in os.listdir(EXPERIMENT_DATA_PATH)
    if 'train' in name and '.csv' in name
)

for run_id in range(n_runs):
    print('run:', run_id)
    for filename in train_filenames:
        print('Working on file:', filename)
        # Swap only the LAST 'train' for 'test' so that a name which contains
        # 'train' elsewhere still maps to the right test file.
        test_filename = 'test'.join(filename.rsplit('train', 1))
        train_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, filename))
        test_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, test_filename))

        X_train, y_train = train_data[FEATURES], train_data[TARGET]
        X_test, y_test = test_data[FEATURES], test_data[TARGET]

        if APPLY_SMOTE:
            # NOTE(review): oversampling happens before GridSearchCV splits the
            # data, so the validation folds used for tuning contain synthetic
            # samples. The test split is untouched. Wrapping SMOTE and the
            # classifier in an imblearn Pipeline would resample inside each
            # fold instead -- left unchanged here because it alters results.
            sm = SMOTE(random_state=42)
            X_train, y_train = sm.fit_resample(X_train, y_train)

        for model_name, model_data in MODELS.items():
            print('*** Training model:', model_name)
            # Work on a private copy so the template stored in MODELS is
            # never fitted or reconfigured.
            clf_default = copy.deepcopy(model_data['default'])
            if COST_SENSTIVE and 'class_weight' in clf_default.get_params():
                clf_default.set_params(class_weight='balanced')

            if TUNE:
                print('tunning...')
                final_clf = GridSearchCV(
                    estimator=clf_default,
                    param_grid=model_data['grid'],
                    n_jobs=-1,
                    scoring=scorer_mcc,
                    cv=validation_schema,
                )
            else:
                final_clf = clf_default
            final_clf.fit(X_train, y_train)

            y_test_pred = final_clf.predict(X_test)
            # Mean of the predicted labels, printed as a fraction; the '%' in
            # the label is kept so existing logs stay comparable.
            print('predicted DR(%):', np.mean(y_test_pred))
            y_test_prob = final_clf.predict_proba(X_test)
            results_default = evaluate_model_predictions(y_test, y_test_pred, y_test_prob)

            # The row records the bare model name; whether tuning / SMOTE /
            # cost-sensitive training was used is encoded in the results file
            # name below.
            new_row = {
                'file_id': filename,
                'run_id': run_id,
                'model': model_name,
            }
            new_row.update(results_default)
            # For tuned tree models, compute_complexity(final_clf.best_estimator_)
            # could be merged into the row here as well.
            results.append(new_row)
            print(results_default)

final_results = pd.DataFrame(results)
final_results_name = EXPERIMENT_NAME+'_ML_results'
if TUNE:
    final_results_name += "_TUNED"
if APPLY_SMOTE:
    final_results_name += "_SMOTE"
if COST_SENSTIVE:
    final_results_name += "_COST_SENSTIVE"
final_results_name += '.csv'
final_results.to_csv(os.path.join(RESULTS_PATH, final_results_name), index=False)

run: 0
Working on file: activemq-5.0.0_train.csv
*** Training model: RF
tunning...
predicted DR(%): 0.4304568527918782
{'MCC': 0.09818602164765347, 'G': 0.5905117774243342, 'f1': 0.18363273453093815, 'tpr': 0.5974025974025974, 'tnr': 0.5837004405286343, 'precision': 0.10849056603773585, 'fpr': 0.41629955947136565, 'fnr': 0.4025974025974026, 'tp': 92, 'tn': 1060, 'fp': 756, 'fn': 62}
*** Training model: DT
tunning...
predicted DR(%): 0.8243654822335026
{'MCC': -0.11406247627642514, 'G': 0.3317754467698783, 'f1': 0.1169853768278965, 'tpr': 0.6753246753246753, 'tnr': 0.16299559471365638, 'precision': 0.06403940886699508, 'fpr': 0.8370044052863437, 'fnr': 0.3246753246753247, 'tp': 104, 'tn': 296, 'fp': 1520, 'fn': 50}
*** Training model: NB
tunning...
predicted DR(%): 0.17817258883248732
{'MCC': 0.29433100534650364, 'G': 0.6948438248683806, 'f1': 0.3445544554455445, 'tpr': 0.564935064935065, 'tnr': 0.8546255506607929, 'precision': 0.24786324786324787, 'fpr': 0.14537444933920707, 'fnr': 0.4