In [1]:
import os
import pandas as pd
import copy 

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

#metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import geometric_mean_score
from hypopt import GridSearch
from imblearn.over_sampling import SMOTE




In [27]:
#Globals
# Seed shared by the stochastic estimators below so that repeated runs of the
# notebook produce the same fitted models (and therefore the same metrics).
SEED = 42

# Input data: one sub-directory of DATA_PATH per experiment.
DATA_PATH = '../data'
EXPERIMENT_NAME = 'Dataset'
EXPERIMENT_DATA_PATH = os.path.join(DATA_PATH, EXPERIMENT_NAME)

# Output directory for the metrics CSV; created eagerly so the export at the
# end of the notebook cannot fail on a missing directory.
RESULTS_PATH = './results'
os.makedirs(RESULTS_PATH, exist_ok=True)

# Names of the columns read from every train/val/test CSV as model inputs.
FEATURES = [
    'CountDeclMethodPrivate', 'AvgLineCode', 'CountLine',
    'MaxCyclomatic', 'CountDeclMethodDefault', 'AvgEssential',
    'CountDeclClassVariable', 'SumCyclomaticStrict', 'AvgCyclomatic',
    'AvgLine', 'CountDeclClassMethod', 'AvgLineComment',
    'AvgCyclomaticModified', 'CountDeclFunction', 'CountLineComment',
    'CountDeclClass', 'CountDeclMethod', 'SumCyclomaticModified',
    'CountLineCodeDecl', 'CountDeclMethodProtected',
    'CountDeclInstanceVariable', 'MaxCyclomaticStrict',
    'CountDeclMethodPublic', 'CountLineCodeExe', 'SumCyclomatic',
    'SumEssential', 'CountStmtDecl', 'CountLineCode', 'CountStmtExe',
    'RatioCommentToCode', 'CountLineBlank', 'CountStmt',
    'MaxCyclomaticModified', 'CountSemicolon', 'AvgLineBlank',
    'CountDeclInstanceMethod', 'AvgCyclomaticStrict',
    'PercentLackOfCohesion', 'MaxInheritanceTree', 'CountClassDerived',
    'CountClassCoupled', 'CountClassBase', 'CountInput_Max',
    'CountInput_Mean', 'CountInput_Min', 'CountOutput_Max',
    'CountOutput_Mean', 'CountOutput_Min', 'CountPath_Max',
    'CountPath_Mean', 'CountPath_Min', 'MaxNesting_Max', 'MaxNesting_Mean',
    'MaxNesting_Min', 'COMM', 'ADEV', 'DDEV', 'Added_lines', 'Del_lines',
    'OWN_LINE', 'OWN_COMMIT', 'MINOR_COMMIT', 'MINOR_LINE', 'MAJOR_COMMIT',
    'MAJOR_LINE'
]
# Name of the label column read from every CSV.
TARGET = 'RealBug'

# Model registry: short name -> untrained estimator ('default') and the
# hyperparameter grid explored when TUNE is enabled ('grid').
MODELS = {
    'DT' : {
        # Seeded: tree construction is randomised (feature order on ties).
        'default': DecisionTreeClassifier(random_state=SEED),
        'grid' : {
            'max_depth' : [3, 5, 10 , None], 
            'ccp_alpha' : [0.0, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035],
            'criterion' : ["gini", "entropy", "log_loss"]
        }
    },
    'RF' : {
        # Seeded: bootstrap sampling and feature sub-sampling are random.
        'default': RandomForestClassifier(n_jobs = -1, random_state=SEED),
        'grid': {
            'max_depth' : [3, 5, 10, None], 
            'ccp_alpha' : [0.0, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035],
            'criterion' : ["gini", "entropy", "log_loss"], 
            'max_features' : ['sqrt', 'log2', None]
        }
    },
    
    'NB' : {
        "default": GaussianNB(), 
        # Empty grid: nothing is tuned for naive Bayes.
        'grid' : {
            
        }
    },
    'LR': {
        'default': LogisticRegression(n_jobs=-1),
        'grid': {
            'C' : [0.001, 0.01, 0.1, 1, 10, 100],
            'fit_intercept' : [True, False],
            'max_iter' :[100, 1000, 10000],
        }
    }
}
# Experiment switches: oversample the training split with SMOTE, and/or run a
# grid search over each model's 'grid' before evaluation.
APPLY_SMOTE = True
TUNE=True
FINAL_RESULTS_PATH = '../results'

In [25]:
#helpers 
def evaluate_model_predictions(y_true, y_pred, y_prob): 
    """Compute a dictionary of binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the confusion matrix is built for labels 0 and 1.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    y_prob : array-like
        Accepted for call-site compatibility; not used by any metric here.

    Returns
    -------
    dict
        Keys, in order: 'MCC', 'G', 'f1', 'tpr', 'tnr', 'precision', 'fpr',
        'fnr', 'tp', 'tn', 'fp', 'fn'.
    """
    # labels=[0, 1] pins the matrix layout, so the flattened cells always
    # unpack as tn, fp, fn, tp.
    cm_cells = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    tn, fp, fn, tp = cm_cells

    # Score-based metrics first, in the order they appear in the output.
    scores = dict(
        MCC=matthews_corrcoef(y_true, y_pred),
        G=geometric_mean_score(y_true, y_pred),
        f1=f1_score(y_true, y_pred),
        tpr=recall_score(y_true, y_pred, pos_label=1),
        tnr=recall_score(y_true, y_pred, pos_label=0),
        precision=precision_score(y_true, y_pred),
        # Error rates are the complements of the per-class recalls.
        fpr=1 - recall_score(y_true, y_pred, pos_label=0),
        fnr=1 - recall_score(y_true, y_pred, pos_label=1),
    )
    # Raw confusion-matrix counts go last.
    scores.update(tp=tp, tn=tn, fp=fp, fn=fn)
    return scores

In [26]:
#main 
# For every training CSV in EXPERIMENT_DATA_PATH: load the matching
# train/val/test splits, optionally oversample the training split, fit (and
# optionally tune) each model in MODELS, and record test-set metrics.
results = []
# Sorted so the row order of the results file does not depend on the
# arbitrary order returned by os.listdir.
for filename in sorted(os.listdir(EXPERIMENT_DATA_PATH)):
    # Only training CSVs drive the loop; the val/test file names are derived
    # from the training file name below.
    if 'train' not in filename or not filename.endswith('.csv'):
        continue
    print('Working on file:', filename)
    train_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, filename))
    val_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, filename.replace('train', 'val')))
    test_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, filename.replace('train', 'test')))

    X_train, y_train = train_data[FEATURES], train_data[TARGET]
    X_val, y_val = val_data[FEATURES], val_data[TARGET]
    X_test, y_test = test_data[FEATURES], test_data[TARGET]

    if APPLY_SMOTE:
        # Oversample the training split only; validation and test data are
        # left untouched.
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    for model_name, model_data in MODELS.items():
        print('*** Training model:', model_name)
        # Work on a copy so the estimator stored in MODELS is never fitted.
        clf_default = copy.deepcopy(model_data['default'])
        clf_params_grid = model_data['grid']

        if TUNE:
            print('tunning...')
            # Grid search scored on the held-out validation split.
            final_clf = GridSearch(model = clf_default, param_grid=clf_params_grid, parallelize=False)
            final_clf.fit(X_train, y_train, X_val, y_val, scoring = 'roc_auc')
        else:
            final_clf = clf_default
            final_clf.fit(X_train, y_train)

        # Evaluate the model that was actually selected above. Previously the
        # untuned estimator was re-fitted and evaluated here, so the tuning
        # step had no effect on the reported metrics.
        y_test_pred = final_clf.predict(X_test)
        y_test_prob = final_clf.predict_proba(X_test)
        model_metrics = evaluate_model_predictions(y_test, y_test_pred, y_test_prob)

        # The tuning/SMOTE configuration is encoded in the results file name,
        # so the 'model' column carries only the base model name.
        new_row = {
            'file_id': filename,
            'model': model_name
        }
        new_row.update(model_metrics)
        results.append(new_row)

final_results = pd.DataFrame(results)

# Build the output file name from the experiment name and the active switches.
final_results_name = EXPERIMENT_NAME+'_ML_results'
if TUNE:
    final_results_name += "_TUNED"
if APPLY_SMOTE:
    final_results_name += "_SMOTE"
final_results_name += '.csv'

final_results.to_csv(os.path.join(RESULTS_PATH, final_results_name), index=False)

Working on file: activemq_train_0_first_stratify.csv
*** Training model: DT
tunning...
*** Training model: RF
tunning...
*** Training model: NB
tunning...
*** Training model: LR
tunning...
Working on file: activemq_train_1_first_stratify.csv
*** Training model: DT
tunning...
*** Training model: RF
tunning...
*** Training model: NB
tunning...
*** Training model: LR
tunning...
Working on file: activemq_train_2_first_stratify.csv
*** Training model: DT
tunning...
*** Training model: RF
tunning...
*** Training model: NB
tunning...
*** Training model: LR
tunning...
Working on file: activemq_train_3_first_stratify.csv
*** Training model: DT
tunning...
*** Training model: RF
tunning...
*** Training model: NB
tunning...
*** Training model: LR
tunning...
Working on file: activemq_train_4_first_stratify.csv
*** Training model: DT
tunning...
*** Training model: RF
tunning...
*** Training model: NB
tunning...
*** Training model: LR
tunning...
Working on file: activemq_train_5_first_stratify.csv
**