In [None]:
import os
import copy
import string
import random 
import joblib 


import numpy as np 
import pandas as pd 
from lightgbm import LGBMClassifier

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from hyperopt import fmin, tpe, rand, hp, STATUS_OK, space_eval
from sklearn.model_selection import StratifiedKFold

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.metrics import matthews_corrcoef, make_scorer, confusion_matrix, f1_score, recall_score, precision_score, roc_auc_score, accuracy_score, balanced_accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV


from imblearn.metrics import geometric_mean_score

import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
#globals
# Input data is read from DATA_PATH/VALIDATION_PROCESS/DATASET; every artefact of
# a run is written below RESULTS_PATH/experiment_<hash>.
DATA_PATH = '../studied data'
VALIDATION_PROCESS = "CRDP"
DATASET = "jira"
RESULTS_PATH = '../results'

# Seed shared by the estimators, the CV splitter and the hyperparameter search.
RANDOM_STATE = 42
# Number of times the whole search is repeated for each training file.
NRUNS = 10
# Number of cross-validation folds.
CV = 5
# Evaluation budget of one hyperparameter search.
MAX_EVAL = 150
# The splitter is built from CV so the recorded fold count and the folds that are
# actually used cannot drift apart (it used to hard-code n_splits=5).
KFOLD = StratifiedKFold(n_splits=CV, shuffle=True, random_state=RANDOM_STATE)
# Per-model configuration: 'default' is the base estimator that gets tuned and
# 'grid' is its search space, expressed with skopt dimensions.
# The key order of 'grid' is kept on purpose: it defines the order of the
# search-space dimensions.
PARAMETERS_GRID = {
    'DT': {
        "default": DecisionTreeClassifier(class_weight='balanced', random_state = RANDOM_STATE),
        'grid': dict(
            criterion=Categorical(['gini', 'entropy', 'log_loss']),
            splitter=Categorical(['best', 'random']),
            max_features=Categorical(['sqrt', 'log2', None]),
            min_samples_split=Integer(2, 20),
            min_samples_leaf=Integer(1, 8),
            # depths 2..15, plus None for an unbounded tree
            max_depth=Categorical(list(range(2, 16)) + [None]),
            ccp_alpha=Real(0.0, 1.0),
            # lower bound on the weighted fraction of samples in a leaf
            min_weight_fraction_leaf=Real(0.0, 0.5),
            # cap on the number of leaves
            max_leaf_nodes=Integer(2, 100),
            # a split must reduce impurity by at least this much
            min_impurity_decrease=Real(0.0, 0.5),
        ),
    },
}

# Disabled search spaces for the RandomForest ('RF') and k-nearest-neighbours
# ('KNN') models. They are parked inside a bare string literal, so nothing in it
# is evaluated and neither entry is part of PARAMETERS_GRID.
# NOTE(review): the snippet cannot be pasted back into PARAMETERS_GRID as-is --
# it has a stray ',' line right after the 'RF' entry and an extra closing '}'.
'''
'RF': {
        'default': RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state = RANDOM_STATE),
        'grid': {
            'ccp_alpha': Real( 0.0, 1.0),
            'criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'max_depth': Categorical([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, None]),
            'max_features':  Categorical( ["sqrt", "log2", None]), 
            'n_estimators': Categorical([10 * i for i in np.arange(1, 100)]),
            'min_samples_split': Real( 0.01, 0.5),  # Minimum samples required to split a node
            'min_samples_leaf': Real(0.01, 0.5),  # Minimum samples required for a leaf node
            'bootstrap': Categorical([True, False]),  # Bootstrap samples
            'min_weight_fraction_leaf': Real(0.0, 0.5),  # Minimum weighted fraction of sum of total weights
            'max_leaf_nodes': Integer(2, 100),  # Maximum number of leaf nodes            
        }
    },
    ,
    'KNN': {
        'default': KNeighborsClassifier( n_jobs= -1),
        'grid': {
            'n_neighbors':Integer(1, 15),
            'weights': Categorical(['uniform', 'distance']),
            'algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
            'p':Integer(1, 6),
            'leaf_size': Categorical([20, 30, 40]),
            'metric': Categorical(['euclidean', 'manhattan', 'chebyshev', 'minkowski'])
          
        }
    },

   
  
    
    }
    
    
'''
    

# Search strategy recorded in the experiment metadata. The experiment cell runs
# BayesSearchCV, so that is what gets recorded (this used to say GridSearchCV,
# which made the saved metadata misleading).
SEARCH = BayesSearchCV

# Columns selected from every train/test CSV as model inputs.
# The order is significant: it is the column order of X_train / X_test.
FEATURES = [
    'CountDeclMethodPrivate',
    'AvgLineCode',
    'CountLine',
    'MaxCyclomatic',
    'CountDeclMethodDefault',
    'AvgEssential',
    'CountDeclClassVariable',
    'SumCyclomaticStrict',
    'AvgCyclomatic',
    'AvgLine',
    'CountDeclClassMethod',
    'AvgLineComment',
    'AvgCyclomaticModified',
    'CountDeclFunction',
    'CountLineComment',
    'CountDeclClass',
    'CountDeclMethod',
    'SumCyclomaticModified',
    'CountLineCodeDecl',
    'CountDeclMethodProtected',
    'CountDeclInstanceVariable',
    'MaxCyclomaticStrict',
    'CountDeclMethodPublic',
    'CountLineCodeExe',
    'SumCyclomatic',
    'SumEssential',
    'CountStmtDecl',
    'CountLineCode',
    'CountStmtExe',
    'RatioCommentToCode',
    'CountLineBlank',
    'CountStmt',
    'MaxCyclomaticModified',
    'CountSemicolon',
    'AvgLineBlank',
    'CountDeclInstanceMethod',
    'AvgCyclomaticStrict',
    'PercentLackOfCohesion',
    'MaxInheritanceTree',
    'CountClassDerived',
    'CountClassCoupled',
    'CountClassBase',
    'CountInput_Max',
    'CountInput_Mean',
    'CountInput_Min',
    'CountOutput_Max',
    'CountOutput_Mean',
    'CountOutput_Min',
    'CountPath_Max',
    'CountPath_Mean',
    'CountPath_Min',
    'MaxNesting_Max',
    'MaxNesting_Mean',
    'MaxNesting_Min',
    'COMM',
    'ADEV',
    'DDEV',
    'Added_lines',
    'Del_lines',
    'OWN_LINE',
    'OWN_COMMIT',
    'MINOR_COMMIT',
    'MINOR_LINE',
    'MAJOR_COMMIT',
    'MAJOR_LINE',
]

# Name of the label column read from the same CSV files.
TARGET = 'RealBug'

# Objectives the hyperparameter search is run for, one search per entry.
# str(scorer) is used as the key and label for each entry, so the order and the
# entries themselves are kept exactly as they were.
SCORERS = [
    make_scorer(matthews_corrcoef),
    'neg_log_loss',
    make_scorer(geometric_mean_score),
    'accuracy',
    'balanced_accuracy',
    'roc_auc',
    'f1',
    'f1_weighted',
]

In [None]:
def optimze(X, y, model, metrics='MCC', opt_algo='TPE', max_eval = MAX_EVAL, cv = KFOLD): 
    """Tune ``model`` with hyperopt and return the refitted estimator.

    Parameters
    ----------
    X, y : pandas objects
        Training features and labels; they are indexed positionally with
        ``.iloc`` for every cross-validation fold.
    model : dict
        ``model['default']`` is the base estimator (deep-copied, never mutated)
        and ``model['grid']`` is the search space handed to ``hyperopt.fmin`` /
        ``space_eval``.
        NOTE(review): PARAMETERS_GRID in this notebook holds skopt dimensions,
        not a hyperopt space -- confirm before using it with this function.
    metrics : str
        One of 'MCC', 'F1', 'G', 'accuracy', 'balanced_accuracy', 'roc_auc',
        'neg_log_loss'.
    opt_algo : str
        'TPE' or 'RAND'.
    max_eval : int
        Number of hyperopt evaluations.
    cv : splitter
        Object exposing ``split(X, y)``.

    Returns
    -------
    (estimator, dict)
        The estimator refitted on all of ``X, y`` with the best parameters, and
        those parameters.

    Raises
    ------
    ValueError
        If ``metrics`` or ``opt_algo`` is not one of the supported values.
    """
    # Metrics computed from hard label predictions.
    label_metrics = {
        'MCC': matthews_corrcoef,
        'F1': f1_score,
        'G': geometric_mean_score,
        'accuracy': accuracy_score,
        'balanced_accuracy': balanced_accuracy_score,
    }
    # Metrics computed from the positive-class probability. ROC AUC belongs
    # here: scoring it on hard labels collapses the curve to a single point.
    proba_metrics = {
        'roc_auc': roc_auc_score,
        'neg_log_loss': log_loss,
    }
    suggesters = {'RAND': rand.suggest, 'TPE': tpe.suggest}

    # Fail fast: an unknown metric used to produce a NaN loss, and an unknown
    # algorithm used to print a message and then crash with a NameError.
    if metrics not in label_metrics and metrics not in proba_metrics:
        raise ValueError(
            f"Unsupported metrics {metrics!r}; expected one of "
            f"{sorted(label_metrics) + sorted(proba_metrics)}"
        )
    if opt_algo not in suggesters:
        raise ValueError(
            f"Unsupported opt_algo {opt_algo!r}; expected one of {sorted(suggesters)}"
        )

    def run_optimizer(param): 
        """Cross-validate one parameter set and report it as a hyperopt loss."""
        clf = copy.deepcopy(model['default'])
        clf.set_params(**param)
        fold_scores = []
        for train_index, val_index in cv.split(X, y):
            train_X, val_X = X.iloc[train_index], X.iloc[val_index]
            train_y, val_y = y.iloc[train_index], y.iloc[val_index]

            clf.fit(train_X, train_y)

            if metrics in proba_metrics:
                # predict_proba is only requested when the metric needs it, so
                # estimators without it still work for label-based metrics.
                prob_y = clf.predict_proba(val_X)[:, 1]
                fold_scores.append(proba_metrics[metrics](val_y, prob_y))
            else:
                pred_y = clf.predict(val_X)
                fold_scores.append(label_metrics[metrics](val_y, pred_y))

        mean_score = np.mean(fold_scores)
        # hyperopt minimises: log loss is already "lower is better", every
        # other metric is turned into a loss with 1 - score.
        loss = mean_score if metrics == 'neg_log_loss' else 1 - mean_score
        return {
            'loss': loss,
            'status': STATUS_OK
        }

    param_space = model['grid']
    best = fmin(run_optimizer, param_space, algo=suggesters[opt_algo], max_evals=max_eval, show_progressbar=False)

    # fmin returns indices for categorical choices; space_eval maps them back
    # to the actual parameter values.
    params = space_eval(param_space, best)
    model_tune = copy.deepcopy(model["default"]).set_params(**params)

    model_tune.fit(X, y)

    return model_tune, params

def compute_all_models_diversities(all_models_predictions, y_true, models, scorers = SCORERS):
    """Compute pairwise diversity across every (model, scorer) combination.

    ``all_models_predictions[model][str(scorer)]`` is flattened into one mapping
    keyed ``"<model>-<scorer>"``, which is then handed to
    ``compute_model_diversities`` with those keys as the scorer list.
    """
    flattened = {
        model_name + '-' + str(scorer): all_models_predictions[model_name][str(scorer)]
        for model_name in models
        for scorer in scorers
    }
    combined_labels = list(flattened)
    return compute_model_diversities(flattened, y_true, scorers=combined_labels)


def compute_model_diversities(predictions, y_true, scorers = SCORERS): 
    """Build the full pairwise diversity table for a set of prediction vectors.

    ``predictions`` maps ``str(scorer)`` to a prediction vector. Every ordered
    pair of scorers (including a scorer with itself) yields one row with the
    columns ``scorer_1``, ``scorer_2`` and ``diversity``.
    """
    labels = [str(scorers[position]) for position in range(len(scorers))]
    rows = [
        {
            'scorer_1': first,
            'scorer_2': second,
            'diversity': compute_diversity(predictions[first], predictions[second], y_true),
        }
        for first in labels
        for second in labels
    ]
    return pd.DataFrame(rows)

def compute_diversity(classifier_1_predictions, classifier_2_predictions, y):
    """Return the disagreement between two classifiers on the same instances.

    The measure is ``(N_10 + N_01) / N``: the share of instances that exactly
    one of the two classifiers labels correctly.

    Parameters
    ----------
    classifier_1_predictions, classifier_2_predictions : 1-D array-like
        Predicted labels, positionally aligned with ``y``.
    y : 1-D array-like
        True labels.

    Returns
    -------
    float
        A value in [0, 1], or NaN when there are no instances (this used to
        raise ZeroDivisionError).
    """
    # Plain arrays make the comparison positional, whatever index y carries.
    y_true = np.asarray(y)
    correct_1 = np.equal(np.asarray(classifier_1_predictions), y_true)
    correct_2 = np.equal(np.asarray(classifier_2_predictions), y_true)

    n_instances = correct_1.size
    if n_instances == 0:
        return float('nan')

    # Exactly one classifier is right where the correctness masks differ;
    # this is N_10 + N_01 without building index sets.
    n_disagreements = int(np.count_nonzero(correct_1 != correct_2))
    return n_disagreements / n_instances


def evaluate_model_predictions(y_true, y_pred, y_prob): 
    """Summarise binary-classification performance as a flat dict.

    Parameters
    ----------
    y_true : array-like
        True labels; the confusion matrix is built for labels 0 and 1.
    y_pred : array-like
        Predicted labels.
    y_prob : array-like
        Score of the positive class, used only for the AUC.

    Returns
    -------
    dict
        Keys: AUC, MCC, G, f1, tpr, tnr, precision, fpr, fnr, tp, tn, fp, fn.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Recall of each class is computed once; fpr and fnr are its complements
    # (each recall used to be recomputed for the complementary rate).
    tpr = recall_score(y_true, y_pred, pos_label=1)
    tnr = recall_score(y_true, y_pred, pos_label=0)

    res = {
        'AUC': roc_auc_score(y_true, y_prob), 
        'MCC': matthews_corrcoef(y_true, y_pred),
        'G': geometric_mean_score(y_true, y_pred), 
        'f1': f1_score(y_true, y_pred),
        'tpr': tpr,
        'tnr': tnr,
        'precision': precision_score(y_true, y_pred), 
        'fpr': 1 - tnr,
        'fnr': 1 - tpr,
        'tp': tp, 
        'tn': tn, 
        'fp': fp, 
        'fn': fn
    }
    return res

def random_string(length):
    """Return ``length`` characters drawn at random from ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)


In [None]:
def _save_diversity_heatmap(diversity_scores, png_path, figsize):
    """Draw the pairwise diversity table as an annotated heatmap and save it.

    The figure is closed after saving: the experiment creates four figures per
    model and run (some of them 40x40 inches), and leaving them open keeps all
    of them in memory until the kernel is restarted.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(diversity_scores.pivot(index="scorer_1", columns="scorer_2", values="diversity"), ax=ax, annot=True)
    fig.tight_layout()
    fig.savefig(png_path)
    plt.close(fig)


results = []

# Every experiment writes into its own randomly named folder.
experiment_hash = random_string(32)
print('starting experiment:', experiment_hash)
experiment_path = os.path.join(RESULTS_PATH, 'experiment_' + experiment_hash)
experiment_models_path = os.path.join(experiment_path, 'MODELS')
figures_folder = os.path.join(experiment_path, 'Diversity_figures')
os.makedirs(experiment_path, exist_ok=True)
os.makedirs(experiment_models_path, exist_ok=True)
os.makedirs(figures_folder, exist_ok=True)

experiment_metadata = {
    'PARAMETERS_GRID': PARAMETERS_GRID, 
    'DATASET': DATASET, 
    'VALIDATION_PROCESS': VALIDATION_PROCESS, 
    'CV': CV, 
    'SEARCH': SEARCH, 
    'SCORES': SCORERS,
    'IS_DONE': False, 
}
metadata_path = os.path.join(experiment_path, 'METADATA.joblib')
results_path = os.path.join(experiment_path, 'results.csv')

all_models_train_predictions = {}
all_models_test_predictions = {}

dataset_dir = os.path.join(DATA_PATH, VALIDATION_PROCESS, DATASET)
# Sorted so that the processing order does not depend on the file system.
for file in sorted(os.listdir(dataset_dir)): 
    # Only the training CSVs drive the loop; the matching test file is derived
    # from the training file name.
    if ".csv" not in file or "train" not in file:
        continue

    print('Working on:', file)
    project_name = file.replace('.csv', '')

    file_models_train_predictions = {}
    file_models_test_predictions = {}
    # NOTE(review): this list is not reset per run, so each run's
    # "run-<n>_..._performance.csv" also contains the rows of earlier runs --
    # confirm that this is intended.
    project_performance = []

    train_data = pd.read_csv(os.path.join(dataset_dir, file))
    test_data = pd.read_csv(os.path.join(dataset_dir, file.replace("train", 'test')))

    X_train, y_train = train_data[FEATURES], train_data[TARGET] 
    X_test, y_test = test_data[FEATURES], test_data[TARGET] 

    # NOTE(review): the search, the CV splitter and the estimator all use the
    # same fixed RANDOM_STATE, so the NRUNS repetitions appear to repeat the
    # same search -- confirm whether the seed should vary with `run`.
    for run in range(NRUNS):
        print("Run:", run)

        # Created up front so the "all models" outputs below have a folder
        # even before any per-model folder has been created.
        project_dir = os.path.join(figures_folder, f'run_{run}', project_name)
        os.makedirs(project_dir, exist_ok=True)

        for model_name, model_data in PARAMETERS_GRID.items():

            model_train_predictions = {}
            model_test_predictions = {}
            model_performance = []
            print('******** Model:', model_name)

            model_dir = os.path.join(project_dir, f'{model_name}')
            os.makedirs(model_dir, exist_ok=True)

            for scorer in SCORERS:
                scorer_name = str(scorer)
                print('*****************', scorer_name)

                search = BayesSearchCV(estimator= model_data['default'],
                                        search_spaces=model_data['grid'], cv=KFOLD, scoring=scorer, refit=True, n_jobs= -1, 
                                        verbose=0, random_state=RANDOM_STATE, n_iter = MAX_EVAL)
                search.fit(X_train, y_train)
                joblib.dump(search, os.path.join(experiment_models_path, f'{file}_-_{model_name}_-_run-{str(run)}_-_{scorer_name}.joblib'))
                print("best parameters:", search.best_params_)

                y_train_pred = search.predict(X_train) 
                y_test_pred = search.predict(X_test)
                model_train_predictions[scorer_name] = y_train_pred
                model_test_predictions[scorer_name] = y_test_pred

                y_test_prob = search.predict_proba(X_test)[:, 1]

                evaluation = evaluate_model_predictions(y_test, y_test_pred, y_test_prob)
                print(evaluation)

                new_row = {
                    'file': file,
                    'run': run, 
                    'validation_process': VALIDATION_PROCESS, 
                    'model': model_name,
                    # Taken from the objects that were actually used, instead
                    # of the stale literals 5 and 'Grid'.
                    'CV': KFOLD.get_n_splits(), 
                    'search': type(search).__name__,
                    'scorer': scorer_name
                }
                new_row.update(evaluation)
                new_row['best_params'] = str(search.best_params_)
                model_performance.append(new_row)
                project_performance.append(new_row)
                results.append(new_row)

            # Diversity between the models tuned for the different scorers.
            train_div_scores = compute_model_diversities(model_train_predictions, y_train, scorers = SCORERS)
            test_div_scores = compute_model_diversities(model_test_predictions, y_test, scorers = SCORERS)
            _save_diversity_heatmap(train_div_scores, os.path.join(model_dir, 'div_train.png'), (10, 10))
            _save_diversity_heatmap(test_div_scores, os.path.join(model_dir, 'div_test.png'), (10, 10))
            test_div_scores.to_csv(os.path.join(model_dir, 'div_test.csv'), index=False)
            train_div_scores.to_csv(os.path.join(model_dir, 'div_train.csv'), index=False)
            pd.DataFrame(model_performance).to_csv(os.path.join(model_dir, f'run-{run}_{project_name}_{model_name}_performance.csv'), index=False)

            file_models_train_predictions[model_name] = model_train_predictions
            file_models_test_predictions[model_name] = model_test_predictions

        all_models_train_predictions[project_name] = file_models_train_predictions
        all_models_test_predictions[project_name] = file_models_test_predictions

        # Diversity across every (model, scorer) combination of this run.
        all_models_train_diversities = compute_all_models_diversities(file_models_train_predictions, models= PARAMETERS_GRID.keys(), y_true=y_train)
        all_models_test_diversities = compute_all_models_diversities(file_models_test_predictions, models= PARAMETERS_GRID.keys(),  y_true=y_test)

        _save_diversity_heatmap(all_models_train_diversities, os.path.join(project_dir, 'div_train_all.png'), (40, 40))
        _save_diversity_heatmap(all_models_test_diversities, os.path.join(project_dir, 'div_test_all.png'), (40, 40))
        all_models_train_diversities.to_csv(os.path.join(project_dir, 'div_train_all.csv'), index=False)
        all_models_test_diversities.to_csv(os.path.join(project_dir, 'div_test_all.csv'), index=False)
        pd.DataFrame(project_performance).to_csv(os.path.join(project_dir, f'run-{run}_{project_name}_performance.csv'), index=False)

    # Checkpoint after every file so a crash does not lose finished work.
    # IS_DONE stays False here; it used to flip to True after the first file.
    joblib.dump(experiment_metadata, metadata_path)
    pd.DataFrame(results).to_csv(results_path, index=False)

# Only now is the experiment complete.
experiment_metadata['IS_DONE'] = True 
joblib.dump(experiment_metadata, metadata_path)
final_results = pd.DataFrame(results)
final_results.to_csv(results_path, index=False)
