In [None]:
import os
import pandas as pd
import copy 
import sys
sys.path.append(".")

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression



#metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import geometric_mean_score
#from hypopt import GridSearch
from imblearn.over_sampling import SMOTE
from fasttrees.fasttrees import FastFrugalTreeClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, log_loss

#from sklearn.model_selection import GridSearch

In [None]:
# Globals

# Input data: the experiment's files live in <DATA_PATH>/<EXPERIMENT_NAME>.
DATA_PATH = '../data/CRDP'
EXPERIMENT_NAME = 'jira'
EXPERIMENT_DATA_PATH = os.path.join(DATA_PATH, EXPERIMENT_NAME)

# Output locations.
RESULTS_PATH = './results_IAC'
FINAL_RESULTS_PATH = '../results_camel'

# Seed constant for stochastic steps.
RANDOM_STATE = 42

# Name of the label column.
TARGET = 'RealBug'

# Names of the columns used as model inputs.
FEATURES = [
    'CountDeclMethodPrivate', 'AvgLineCode', 'CountLine',
    'MaxCyclomatic', 'CountDeclMethodDefault', 'AvgEssential',
    'CountDeclClassVariable', 'SumCyclomaticStrict', 'AvgCyclomatic',
    'AvgLine', 'CountDeclClassMethod', 'AvgLineComment',
    'AvgCyclomaticModified', 'CountDeclFunction', 'CountLineComment',
    'CountDeclClass', 'CountDeclMethod', 'SumCyclomaticModified',
    'CountLineCodeDecl', 'CountDeclMethodProtected',
    'CountDeclInstanceVariable', 'MaxCyclomaticStrict',
    'CountDeclMethodPublic', 'CountLineCodeExe', 'SumCyclomatic',
    'SumEssential', 'CountStmtDecl', 'CountLineCode', 'CountStmtExe',
    'RatioCommentToCode', 'CountLineBlank', 'CountStmt',
    'MaxCyclomaticModified', 'CountSemicolon', 'AvgLineBlank',
    'CountDeclInstanceMethod', 'AvgCyclomaticStrict',
    'PercentLackOfCohesion', 'MaxInheritanceTree', 'CountClassDerived',
    'CountClassCoupled', 'CountClassBase', 'CountInput_Max',
    'CountInput_Mean', 'CountInput_Min', 'CountOutput_Max',
    'CountOutput_Mean', 'CountOutput_Min', 'CountPath_Max',
    'CountPath_Mean', 'CountPath_Min', 'MaxNesting_Max', 'MaxNesting_Mean',
    'MaxNesting_Min', 'COMM', 'ADEV', 'DDEV', 'Added_lines', 'Del_lines',
    'OWN_LINE', 'OWN_COMMIT', 'MINOR_COMMIT', 'MINOR_LINE', 'MAJOR_COMMIT',
    'MAJOR_LINE',
]

# Make sure the results directory exists (no error if it is already there).
os.makedirs(RESULTS_PATH, exist_ok=True)

In [None]:
# Show the entries of the experiment data directory as this cell's output.
os.listdir(EXPERIMENT_DATA_PATH)

In [None]:
#helpers 
def neg_log_loss(y_true, y_pred):
    """Return the negated value of `log_loss(y_true, y_pred)`.

    Both arguments are forwarded to `log_loss` by keyword, unchanged.
    """
    loss = log_loss(y_true=y_true, y_pred=y_pred)
    return -loss


def evaluate_model_predictions(y_true, y_pred, y_prob=None):
    """Compute a dictionary of binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the confusion matrix is built for labels 0 and 1.
    y_pred : array-like
        Predicted labels.
    y_prob : optional
        Not used by this function; accepted (and now optional) only so that
        existing three-argument calls keep working.

    Returns
    -------
    dict
        Keys, in order: 'MCC', 'G', 'f1', 'tpr', 'tnr', 'precision', 'fpr',
        'fnr', 'tp', 'tn', 'fp', 'fn'.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Recall of each class is computed once; fpr/fnr are their complements.
    tpr = recall_score(y_true, y_pred, pos_label=1)
    tnr = recall_score(y_true, y_pred, pos_label=0)

    res = {
        'MCC': matthews_corrcoef(y_true, y_pred),
        'G': geometric_mean_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'tpr': tpr,
        'tnr': tnr,
        'precision': precision_score(y_true, y_pred),
        'fpr': 1 - tnr,
        'fnr': 1 - tpr,
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
    }
    return res

def _weighted_f1(y_true, y_pred):
    """F1 score computed with `average='weighted'`."""
    return f1_score(y_true, y_pred, average='weighted')


# Candidate scoring functions, keyed by the short name used in the results.
# Each value is called as scorer(y_true, y_pred).
SCORERS = {
    'MCC': matthews_corrcoef,
    'G': geometric_mean_score,
    'ACC': accuracy_score,
    'BAL_ACC': balanced_accuracy_score,
    'F1': f1_score,
    'F1_W': _weighted_f1,
}

In [None]:
import numpy as np
def dist2heaven(y_true, y_pred):
    """Negated, normalised distance to the ideal point (tpr=1, fpr=0).

    The value is ``-sqrt(((1 - tpr)**2 + fpr**2) / 2)``, so it is 0 for a
    perfect classifier and -1 for the worst one; larger is better.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; the confusion matrix is built for labels 0 and 1.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        The negated distance described above.

    Notes
    -----
    When `y_true` contains no positives (or no negatives) the corresponding
    rate is undefined. It is taken as 0.0 instead of dividing by zero, which
    previously produced NaN.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    n_pos = tp + fn
    n_neg = fp + tn

    # Guard the denominators: a class absent from y_true must not yield NaN.
    tpr = tp / n_pos if n_pos else 0.0
    fpr = fp / n_neg if n_neg else 0.0

    return -np.sqrt(((1 - tpr) ** 2 + fpr ** 2) * 0.5)

def MCC_TIMES_G(y_true, y_pred):
    """Return the product of the Matthews correlation coefficient and the G-mean.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        ``matthews_corrcoef(y_true, y_pred) * geometric_mean_score(y_true, y_pred)``.
    """
    mcc = matthews_corrcoef(y_pred=y_pred, y_true=y_true)
    g = geometric_mean_score(y_pred=y_pred, y_true=y_true)
    # The original returned `mcc` alone and left `g` unused.
    return mcc * g

In [None]:
# Fit a FastFrugalTreeClassifier on every training split once per candidate
# scorer, evaluate it on the matching test split, and collect one result
# record per (file, scorer, run) in `results_SMOTE`.
results_SMOTE = []

# Resolve the training CSVs once, in sorted order: os.listdir returns entries
# in arbitrary order, and the filter does not depend on any loop variable.
train_files = sorted(
    name for name in os.listdir(EXPERIMENT_DATA_PATH)
    if 'train' in name and '.csv' in name
)

for apply_smote in [False]:
    model_name = 'FFT_SMOTE' if apply_smote else 'FFT'

    for scorer_name, scorer in SCORERS.items():
        print("scorer:", scorer_name)

        for filename in train_files:
            # The test split has the same file name with 'train' -> 'test'.
            train_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, filename))
            test_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, filename.replace('train', 'test')))

            for run in range(1):
                if apply_smote:
                    print('Applying SMOTE...')
                print('Working on file:', filename)

                X_train, y_train = train_data[FEATURES], train_data[TARGET]
                X_test, y_test = test_data[FEATURES], test_data[TARGET]

                if apply_smote:
                    # Seeded so the oversampling is reproducible; offset by
                    # `run` so repeated runs are not identical copies.
                    sm = SMOTE(random_state=RANDOM_STATE + run)
                    X_train, y_train = sm.fit_resample(X_train, y_train)

                fc = FastFrugalTreeClassifier(scorer=scorer)
                fc.fit(X_train, y_train)
                print(fc.get_tree(decision_view = True))

                y_pred = fc.predict(X_test)
                y_pred_train = fc.predict(X_train)

                # Compute each test metric once; reuse it for printing and storage.
                mcc_test = matthews_corrcoef(y_test, y_pred)
                f1_test = f1_score(y_test, y_pred)
                g_test = geometric_mean_score(y_test, y_pred)

                print('MCC:', mcc_test)
                print('F1:', f1_test)
                print('G:', g_test)

                print('MCC train:', matthews_corrcoef(y_train, y_pred_train))
                print('F1 train:', f1_score(y_train, y_pred_train))
                print('G train:', geometric_mean_score(y_train, y_pred_train))

                results_SMOTE.append({
                    'file_id': filename,
                    'model_id': model_name,
                    'run': run,
                    'scorer': scorer_name,
                    'MCC': mcc_test,
                    'G': g_test,
                    'F1': f1_test,
                    'AUC': '-'
                })


In [None]:
# Turn the list of per-run result records into a single table.
abc = pd.DataFrame(results_SMOTE)

In [None]:
# Write the results table to CSV without the index column.
# NOTE(review): RESULTS_PATH is created in the globals cell, but this file is
# written to the current working directory — confirm the intended location.
abc.to_csv('FFT_model_selection.csv', index=False)