In [None]:
import os
import pandas as pd
import copy 
import sys
sys.path.append(".")
from autospearman import * 

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression



#metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import geometric_mean_score
#from hypopt import GridSearch
from imblearn.over_sampling import SMOTE
from fasttrees.fasttrees import FastFrugalTreeClassifier
from sklearn.feature_selection import VarianceThreshold

#from sklearn.model_selection import GridSearch

In [None]:
#Globals
# NOTE(review): DATA_PATH is an absolute, machine-specific location; anyone
# else running this notebook has to point it at their own copy of the data.
DATA_PATH = 'C:/Users/Motaz/Desktop/work/IAC_defect_prediction'
EXPERIMENT_NAME = 'new_data'
EXPERIMENT_DATA_PATH = os.path.join(DATA_PATH, EXPERIMENT_NAME)
RESULTS_PATH = './results_IAC'
os.makedirs(RESULTS_PATH, exist_ok=True)

# Column names used as model inputs. The opening bracket must sit on the same
# logical line as the assignment: a bare `FEATURES =` followed by a blank line
# is a SyntaxError.
# NOTE(review): the entry '_delta' has no base metric name in front of the
# suffix and looks like a truncated column name — confirm against the CSV header.
FEATURES = [
    'Exp', 'LHereDocs', 'DeprFunc','ExplResrDep', 'isTerraform',
    'numImplicitDependentEach', 'CompOpers', 'Nloc', 'Strs', 'ElemTups', 'TmpExpr', 'DynBlc', 'Lps', 'LogiOpers',
    'isResource', 'numTokens', 'Loc', 'SuffStr', 'isModule', 'ImplDepResr', 'isData', 'NetBlc', 'ImplDepProviders',
    'FnCall', 'LtrExpr', 'sumMccabeCC', 'Vars', 'MetaArgs', 'Attrs', 'LookUpFunc', 'ElemObjs', 'Tups', 'TextEntropy',
    'block_identifiers', 'ImplDepLocals', 'Refs', 'EmptStr', 'Dept', 'DebugFunc', 'HereDocs', 'sumLengthStringValues',
    'Objs', 'ImplDepData', 'Params', 'IndexAccess', 'MOpers', 'ImplDepVars', 'ImplDepModules', 'isProvider',
    'SplatExpr', 'StarStr', 'Conds', 'isLocals', 'isOutput', 'isVariable','Ndevs', 'NChanges', 'Owner',
    'Rexp', 'Sexp', 'Bexp', 'Age', 'RecentAge', 'num_defects_before', 'SimilarChange', 'KindExperience',
    'Nuc', 'EditDistance', 'La', 'Ld', 'Churn', 'BbChange', 'NLa', 'NLd', 'NChurn', 'DLa', 'Dld',
    'isAddDefault', 'isDelDefault', 'isAddType', 'isDelType', 'isAddValue', 'isDelValue', 'isAddVersion',
    'isDelVersion', 'additions_contains_description_change', 'deletions_contains_description_change',
    'additions_contains_meta_args_change', 'deletions_contains_meta_args_change', 'CompOpers_delta',
    'Conds_delta', 'LogiOpers_delta', 'DynBlc_delta', 'NetBlc_delta', 'FnCall_delta', 'Params_delta', 'HereDocs_delta',
    'LHereDocs_delta','IndexAccess_delta', 'LtrExpr_delta', 'Strs_delta', 'sumLengthStringValues_delta', 'Lps_delta',
    'MOpers_delta', 'sumMccabeCC_delta', 'MetaArgs_delta', 'Objs_delta', 'ElemObjs_delta', 'Refs_delta',
    "Vars_delta", '_delta' ,'SplatExpr_delta' ,'textEntropyMeasure_delta', 'TmpExpr_delta', 'Tups_delta',
    'ElemTups_delta', 'Dept_delta', 'Loc_delta', 'Nloc_delta', 'Attrs_delta', 'numResourceDependency_delta',
    'ImplDepResr_delta', 'ImplDepData_delta', 'ImplDepModules_delta', 'ImplDepProviders_delta',
    'ImplDepLocals_delta', 'ImplDepVars_delta', 'numImplicitDependentEach_delta', 'EmptStr_delta',
    'SuffStr_delta','StarStr_delta','numDebuggingFunctions_delta','DeprFunc_delta','ExplResrDep_delta'
]
# Name of the label column in the train/test CSVs.
TARGET = 'RealBug'
# The three settings below are not referenced by any cell visible in this notebook.
APPLY_SMOTE = True
TUNE=True
FINAL_RESULTS_PATH = '../results'

In [None]:
# Show the file names present in the experiment data directory.
os.listdir(EXPERIMENT_DATA_PATH)

In [None]:
# Inspect one training CSV: print every column name followed by the distinct
# values found in that column. The file is loaded once and the resulting frame
# is reused for the column list instead of parsing the same CSV a second time.
a = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, 'aws-observability__terraform-aws-observability-accelerator_long_val_train.csv'))
for col in a.columns:
    print(col)
    print(a[col].unique())

In [None]:
#helpers 
def remove_constant_features(df, features):
    """Return ``df[features]`` restricted to the columns a zero-variance filter keeps.

    Parameters
    ----------
    df : DataFrame holding at least the columns named in ``features``.
    features : list of column names to consider.

    Returns
    -------
    DataFrame with the same rows as ``df`` and only the columns retained by
    ``VarianceThreshold(threshold=0.0)`` fitted on ``df[features]``.
    """
    # Only the requested feature columns take part in the variance check.
    candidate_columns = df[features]

    # Fit the filter, then ask it for the positions of the columns it keeps.
    variance_filter = VarianceThreshold(threshold=0.0)
    variance_filter.fit(candidate_columns)
    kept_positions = variance_filter.get_support(indices=True)

    return candidate_columns.iloc[:, kept_positions]

def evaluate_model_predictions(y_true, y_pred, y_prob):
    """Compute a dictionary of binary-classification metrics for 0/1 labels.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    y_prob : accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict with keys 'MCC', 'G', 'f1', 'tpr', 'tnr', 'precision', 'fpr', 'fnr'
    and the confusion-matrix counts 'tp', 'tn', 'fp', 'fn'.
    """
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking is valid.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Each per-class recall is computed once and reused for its complement
    # (fnr = 1 - tpr, fpr = 1 - tnr) instead of calling recall_score four times.
    tpr = recall_score(y_true, y_pred, pos_label=1)
    tnr = recall_score(y_true, y_pred, pos_label=0)

    res = {
        'MCC': matthews_corrcoef(y_true, y_pred),
        'G': geometric_mean_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'tpr': tpr,
        'tnr': tnr,
        'precision': precision_score(y_true, y_pred),
        'fpr': 1 - tnr,
        'fnr': 1 - tpr,
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn
    }
    return res

In [None]:
import numpy as np
def dist2heaven(y_true, y_pred):
    """Negated, normalised distance from (tpr, fpr) to the ideal point (1, 0).

    The value is ``-sqrt(((1 - tpr)**2 + fpr**2) / 2)``, so it lies in
    ``[-1, 0]`` and 0 is the best possible score. It is negated so that a
    larger value means a better classifier (it is passed as ``scorer`` to
    FastFrugalTreeClassifier elsewhere in this notebook).

    Parameters
    ----------
    y_true : true 0/1 labels.
    y_pred : predicted 0/1 labels.

    Returns
    -------
    float score in [-1, 0].
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    positives = tp + fn
    negatives = fp + tn
    # Guard the ratios: when a class is absent from y_true the denominator is 0
    # and the plain division would yield NaN, which cannot be ranked.
    # With no positives nothing was detected (tpr = 0); with no negatives no
    # false alarm is possible (fpr = 0).
    tpr = tp / positives if positives else 0.0
    fpr = fp / negatives if negatives else 0.0
    return -np.sqrt(((1 - tpr)**2 + fpr**2)*0.5)

def MCC_TIMES_G(y_true, y_pred):
    """Return the Matthews correlation coefficient of the predictions.

    NOTE(review): despite the name, the G-mean has never been part of the
    returned value — the original body computed it and then discarded it.
    The unused computation is removed here; the return value is unchanged.
    If the product MCC * G is actually wanted, it has to be reinstated
    deliberately.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.

    Returns
    -------
    float, the value of ``matthews_corrcoef``.
    """
    mcc = matthews_corrcoef(y_pred=y_pred, y_true=y_true)
    return (mcc)

In [None]:
# For every training CSV in the experiment directory: export the float-cast
# feature/target tables, fit a FastFrugalTreeClassifier scored by dist2heaven,
# print the tree and its metrics, and append one result record per run.
path_new_data = '../data/Dataset'
results_SMOTE = []
for apply_smote in [False]:
    for filename in os.listdir(EXPERIMENT_DATA_PATH):
        # Only training CSVs drive the loop; the matching test file name is
        # derived from the training file name. Filtering happens before any
        # message is printed so skipped files produce no output.
        if 'train' not in filename or '.csv' not in filename:
            continue
        for run in range(1):
            if apply_smote:
                print('Applying SMOTE...')
            print('Working on file:', filename)
            test_filename = filename.replace('train', 'test')
            project_dir_name = os.path.join(path_new_data, filename.replace('.csv', ''))
            os.makedirs(project_dir_name, exist_ok=True)

            train_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, filename))
            test_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, test_filename))

            # Keep only rows with isTerraform == 0. .copy() makes the filtered
            # frames independent so the FFT_prediction column added below is
            # not written onto a slice of the frame read from disk.
            train_data = train_data[train_data['isTerraform'] == 0].copy()
            test_data = test_data[test_data['isTerraform'] == 0].copy()

            # Export the float-cast feature + target tables next to the data.
            train_data_GP = train_data[FEATURES + [TARGET]].astype('float64')
            test_data_GP = test_data[FEATURES + [TARGET]].astype('float64')
            train_data_GP.to_csv(os.path.join(project_dir_name, filename), index=False)
            test_data_GP.to_csv(os.path.join(project_dir_name, test_filename), index=False)

            X_train, y_train = train_data[FEATURES], train_data[TARGET]
            if apply_smote:
                # Seeded with the run index so a re-run reproduces the same
                # synthetic samples while different runs still differ.
                sm = SMOTE(random_state=run)
                X_train, y_train = sm.fit_resample(X_train, y_train)

            # Feature selection (remove_constant_features / AutoSpearman) is
            # currently disabled, so every column of X_train is used.
            final_cols = X_train.columns
            print('final features for', filename)
            for col in final_cols:
                print(f'cols.put({col}, 1);')
            X_test, y_test = test_data[final_cols], test_data[TARGET]

            fc = FastFrugalTreeClassifier(scorer=dist2heaven)
            fc.fit(X_train, y_train)
            print(fc.get_tree(decision_view = False))
            y_pred = fc.predict(X_test)
            y_pred_train = fc.predict(X_train)

            # After SMOTE, X_train contains synthetic rows, so y_pred_train no
            # longer has one entry per row of train_data; predict on the
            # original rows for the column stored on the frame.
            if apply_smote:
                train_data['FFT_prediction'] = fc.predict(train_data[final_cols]).astype(float)
            else:
                train_data['FFT_prediction'] = y_pred_train.astype(float)
            test_data['FFT_prediction'] = y_pred.astype(float)
            print(fc.all_trees)

            # Each test metric is computed once and reused for both the
            # printout and the result record.
            mcc_test = matthews_corrcoef(y_test, y_pred)
            f1_test = f1_score(y_test, y_pred)
            g_test = geometric_mean_score(y_test, y_pred)
            print('MCC:', mcc_test)
            print('F1:', f1_test)
            print('G:', g_test)

            print('MCC train:',matthews_corrcoef(y_train, y_pred_train))
            print('F1 train:',f1_score(y_train, y_pred_train))
            print('G train:',geometric_mean_score(y_train, y_pred_train))

            model_name = 'FFT'
            if apply_smote:
                model_name += '_SMOTE'
            results_SMOTE.append({
                'file_id': filename,
                'model_id': model_name,
                'run_id': run,
                'MCC': mcc_test,
                'G': g_test,
                'F1': f1_test,
            })


In [None]:
# Display the training labels left over from the last iteration of the experiment loop.
y_train

In [None]:
# Turn the list of per-file result records into a DataFrame (one row per record).
abc = pd.DataFrame(results_SMOTE)

In [None]:
# Write the results table to CSV in the current working directory, without the index column.
abc.to_csv("IAC_FFT_MCC_without_noise.csv", index=False)

In [None]:
# Split the results by model variant and export each subset.
# The SMOTE rows are relabelled to 'FFT' via .assign, which returns a new
# frame; assigning a column on the filtered slice directly triggers pandas'
# SettingWithCopyWarning.
fft_smote = abc[abc['model_id'] == 'FFT_SMOTE'].assign(model_id='FFT')
fft_ns = abc[abc['model_id'] == 'FFT']
fft_smote.to_csv('FFT_SMOTE.csv', index=False)
fft_ns.to_csv('FFT.csv', index=False)

In [None]:
# Display the results table.
abc

In [None]:
# Write the complete results table to 'FFT.csv'.
# NOTE(review): this is the same file name the model-split cell writes with
# fft_ns.to_csv('FFT.csv'), so running both overwrites that export — confirm intended.
abc.to_csv('FFT.csv',index=False)

In [None]:
# Distinct input file names that contributed at least one result row.
abc['file_id'].unique()

In [None]:
# Summary statistics of the test-set MCC over all result rows.
abc['MCC'].describe()

In [None]:
# Columns scanned one at a time by the single-feature threshold searches in the
# cells that follow.
# NOTE(review): these names (e.g. 'numTuples', 'rexp', 'exp') follow a different
# naming scheme than FEATURES ('Tups', 'Rexp', 'Exp'); verify that they exist in
# the CSV loaded via `studied_file`, otherwise the column lookup raises KeyError.
SELECTED_FEATURES = ['numTuples', 'numElemTuples_delta', 'rexp', 'numFunctionCall_delta',
       'numEmptyString_delta', 'numExplicitResourceDependency_delta',
       'numLinesHereDocs_delta', 'isModule', 'exp', 'numObjects_delta',
       'additions_diffusion', 'num_defects_before',
       'numImplicitDependentModules_delta', 'numMathOperations_delta',
       'numDeprecatedFunctions_delta', 'numMetaArg_delta',
       'numConditions_delta', 'numImplicitDependentResources_delta',
       'numImplicitDependentProviders', 'deletions_contains_versioning_change',
       'num_unique_change', 'numImplicitDependentLocals_delta',
       'numIndexAccess_delta', 'additions_contains_value_output_change',
       'numComparisonOperators_delta', 'additions_contains_versioning_change',
       'deletions_contains_value_output_change', 'numLogiOpers', 'isProvider',
       'numExplicitResourceDependency', 'isResource',
       'numSplatExpressions_delta', 'isLocals', 'code_ownership',
       'numDynamicBlocks_delta', 'num_same_instances_changed_before',
       'nloc_delta', 'kexp', 'numLoops',
       'additions_contains_default_change', 'additions_contains_type_change',
       'deletions_contains_default_change', 'deletions_contains_type_change',
       'numImplicitDependentVars_delta', 'numImplicitDependentProviders_delta',
       'deletions_lines_normalized', 'numImplicitDependentData_delta',
       'containDescriptionField', 'textEntropyMeasure', 'additions_normalized',
       'numNestedBlocks_delta']

In [None]:
# Single-feature threshold search scored by F1.
# For each selected feature, every percentile of the training column is tried
# as a cut-off in both directions (feature > t, feature < t); the rule with the
# highest training F1 is printed as a `this.rules.add(...)` statement.
# NOTE(review): the two cells after this one repeat the same search with
# matthews_corrcoef and geometric_mean_score; a shared helper taking the metric
# as a parameter would remove the duplication.
studied_file = 'CDCgov__prime-simplereport_train.csv'
train_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, studied_file))
test_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, studied_file.replace('train', 'test')))

for feature in SELECTED_FEATURES:
    best_threshold = feature  # printed unchanged if no rule gets selected
    best_val = -2             # sentinel below any attainable F1
    val_val = -2              # test F1 of the selected rule
    best_rule = None          # (rule class name, threshold) of the best rule so far

    # Percentiles 0..100 of the training column. Duplicates are dropped: a
    # repeated threshold scores exactly like its first occurrence and can never
    # win the strict `>` comparison below, so the selected rule is unchanged.
    thresholds = train_data[feature].quantile([i/100 for i in range(0, 101)]).unique()
    for threshold in thresholds:
        # "Greater" is evaluated before "Lesser" so ties resolve as before.
        candidates = (
            ('Greater_than_threshold', train_data[feature] > threshold),
            ('Lesser_than_threshold', train_data[feature] < threshold),
        )
        for rule_name, fires in candidates:
            f1_train = f1_score(y_pred=fires.astype(int), y_true=train_data[TARGET])
            if f1_train > best_val:
                best_val = f1_train
                best_rule = (rule_name, threshold)

    if best_rule is not None:
        rule_name, threshold = best_rule
        best_threshold = f'this.rules.add(new {rule_name}("{feature}", -1.0, -1.0, {float(threshold)}, 0));'
        # The test split is scored once, for the selected rule only.
        if rule_name == 'Greater_than_threshold':
            y_val = (test_data[feature] > threshold).astype(int)
        else:
            y_val = (test_data[feature] < threshold).astype(int)
        val_val = f1_score(y_pred=y_val, y_true=test_data[TARGET])

    print(best_threshold)
    # best_val / val_val hold the train / test F1 of the printed rule.



In [None]:
# Single-feature threshold search scored by the Matthews correlation coefficient.
# For each selected feature, every percentile of the training column is tried
# as a cut-off in both directions (feature > t, feature < t); the rule with the
# highest training MCC is printed as a `this.rules.add(...)` statement.
# Relies on `studied_file` being defined by an earlier cell.
train_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, studied_file))
test_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, studied_file.replace('train', 'test')))

for feature in SELECTED_FEATURES:
    best_threshold = feature  # printed unchanged if no rule gets selected
    best_val = -2             # sentinel below any attainable MCC (range [-1, 1])
    val_val = -2              # test MCC of the selected rule
    best_rule = None          # (rule class name, threshold) of the best rule so far

    # Percentiles 0..100 of the training column. Duplicates are dropped: a
    # repeated threshold scores exactly like its first occurrence and can never
    # win the strict `>` comparison below, so the selected rule is unchanged.
    thresholds = train_data[feature].quantile([i/100 for i in range(0, 101)]).unique()
    for threshold in thresholds:
        # "Greater" is evaluated before "Lesser" so ties resolve as before.
        candidates = (
            ('Greater_than_threshold', train_data[feature] > threshold),
            ('Lesser_than_threshold', train_data[feature] < threshold),
        )
        for rule_name, fires in candidates:
            mcc_train = matthews_corrcoef(y_pred=fires.astype(int), y_true=train_data[TARGET])
            if mcc_train > best_val:
                best_val = mcc_train
                best_rule = (rule_name, threshold)

    if best_rule is not None:
        rule_name, threshold = best_rule
        best_threshold = f'this.rules.add(new {rule_name}("{feature}", -1.0, -1.0, {float(threshold)}, 0));'
        # The test split is scored once, for the selected rule only.
        if rule_name == 'Greater_than_threshold':
            y_val = (test_data[feature] > threshold).astype(int)
        else:
            y_val = (test_data[feature] < threshold).astype(int)
        val_val = matthews_corrcoef(y_pred=y_val, y_true=test_data[TARGET])

    print(best_threshold)
    # best_val / val_val hold the train / test MCC of the printed rule.



In [None]:
# Single-feature threshold search scored by the geometric mean score.
# For each selected feature, every percentile of the training column is tried
# as a cut-off in both directions (feature > t, feature < t); the rule with the
# highest training G-mean is printed as a `this.rules.add(...)` statement.
# Relies on `studied_file` being defined by an earlier cell.
train_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, studied_file))
test_data = pd.read_csv(os.path.join(EXPERIMENT_DATA_PATH, studied_file.replace('train', 'test')))

for feature in SELECTED_FEATURES:
    best_threshold = feature  # printed unchanged if no rule gets selected
    best_val = -2             # sentinel below any attainable G-mean
    val_val = -2              # test G-mean of the selected rule
    best_rule = None          # (rule class name, threshold) of the best rule so far

    # Percentiles 0..100 of the training column. Duplicates are dropped: a
    # repeated threshold scores exactly like its first occurrence and can never
    # win the strict `>` comparison below, so the selected rule is unchanged.
    thresholds = train_data[feature].quantile([i/100 for i in range(0, 101)]).unique()
    for threshold in thresholds:
        # "Greater" is evaluated before "Lesser" so ties resolve as before.
        candidates = (
            ('Greater_than_threshold', train_data[feature] > threshold),
            ('Lesser_than_threshold', train_data[feature] < threshold),
        )
        for rule_name, fires in candidates:
            g_train = geometric_mean_score(y_pred=fires.astype(int), y_true=train_data[TARGET])
            if g_train > best_val:
                best_val = g_train
                best_rule = (rule_name, threshold)

    if best_rule is not None:
        rule_name, threshold = best_rule
        best_threshold = f'this.rules.add(new {rule_name}("{feature}", -1.0, -1.0, {float(threshold)}, 0));'
        # The test split is scored once, for the selected rule only.
        if rule_name == 'Greater_than_threshold':
            y_val = (test_data[feature] > threshold).astype(int)
        else:
            y_val = (test_data[feature] < threshold).astype(int)
        val_val = geometric_mean_score(y_pred=y_val, y_true=test_data[TARGET])

    print(best_threshold)
    # best_val / val_val hold the train / test G-mean of the printed rule.



In [None]:
# Scratch cell: draw 2 elements from a 4-element list with the stdlib sampler.
# NOTE(review): nothing else in the visible notebook uses this result.
import random 
random.sample([1, 2, 3, 4], 2)

In [None]:
# Stack the 'hv' / 'gd' indicator tables of the four baseline CSVs into one
# frame, keeping only the shared columns listed in `cols`.
cols = ['algorithm', 'project_name', 'hv', 'gd']
baseline_dir = 'C:/Users/Motaz/Desktop/work/TSE_R3/hv_gd_baselines'
baseline_files = [
    'MEG_indicators.csv',
    'MOLR_hv_GD.csv',
    'MOPSO_hv_gd_CRDP.csv',
    'ibea_indicators.csv',
]
# Files are read in the listed order so the row order of the result is stable.
hv_gd_all = pd.concat(
    [pd.read_csv(f'{baseline_dir}/{csv_name}')[cols] for csv_name in baseline_files]
)

In [None]:
# Write the combined indicator table to CSV in the current working directory, without the index column.
hv_gd_all.to_csv("all_hv_gd.csv", index=False)

In [None]:
# Print the version of pip used by this kernel's environment.
%pip -V