In [None]:
import pandas as pd
import os
import sys
import pickle
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
import ast
import copy
sys.path.append(".")

In [None]:
# Global configuration shared by the helper functions and the analysis cells below.
# NOTE(review): later cells also reference GP_RESULTS_PATH, ML_RESULTS_PATH and
# ML_MODELS_PATH, none of which are assigned in this cell - confirm where they are set.
SUMMERIES_PATH = "./GP_RESULTS_SUMMERIES"  # directory read by GET_DATA()
METRICS = 'TPR_TNR'  # interpolated into the file name built by GET_DATA()
PROJECT = "rq1_results"  # interpolated into the GET_DATA() file name; also written by add_project_name()
EXP_ID = 'default'  # interpolated into the file name built by GET_DATA()
ML_DATA_PATH = './ML_RES_predcr'  # passed to prepare_ML_DATA() in the RQ2 cell
ORGS = ['Eclipse','Libreoffice','Gerrithub']

In [None]:
#helpers
def GET_DATA() :
    """Read the GP summary CSV selected by the PROJECT, METRICS and EXP_ID globals.

    Returns:
        pandas.DataFrame parsed from ``GP_<PROJECT>_<METRICS>_<EXP_ID>.csv``
        located inside ``SUMMERIES_PATH``.
    """
    summary_file = f'GP_{PROJECT}_{METRICS}_{EXP_ID}.csv'
    summary_path = os.path.join(SUMMERIES_PATH, summary_file)
    return pd.read_csv(summary_path)
#creating the select_subset selection function to select the appropriate data

def get_ML_data(path,project,model_name) : 
    """Load the cross-validation test results of one ML model for one project.

    Args:
        path: directory containing the result CSV files.
        project: project name used as the file-name prefix.
        model_name: model identifier used in the file name.

    Returns:
        pandas.DataFrame parsed from ``<project>_<model_name>_test_result_cross.csv``
        inside ``path``.
    """
    # Use the `path` argument: the previous body read an undefined global
    # (ML_RESULTS_PATH) and silently ignored this parameter.
    results_file = f'{project}_{model_name}_test_result_cross.csv'
    return pd.read_csv(os.path.join(path, results_file))

def prepare_ML_DATA(ML_DATA_PATH) : 
    """Load the per-project ML result files and normalise them with process_ml_data.

    Args:
        ML_DATA_PATH: directory holding ``Eclipse_all_models.csv``,
            ``Libreoffice_all_models.csv`` and ``Gerrithub_all_models.csv``.

    Returns:
        The concatenation of the three files after process_ml_data().
    """
    csv_names = (
        'Eclipse_all_models.csv',
        'Libreoffice_all_models.csv',
        'Gerrithub_all_models.csv',
    )
    per_project_frames = [
        pd.read_csv(os.path.join(ML_DATA_PATH, csv_name)) for csv_name in csv_names
    ]
    return process_ml_data(pd.concat(per_project_frames))

def prepare_GP_results(GP_DATA_PATH) : 
    """Load the GP performance files and bring them to the notebook's common schema.

    Reads ``MULTICR_performance.csv`` and ``other_moeas.csv`` from ``GP_DATA_PATH``,
    renames the metric columns, adds ``precision_M`` / ``f1_M``, rewrites the
    project name "eclipse" to "Eclipse", builds ``file_id`` as
    ``<Capitalised project>_<fold>`` and drops the "best_model_performance" rows.

    Args:
        GP_DATA_PATH: directory containing the two CSV files.

    Returns:
        The prepared pandas.DataFrame.
    """
    source_files = ('MULTICR_performance.csv', 'other_moeas.csv')
    gp_results = pd.concat(
        [pd.read_csv(os.path.join(GP_DATA_PATH, file_name)) for file_name in source_files]
    )

    column_renames = {"tnr": "recall_M", "tpr": "recall_A",'f1': 'f1_A','precision':"precision_A"}
    gp_results = gp_results.rename(columns=column_renames)

    # Metrics are derived after the rename so f1_M can pick up the 'recall_M' column.
    gp_results = add_metric(gp_results,'precision_M',precision_M)
    gp_results = add_metric(gp_results,'f1_M',f1_M)

    is_lowercase_eclipse = gp_results["project_name"] == "eclipse"
    gp_results.loc[is_lowercase_eclipse,'project_name'] = 'Eclipse'

    def build_file_id(row) :
        return captilize_first_letter(row['project_name']) + '_' + str(row['fold'] )

    gp_results['file_id'] = gp_results.apply(build_file_id,axis=1)
    return select_all_but_best(gp_results)

def process_ml_data(ml_data) : 
    """Return a copy of the ML results renamed to the notebook's common schema.

    The metric columns and ``project`` are renamed, ``fold`` is decremented by one,
    and ``file_id`` is built as ``<project_name>_<fold>``. The input frame is not modified.
    """
    column_renames = {
        "recall_m": "recall_M", "recall_a": "recall_A",
        'f1_score_a': 'f1_A','precision_a':"precision_A",
        'f1_score_m' : 'f1_M','precision_m' : 'precision_M',
        'project': 'project_name'
    }
    processed = ml_data.copy().rename(columns=column_renames)
    processed["fold"] = processed['fold'] - 1 

    def build_file_id(row) :
        return row['project_name'] + '_' + str(row['fold'] )

    processed['file_id'] = processed.apply(build_file_id,axis=1)
    return processed 
def select_subset(data,project_name,algorithm_name,file_id =None) : 
    """Select the rows of one project / algorithm, optionally restricted to one file.

    Args:
        data: DataFrame with an ``algorithm`` column and a project column.
        project_name: project to keep.
        algorithm_name: algorithm to keep.
        file_id: when given, also require ``file_id`` to equal this value.

    Returns:
        The matching rows of ``data``.
    """
    # Every frame built in this notebook names the column "project_name"; the
    # misspelt "projet_name" is only kept as a fallback for frames that use it.
    project_column = "project_name" if "project_name" in data.columns else "projet_name"
    mask = (data[project_column] == project_name) & (data["algorithm"] == algorithm_name)
    if file_id is not None :
        mask = mask & (data["file_id"] == file_id)
    return data.loc[mask]

def select_by_model(data,models_ids) : 
    """Keep only the rows whose ``model_id`` is one of ``models_ids``."""
    keep_row = data["model_id"].isin(models_ids)
    return data.loc[keep_row]

def select_by_run(data,runs) : 
    """Keep only the rows whose ``run`` value is contained in ``runs``."""
    keep_row = data["run"].isin(runs)
    return data.loc[keep_row]
    
def select_by_project_list(data,projects) : 
    """Keep only the rows whose project is contained in ``projects``.

    Args:
        data: DataFrame with a project column.
        projects: iterable of project names to keep.

    Returns:
        The matching rows of ``data``.
    """
    # Every frame built in this notebook names the column "project_name"; the
    # misspelt "projet_name" is only kept as a fallback for frames that use it.
    project_column = "project_name" if "project_name" in data.columns else "projet_name"
    return data.loc[data[project_column].isin(projects)]

def select_by_filenames(data,files) : 
    """Keep only the rows whose ``file_id`` is contained in ``files``."""
    keep_row = data["file_id"].isin(files)
    return data.loc[keep_row]

def select_algorithms(data,selected_algos) : 
    """Keep only the rows whose ``algorithm`` is contained in ``selected_algos``."""
    keep_row = data["algorithm"].isin(selected_algos)
    return data.loc[keep_row]

def select_only_best_model(data) : 
    """Keep only the rows whose ``model_id`` equals "best_model_performance"."""
    is_best_row = data["model_id"].eq("best_model_performance")
    return data.loc[is_best_row]

def select_all_but_best(data) : 
    """Drop the rows whose ``model_id`` equals "best_model_performance"."""
    is_regular_row = data["model_id"].ne("best_model_performance")
    return data.loc[is_regular_row]

def select_train_or_test(data,train_or_test) : 
    """Keep only the rows whose ``train_or_test`` column equals the given value."""
    in_requested_split = data["train_or_test"].eq(train_or_test)
    return data.loc[in_requested_split]

def add_metrics_product(data,metrics): 
    """Return a copy of ``data`` with one extra column holding the row-wise product of ``metrics``.

    The new column is named by joining the metric names with "_times_",
    e.g. ``['MCC', 'f1_M']`` gives ``MCC_times_f1_M``.
    """
    augmented = data.copy()
    product_column = "_times_".join([str(metrics[0]), *metrics[1:]])

    def row_product(row) :
        # Start from 1.0 so the result is always a float product.
        product = 1.0 
        for metric in metrics: 
            product = product * row[metric]
        return product

    augmented[product_column] = augmented.apply(row_product, axis=1)
    return augmented 

def add_project_name(data): 
    """Return a copy of ``data`` whose ``project_name`` column is set to the PROJECT global on every row."""
    labelled = data.copy()
    labelled["project_name"] = labelled.apply (lambda _row: PROJECT, axis=1)
    return labelled 

def generate_project_name(data) : 
    """Return a copy of ``data`` with ``project_name`` set to the part of ``file_id`` before the first "_"."""
    def project_of(row) :
        prefix, *_rest = row['file_id'].split('_')
        return prefix

    labelled = data.copy()
    labelled["project_name"] = labelled.apply (project_of, axis=1)
    return labelled 

def add_metric(data,metric_name,metric_func): 
    """Return a copy of ``data`` with column ``metric_name`` computed by calling ``metric_func`` on each row."""
    augmented = data.copy()
    per_row_values = augmented.apply (lambda row: metric_func(row), axis=1)
    augmented[metric_name] = per_row_values
    return augmented 

def find_best_model(data,criteria,max_is_better=True) :
    """Return the ``model_id`` of the row with the best value in column ``criteria``.

    Args:
        data: DataFrame with a ``model_id`` column and the ``criteria`` column.
        criteria: name of the column to optimise.
        max_is_better: pick the maximum when True, the minimum otherwise.
    """
    # Work on a re-indexed copy so the position returned by idxmax/idxmin is a unique label.
    candidates = data.copy().reset_index()
    scores = candidates[criteria]
    best_position = scores.idxmax() if max_is_better else scores.idxmin()
    return candidates.loc[best_position,'model_id']

def apply_selection_criteria(data,criteria,metrics = ['f1','G','MCC','tpr','tnr','precision'],max_is_better = True, verbose = True): 
    """Pick, per (file_id, run, algorithm), the model that is best on train and report its test metrics.

    For every combination of ``file_id``, ``run`` and ``algorithm`` found in ``data``
    the training rows (excluding "best_model_performance") are ranked on
    ``criteria``; the ``metrics`` columns of the matching test row are then
    collected into one output row.

    Args:
        data: DataFrame with ``file_id``, ``run``, ``algorithm``, ``model_id``,
            ``train_or_test``, the ``criteria`` column and the ``metrics`` columns.
        criteria: column used to rank the training rows.
        metrics: columns copied from the selected model's test row (the default
            list is only read, never modified).
        max_is_better: rank by maximum when True, by minimum otherwise.
        verbose: print the progress/debug output when True (the default, as before).

    Returns:
        DataFrame with one row per combination for which a model could be
        selected. Combinations with no training rows, or whose selected model
        has no test row, are skipped.
    """
    file_ids = data['file_id'].unique()
    runs = data["run"].unique()
    algos = data['algorithm'].unique()
    train_data =  select_train_or_test(data,"train")
    test_data =  select_train_or_test(data,"test")
    if verbose :
        print(file_ids)

    def subset_for(split_data, file_id, algo, run) :
        # Rows of one split for a single (file, algorithm, run) combination.
        return select_all_but_best(select_by_run(select_algorithms(select_by_filenames(split_data,[file_id]),[algo]),[run]))

    result = []
    for file_id in file_ids:
        for run in runs :
            for algo in algos: 
                file_id_train = subset_for(train_data, file_id, algo, run)
                if file_id_train.empty :
                    # Not every algorithm was run for every file/run combination.
                    continue
                file_id_test = subset_for(test_data, file_id, algo, run)
                try:
                    best_model = find_best_model(file_id_train,criteria,max_is_better)
                except Exception as error :
                    # `Exception` rather than a bare except, so KeyboardInterrupt
                    # can still stop this long-running loop.
                    if verbose :
                        print(f'skipping {file_id} run {run} {algo}: {error!r}')
                    continue 
                selected = file_id_test[file_id_test['model_id'] == best_model][metrics]
                if verbose :
                    print(best_model)
                    print("=====================================================================================================")
                    print(selected)
                    print('len selected:',len(selected))
                    print("=====================================================================================================")
                if selected.empty :
                    print(f'no test row for model {best_model} ({file_id}, run {run}, {algo}); skipped')
                    continue
                # head(1) keeps the first match only, so to_dict(orient='index')
                # never sees duplicated index labels (concatenated frames repeat them).
                model_performance = next(iter(selected.head(1).to_dict(orient = 'index').values()))
                if verbose :
                    print(model_performance)
                new_row = {
                    'file_id':file_id,
                    'run' : run,
                    'algorithm' : algo,
                    'model_id':best_model
                }
                new_row.update(model_performance)
                result.append(new_row)
    return pd.DataFrame(result)

#new_metrics 
def precision_M(row) :
    """Compute ``tn / (tn + fn)`` from a row; returns 1 when ``tn + fn`` is zero."""
    denominator = row['tn'] + row['fn']
    return 1 if denominator == 0 else row['tn']/denominator

def f1_M(row) : 
    """Compute the harmonic mean of ``precision_M`` and ``recall_M`` for one row.

    Precision is read from ``precision_M`` when present, otherwise derived from
    ``tn`` and ``fn`` using the same zero-denominator convention as precision_M()
    (returns 1). Recall is read from ``tnr``, then ``recall_M``, otherwise
    derived as ``tn / (tn + fp)``.

    Args:
        row: mapping-like object (pandas row or dict) holding the fields above.

    Returns:
        ``2 * P * R / (P + R)``, or 0.0 when both precision and recall are 0.
    """
    if 'precision_M' in row: 
        prec_M = row['precision_M']
    else :
        denominator = row['tn'] + row['fn']
        # Same convention as precision_M(): an empty denominator counts as precision 1.
        prec_M = 1 if denominator == 0 else row['tn']/denominator
    
    if 'tnr' in row: 
        rec_M = row['tnr']
    elif 'recall_M' in row : 
        rec_M = row['recall_M']
    else:
        rec_M = row['tn']/(row['tn'] + row['fp'])
    
    if prec_M + rec_M == 0 :
        # Precision and recall both zero is the worst case; the previous
        # `return 1` reported it as a perfect score.
        return 0.0
    return 2*prec_M*rec_M/(prec_M + rec_M)

def captilize_first_letter(word) :
    """Return ``word`` with its first character upper-cased and the rest unchanged.

    An empty string is returned as-is (indexing ``word[0]`` used to raise IndexError).
    """
    return word[:1].upper() + word[1:]

## RQ2: Within-project validation

In [None]:
# RQ2 driver: load GP and ML results, align their columns, and select one model
# per (file_id, run, algorithm) using the train-set MCC * f1_M product.
# NOTE(review): GP_RESULTS_PATH is not assigned anywhere in the cells shown above -
# confirm where it is defined, otherwise this cell fails on a fresh kernel.
GP_DATA = prepare_GP_results(GP_RESULTS_PATH)
ML_DATA = prepare_ML_DATA(ML_DATA_PATH)
print(GP_DATA.columns)
#print(ML_DATA.columns)
#print(ML_DATA['file_id'].unique())
print(GP_DATA['file_id'].unique())
# Columns shared by both result sets; only these are kept in the combined frame.
STUDIED_COLUMNS = [ 'project_name','file_id','fold','run','algorithm','model_id','train_or_test','MCC','f1_A','f1_M',  'recall_A', 'recall_M',  'precision_A','precision_M']
# The concatenation keeps each frame's own index, so index labels may repeat.
FINAL_DATA = pd.concat([
    GP_DATA[STUDIED_COLUMNS], ML_DATA[STUDIED_COLUMNS]
])
# Adds the 'MCC_times_f1_M' and 'MCC_times_f1_A' columns.
FINAL_DATA = add_metrics_product(FINAL_DATA,['MCC','f1_M'])
FINAL_DATA = add_metrics_product(FINAL_DATA,['MCC','f1_A'])

# FINAL_RESULT is reused by the "Models complexity" section further down.
FINAL_RESULT = apply_selection_criteria(FINAL_DATA,'MCC_times_f1_M',metrics = ['project_name','MCC','f1_M','f1_A', 'recall_A', 'recall_M',  'precision_A','precision_M'])


In [None]:
# Alternative selection using the MCC * f1_A product.
# NOTE(review): `new_data` is not defined in any cell shown above, and the 'G'
# metric requested below is not part of STUDIED_COLUMNS - this cell looks like a
# leftover that depends on state from a deleted cell; confirm before relying on it.
MCC_F1_G = add_metrics_product(new_data,['MCC','f1_A'])
res = apply_selection_criteria(MCC_F1_G,'MCC_times_f1_A',metrics = ['MCC','G','f1_M','precision_M','recall_M','f1_A','precision_A','recall_A'])
# Derives project_name from the prefix of each file_id.
res = generate_project_name(res)

## RQ3: Cross-project validation

In [None]:
# Locations of the cross-project result files (scenario 1: one source project,
# scenario 2: all projects as source).
# The GP directory used a backslash ('GP_summeries\GP_RESULTS'), which is an
# invalid escape sequence and is not a path separator on POSIX; forward slashes
# work on every platform.
GP_CP_RESULTS_PATh = '../GP_summeries/GP_RESULTS/cross_project'
ML_CP_RESULTS_PATh = '../GP_summeries/ML_RESULTS/cross_project'
ML_CP_1_RESULTS_PATh = os.path.join(ML_CP_RESULTS_PATh,'ML_scenario_1_results.csv')
ML_CP_2_RESULTS_PATh = os.path.join(ML_CP_RESULTS_PATh,'ML_scenario_2_results.csv')
GP_CP_1_RESULTS_PATh = os.path.join(GP_CP_RESULTS_PATh,'cp_scenario_1_multicr.csv')
GP_CP_2_RESULTS_PATh = os.path.join(GP_CP_RESULTS_PATh,'cp_scenario_2_multicr.csv')

In [None]:
def prepare_cp_1_ML_DATA(ML_DATA_PATH) : 
    """Read the scenario-1 ML results and add ``project_name`` as ``<Source>_<Target>``.

    Args:
        ML_DATA_PATH: path of the CSV file to read.
    """
    cp_results = pd.read_csv(ML_DATA_PATH)

    def source_target_label(row) :
        return row['Source'] + "_" + row['Target']

    cp_results['project_name'] = cp_results.apply(source_target_label,axis = 1)
    return cp_results

def prepare_cp_1_GP_results(GP_DATA_PATH) : 
    """Read the scenario-1 GP results and bring them to the notebook's common schema.

    Renames the metric columns, adds ``precision_M``, ``f1_M`` and
    ``MCC_times_f1_M``, and sets ``project_name`` to ``<Source>_<Target>``.

    Args:
        GP_DATA_PATH: path of the CSV file to read.
    """
    column_renames = {"tnr": "recall_M", "tpr": "recall_A",'f1': 'f1_A','precision':"precision_A"}
    cp_results = pd.read_csv(GP_DATA_PATH).rename(columns=column_renames)

    # Derived metrics, in the same order as the other prepare_* helpers.
    cp_results = add_metric(cp_results,'precision_M',precision_M)
    cp_results = add_metric(cp_results,'f1_M',f1_M)
    cp_results = add_metrics_product(cp_results,['MCC','f1_M'])

    def source_target_label(row) :
        return row['Source'] + "_" + row['Target']

    cp_results['project_name'] = cp_results.apply(source_target_label,axis = 1 )
    return cp_results

def prepare_cp_2_ML_DATA(ML_DATA_PATH) : 
    """Read the scenario-2 ML results CSV at ``ML_DATA_PATH`` and return it unchanged."""
    return pd.read_csv(ML_DATA_PATH)

def prepare_cp_2_GP_results(GP_DATA_PATH) : 
    """Read the scenario-2 GP results and bring them to the notebook's common schema.

    Renames the metric columns and adds ``precision_M``, ``f1_M`` and
    ``MCC_times_f1_M``.

    Args:
        GP_DATA_PATH: path of the CSV file to read.
    """
    column_renames = {"tnr": "recall_M", "tpr": "recall_A",'f1': 'f1_A','precision':"precision_A"}
    cp_results = pd.read_csv(GP_DATA_PATH).rename(columns=column_renames)

    for metric_name, metric_func in (('precision_M', precision_M), ('f1_M', f1_M)) :
        cp_results = add_metric(cp_results,metric_name,metric_func)

    return add_metrics_product(cp_results,['MCC','f1_M'])

### Cross project scenario 1: One project source One target 

In [None]:
# Scenario 1 driver: only the GP results are processed here; the ML side is
# commented out, so all_cp1_data below contains the GP selection only.
#ml_cp1_data = prepare_cp_1_ML_DATA(ML_CP_1_RESULTS_PATh)
#print(ml_cp1_data.columns)
gp_cp1_data = prepare_cp_1_GP_results(GP_CP_1_RESULTS_PATh)
print(gp_cp1_data.columns)
# NOTE(review): apply_selection_criteria reads 'file_id' and 'train_or_test'
# columns - presumably present in the scenario-1 CSV; verify.
# 'run', 'algorithm' and 'model_id' also appear in `metrics`, so the values from
# the selected test row overwrite the ones set by the selection loop.
gp_cp1_data_selected = apply_selection_criteria(gp_cp1_data,'MCC_times_f1_M',metrics = ['Source', 'Target', 'run', 'algorithm', 'MCC', 'f1_A', 'f1_M',
       'model_id','project_name'])
# Prints the whole unselected frame.
print(gp_cp1_data)
all_cp1_data = pd.concat([gp_cp1_data_selected[['Source', 'Target', 'run', 'algorithm', 'MCC', 'f1_A', 'f1_M',
       'model_id','project_name']]])

### Cross project scenario 2: All projects source One target 

In [None]:
# Scenario 2 driver: load ML and GP cross-project results, select one GP model
# per (file_id, run, algorithm) on the train-set MCC * f1_M product, and stack
# both result sets on their shared columns.
ml_cp2_data = prepare_cp_2_ML_DATA(ML_CP_2_RESULTS_PATh)
print(ml_cp2_data.columns)
gp_cp2_data = prepare_cp_2_GP_results(GP_CP_2_RESULTS_PATh)
print(gp_cp2_data.columns)
# 'project_name' used to be listed twice, which produced duplicated columns in
# the intermediate selection; each metric is now requested once.
gp_cp2_data_selected = apply_selection_criteria(gp_cp2_data,'MCC_times_f1_M',metrics = ['project_name','run', 'algorithm', 'MCC', 'f1_A', 'f1_M',
       'model_id'])
cp2_columns = ['project_name', 'algorithm', 'run', 'model_id', 'MCC', 'f1_M', 'f1_A']
all_cp2_data = pd.concat([ml_cp2_data[cp2_columns],gp_cp2_data_selected[cp2_columns]])

## Models complexity 

In [None]:
import networkx as nx
import pydot

In [None]:
def load_cv_data(path,org,run,model_name) : 
    """Unpickle the cross-validation object saved for one project / model / run.

    Args:
        path: root directory of the saved models.
        org: project name (sub-directory and file-name prefix).
        run: run number used in the file name.
        model_name: model identifier (sub-directory and file-name part).

    Returns:
        The object stored in
        ``<path>/<org>/<model_name>/<org>_<model_name>_cross_val_run_<run>.pk``.
    """
    pickle_path = os.path.join(path,org,model_name,f'{org}_{model_name}_cross_val_run_{run}.pk')
    # SECURITY: pickle.load executes arbitrary code from the file; only use it on
    # files produced by this project.
    # The context manager closes the handle, which the bare open() call leaked.
    with open(pickle_path,'rb') as pickle_file :
        return pickle.load(pickle_file)

def compute_DT_max_depth(dt_model) : 
    """Return ``max_depth`` of the model's ``tree_`` attribute."""
    fitted_tree = dt_model.tree_
    return fitted_tree.max_depth 

def nodes_count(dt_model) : 
    """Return ``node_count`` of the model's ``tree_`` attribute."""
    fitted_tree = dt_model.tree_
    return fitted_tree.node_count

def compute_DT_model_complexity(dt_model, internal_node_complexity = 3, leaf_node_complexity = 1) :
    """Sum a per-node weight over every node of the model's ``tree_``.

    A node whose left and right child ids differ is counted as a split node and
    weighs ``internal_node_complexity``; any other node weighs
    ``leaf_node_complexity``.
    """
    fitted_tree = dt_model.tree_
    left_children = fitted_tree.children_left
    right_children = fitted_tree.children_right
    return sum(
        internal_node_complexity if left_children[node_id] != right_children[node_id] else leaf_node_complexity
        for node_id in range(fitted_tree.node_count)
    )

def compute_dt_models_complexities(orgs,runs,data_path,folds=range(10)) : 
    """Build one complexity row per (project, fold, run) for the saved DT models.

    Args:
        orgs: project names to process.
        runs: run numbers to process.
        data_path: root directory handed to load_cv_data().
        folds: fold numbers recorded in the output rows.

    Returns:
        DataFrame with ``project_name``, ``fold``, ``run``, ``algorithm``,
        ``complexity``, ``max_depth`` and ``nodes_count``. Combinations whose
        pickle cannot be loaded are reported on stdout and skipped.
    """
    complexity_rows = []
    for org in orgs: 
        for fold in folds: 
            for run in runs: 
                # NOTE(review): `fold` is not passed to load_cv_data, so the same
                # object is loaded for every fold of a given (org, run) - confirm
                # this is intended.
                try: 
                    cv_search = load_cv_data(data_path,org,run,'DT')
                except Exception as e:
                    print(e) 
                    print(f'loading data problem with project {org} run {run} model DT')
                    continue
                complexity_rows.append({
                    'project_name' : org,
                    'fold' : fold,
                    'run' : run,
                    'algorithm' : "DT",
                    'complexity' : compute_DT_model_complexity(cv_search.best_estimator_),
                    'max_depth' : compute_DT_max_depth(cv_search.best_estimator_),
                    'nodes_count' : nodes_count(cv_search.best_estimator_),
                })
    return pd.DataFrame(complexity_rows)

def compute_GP_rule_complexity(rule,pondiration = {'Or' : 3, 'Xor' : 3, 'And' : 3, "Terminal" : 1}) : 
    """Sum a weight over the ';'-separated segments of a rule string.

    A segment containing ``[label = "Xor" ]`` or ``[label = "Or" ]`` gets the
    matching operator weight, a segment containing the text ``And`` gets the
    'And' weight, and every other segment gets the 'Terminal' weight.

    Args:
        rule: rule text whose statements are separated by ';'.
        pondiration: weights keyed by 'Or', 'Xor', 'And' and 'Terminal'.
    """
    def segment_weight(node_label) :
        if '[label = "Xor" ]' in node_label: 
            return pondiration['Xor']
        if '[label = "Or" ]' in node_label: 
            return pondiration['Or']
        if 'And' in node_label: 
            return pondiration['And']
        return pondiration['Terminal']

    return sum(segment_weight(node_label) for node_label in rule.split(';'))
        
def max_depth_rule(rule) :
    """Return ``int(log2(number of nodes))`` of the graph described by ``rule``.

    The rule text is wrapped in ``digraph { ... }``, parsed with pydot and
    converted to a networkx DiGraph; the returned value is derived from the node
    count only, not from a traversal of the graph.
    """
    print('new_rule')
    dot_graph = pydot.graph_from_dot_data('digraph {' + rule + ' }')[0]
    rule_graph = nx.DiGraph(nx.nx_pydot.from_pydot(dot_graph))
    estimated_depth = int(np.log2(len(rule_graph)))
    print('looking for max depth')
    return estimated_depth

def rule_nodes_count(rule) : 
    """Return the number of nodes of the graph described by ``rule``.

    The rule text is wrapped in ``digraph { ... }``, parsed with pydot and
    converted to a networkx DiGraph.
    """
    print('new_rule depth')
    dot_graph = pydot.graph_from_dot_data('digraph {' + rule + ' }')[0]
    rule_graph = nx.DiGraph(nx.nx_pydot.from_pydot(dot_graph))
    return len(rule_graph.nodes)




In [None]:
# Compare the complexity of the selected MULTICR ('ibea') rules with the DT models.
multicr_rules = pd.read_csv('MULTICR_rules.csv')
multicr_rules.loc[multicr_rules["project_name"] == "eclipse",'project_name'] = 'Eclipse'

# FINAL_RESULT has no 'fold' column when it comes out of apply_selection_criteria;
# derive it from file_id ("<project>_<fold>") BEFORE it is selected below. It used
# to be created further down, so this cell failed with a KeyError on its first run.
FINAL_RESULT['fold'] = FINAL_RESULT.apply(lambda row: int(row['file_id'].split('_')[1]),axis=1)

multicr_selected_rules = FINAL_RESULT[FINAL_RESULT['algorithm'] == 'ibea'][['project_name','algorithm','fold','run','model_id']]
multicr_selected_rules_complexities = pd.merge(multicr_selected_rules,multicr_rules,on=['project_name', 'algorithm', 'fold', 'run', 'model_id'],how='inner')
print(len(multicr_selected_rules_complexities))
print(multicr_selected_rules_complexities)
multicr_selected_rules_complexities['complexity'] = multicr_selected_rules_complexities.apply(lambda row: compute_GP_rule_complexity(rule = str(row["rule"])),axis=1)
multicr_selected_rules_complexities['nodes_count'] = multicr_selected_rules_complexities.apply(lambda row: rule_nodes_count(rule = str(row["rule"])),axis=1)
multicr_selected_rules_complexities['max_depth'] = multicr_selected_rules_complexities.apply(lambda row: max_depth_rule(rule = str(row["rule"])),axis=1)

print(multicr_selected_rules_complexities.head())
# NOTE(review): ML_MODELS_PATH is not assigned in any cell shown above - confirm
# where it is defined.
DT_complexity=compute_dt_models_complexities(['Eclipse','Libreoffice','Gerrithub'],range(5), ML_MODELS_PATH)
MULTICR_DT_COMPLEXITIES_ALL = pd.concat([DT_complexity[['project_name', 'fold', 'run', 'algorithm', 'complexity', 'max_depth', 'nodes_count']],
                                          multicr_selected_rules_complexities[['project_name', 'fold', 'run', 'algorithm', 'complexity', 'max_depth', 'nodes_count']]
                                                        ])

## RQ5: Rules analysis

In [None]:
import sys 
sys.path.append('.')
import graphviz
import networkx as nx
import pydot
from sympy import symbols
from sympy import *
from sympy.logic.boolalg import to_cnf
import copy
import lime
import lime.lime_tabular
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from pathos.multiprocessing import ProcessingPool as Pool
from explainer import * 

In [None]:
# Configuration for the rules analysis.
# NOTE: TOP_N and FEATURES are used as default argument values by the helper
# functions in the next cell, so this cell must be executed before that one.
TOP_N = 5  # default number of models kept by select_top_n / select_topn_per_run
# Feature names counted by extract_features_data.
FEATURES = ['author_experience','author_merge_ratio', 'author_changes_per_week',
       'author_merge_ratio_in_project', 'total_change_num',
       'author_review_num', 'description_length', 'is_documentation',
       'is_bug_fixing', 'is_feature', 'project_changes_per_week',
       'project_merge_ratio', 'changes_per_author', 'num_of_reviewers',
       'num_of_bot_reviewers', 'avg_reviewer_experience',
       'avg_reviewer_review_count', 'lines_added', 'lines_deleted',
       'files_added', 'files_deleted', 'files_modified', 'num_of_directory',
       'modify_entropy', 'subsystem_num'
            #, 'text_prob_ngram'
           ]
TARGET = 'status'  # NOTE(review): not referenced in the cells shown here

In [1]:

def extract_features_data(topn_models_per_run, studied_features= FEATURES) : 
    """Collect, per project and per file_id, how the selected rules use each feature.

    Side effect: a ``Terminals`` column (the terminals of each row's ``rule``) is
    written into ``topn_models_per_run``.

    Args:
        topn_models_per_run: DataFrame with ``file_id``, ``run``, ``project_name``,
            ``model_id`` and ``rule`` columns.
        studied_features: feature names to track; a terminal whose feature is not
            in this list raises KeyError.

    Returns:
        Tuple ``(frequency_per_projects, frequency_per_file_id,
        occurence_per_projects, occurence_per_file_id)``. The frequency dicts map
        feature -> list of ``(operator, threshold)`` pairs; the occurrence dicts
        map feature -> number of rules that mention the feature at least once.
    """
    file_ids = topn_models_per_run['file_id'].unique()
    runs = topn_models_per_run['run'].unique()
    topn_models_per_run['Terminals'] = topn_models_per_run.apply(lambda row : extract_terminals(row['rule']),axis = 1 )
    projects = topn_models_per_run['project_name'].unique()
    print(projects)

    def zero_counters() :
        return {feature: 0 for feature in studied_features}

    def empty_buckets() :
        return {feature: [] for feature in studied_features}

    occurence_per_projects = {project : zero_counters() for project in projects}
    occurence_per_file_id = {file : zero_counters() for file in file_ids}
    frequency_per_projects = {project : empty_buckets() for project in projects}
    frequency_per_file_id = {file : empty_buckets() for file in file_ids}

    for file_id in file_ids: 
        for run in runs :
            selected_models = select_by_run(select_by_filenames(topn_models_per_run,[file_id]),[run])
            
            for _, row in selected_models.iterrows() :
                project = row['project_name']
                # Features already counted for this rule: a feature adds at most
                # one occurrence per rule, however many terminals use it.
                counted_features = set()
                for terminal in ast.literal_eval(str(row['Terminals'])): 
                    feature_name, op, threshold_value = parse_feature_name(terminal)
                    frequency_per_projects[project][feature_name].append((op, threshold_value))
                    frequency_per_file_id[file_id][feature_name].append((op, threshold_value))
                    if feature_name in counted_features : 
                        print(file_id, row, row['model_id'], feature_name, 'already encounted')
                        continue
                    counted_features.add(feature_name)
                    occurence_per_projects[project][feature_name] += 1 
                    occurence_per_file_id[file_id][feature_name] += 1 
    return frequency_per_projects, frequency_per_file_id, occurence_per_projects, occurence_per_file_id

def select_topn_per_run(df,rules_df,criteria = 'MCC_times_f1_M',topn=TOP_N): 
    """Return the rules of the ``topn`` best models of every (file_id, run) pair.

    Models are ranked on ``criteria`` using the TRAIN rows of ``df``
    ("best_model_performance" rows excluded); the rows of ``rules_df`` belonging
    to the selected models are then collected.

    Args:
        df: performance DataFrame with ``file_id``, ``run``, ``model_id``,
            ``train_or_test`` and the ``criteria`` column.
        rules_df: DataFrame with ``file_id``, ``run`` and ``model_id`` columns.
        criteria: ranking criteria forwarded to select_top_n().
        topn: number of models kept per (file_id, run).

    Returns:
        Concatenation of the selected ``rules_df`` rows (an empty slice of
        ``rules_df`` when ``df`` has no file/run combination).
    """
    file_ids = df['file_id'].unique()
    print(file_ids)
    print(rules_df['file_id'].unique())
    runs = df['run'].unique()
    # The ranking set used to be filtered with "test" although it is named and
    # used as training data (apply_selection_criteria ranks on "train"), which
    # selected models on the data they are later evaluated on.
    train_data =  select_train_or_test(df,"train")
    all_selected_rules=[]
    for file_id in file_ids: 
        for run in runs : 
            file_id_train = select_all_but_best(select_by_run(select_by_filenames(train_data,[file_id]),[run]))
            top_n_models = list(select_top_n(file_id_train,criteria=criteria, n=topn)['model_id'])
            selected_rules = select_by_model(select_by_run(select_by_filenames(rules_df,[file_id]),[run]),top_n_models)
            all_selected_rules.append(selected_rules)
    if not all_selected_rules :
        # pd.concat raises on an empty list; return an empty frame with the same columns.
        return rules_df.iloc[0:0]
    return pd.concat(all_selected_rules)

def select_top_n(data,criteria,n=TOP_N) : 
    """Return the ``n`` rows of ``data`` with the largest ``criteria`` value.

    Args:
        data: DataFrame to rank; it is left untouched (the previous version added
            a ``selection_criteria`` column to the caller's frame and its
            ``drop`` call discarded the result).
        criteria: either a column name, or a callable applied to each row that
            returns the score.
        n: number of rows to return.

    Returns:
        A new DataFrame holding the top ``n`` rows plus a ``selection_criteria``
        column with the score used for ranking.
    """
    scored = data.copy()
    if callable(criteria) :
        if len(scored) :
            scores = scored.apply(criteria,axis=1)
        else :
            # apply() on an empty frame does not return a per-row Series.
            scores = pd.Series(index=scored.index, dtype=float)
    else :
        # A column name: read the column directly instead of relying on
        # DataFrame.apply resolving the string as an attribute.
        scores = scored[criteria]
    scored["selection_criteria"] = scores
    return scored.nlargest(n,['selection_criteria'])

def extract_terminals(rule_str) : 
    """Return the labels of all nodes of a rule that are not "Or", "Xor" or "And".

    The rule text is wrapped in ``digraph { ... }``, parsed with pydot and
    converted to a networkx DiGraph; double quotes are stripped from the labels.
    """
    dot_graph = pydot.graph_from_dot_data('digraph {' + rule_str + ' }')[0]
    G = nx.DiGraph(nx.nx_pydot.from_pydot(dot_graph))
    operator_labels = ("Or","Xor","And")
    node_labels = (
        str(G._node[str(node)]['label'].replace('"',''))
        for node in G.nodes
    )
    return [label for label in node_labels if label not in operator_labels]

def parse_feature_name(terminal) : 
    """Split a terminal such as ``"lines_added >= 3"`` on single spaces.

    Returns:
        ``(feature_name, operator, threshold)`` with the threshold converted to float.
    """
    tokens = terminal.split(' ')
    feature_name, operator, threshold = tokens[0], tokens[1], tokens[2]
    return str(feature_name), str(operator), float(threshold) 
class RuleWarpper: 
    """Wrap a GP rule given as DOT statements and expose it as sympy boolean expressions.

    On construction the rule is simplified (simplfy_dot_rule), its terminals are
    replaced by symbols X0, X1, ... (rename_terminals_dot_rule), the graph is
    turned into a boolean expression string (from_dot_to_sympy), and both the
    expression and its negation are converted to DNF with sympy's ``to_dnf``.
    """

    def __init__(self,rule_dot_str) -> None:
        """Build every representation of the rule; prints both DNF forms and the symbol map.

        Args:
            rule_dot_str: DOT statements of the rule, without the enclosing ``digraph { }``.
        """
        self.rule_dot_str = rule_dot_str
        self.cleaned_dot_rule = RuleWarpper.simplfy_dot_rule(rule_dot_str)
        self.renamed_rule, self.rename_dict=RuleWarpper.rename_terminals_dot_rule(self.cleaned_dot_rule)
        self.sympy_rule_str = RuleWarpper.from_dot_to_sympy(self.renamed_rule)
        self.symbols_list = [symbol for symbol in self.rename_dict]
        self.sympy_negatition_rule_str = '~ (' + self.sympy_rule_str +')'
        self.sympy_rule_sympified = to_dnf(self.sympy_rule_str,simplify=True,force=True)
        print(self.sympy_rule_sympified)
        self.sympy_negatition_rule_sympified = to_dnf(self.sympy_negatition_rule_str,simplify=True,force=True)
        print(self.sympy_negatition_rule_sympified)
        print(self.rename_dict)
    
    def explain_instance(self,instance_to_explain): 
        """Explain the rule's outcome for one instance.

        The DNF of the rule is split on '|' and every clause that evaluates to
        true for the instance is collected; if none does, the same is done with
        the DNF of the negated rule.

        Args:
            instance_to_explain: mapping from feature name to value.

        Returns:
            ``(True, clauses)`` when a clause of the rule holds, ``(False, clauses)``
            when a clause of the negated rule holds, ``None`` when neither does.
            ``clauses`` is the space-separated text of the satisfied clauses.
        """
        candidates = (
            (True, self.sympy_rule_sympified),
            (False, self.sympy_negatition_rule_sympified),
        )
        for prediction, dnf_expression in candidates :
            explanation = '' 
            for literal in str(dnf_expression).split('|'): 
                literal_sympy = sympify(literal)
                if RuleWarpper.evaluate_sympy_rule(literal_sympy,self.rename_dict,instance_to_explain) : 
                    explanation += literal + ' '
            if len(explanation) > 0 : 
                return prediction, explanation
        return None

    @staticmethod
    def evaluate_sympy_rule(sympy_rule,rename_dict,features_values) : 
        """Substitute every symbol of ``rename_dict`` by the truth value of its terminal.

        Args:
            sympy_rule: sympy expression over the symbols named in ``rename_dict``.
            rename_dict: mapping symbol name -> terminal expression text.
            features_values: mapping feature name -> value.

        Returns:
            The result of ``sympy_rule.subs(...)``.
        """
        terminal_values = {variable: RuleWarpper.evaluate_terminal( rename_dict[variable],env=features_values) for variable in rename_dict}
        terminals = symbols([var for var in rename_dict])
        return sympy_rule.subs({var: terminal_values[str(var)] for var in terminals})

    @staticmethod
    def evaluate_terminal(terminal_expression, env) : 
        """Evaluate a terminal of the form ``"<feature> <op> <threshold>"`` against ``env``.

        Supported operators are '=', '>=' and '<='; any other operator yields None.
        """
        terminal_name, op, threshold_value = terminal_expression.split(' ')
        if op == '=':
            return env[terminal_name] == float(threshold_value)
        
        if op == '>=' : 
            return env[terminal_name] >= float(threshold_value)
        
        if op == '<=' : 
            return env[terminal_name] <= float(threshold_value)

        return None
        
    @staticmethod
    def from_dot_to_sympy(rule_dot) : 
        """Translate the DOT rule into a boolean expression string, starting at the first node with no incoming edge."""
        graphs = pydot.graph_from_dot_data('digraph {' + rule_dot + ' }')
        graph = graphs[0]
        G = nx.DiGraph(nx.nx_pydot.from_pydot(graph))
        tree_root = [n for n,d in G.in_degree() if d==0][0]
        return RuleWarpper.build_expression_from_graph(G, tree_root)

    @staticmethod
    def build_expression_from_graph(G,node_id) : 
        """Recursively render the sub-tree rooted at ``node_id`` as an expression string.

        "Xor" is expanded to ``(a & ~(b)) | (b & ~(a))``, "Or" to ``(a | b)`` and
        "And" to ``(a & b)``, using the first two successors of the node; any other
        label is returned as-is.
        """
        label = G._node[str(node_id)]['label'].replace('"','')
        successors = G.successors(str(node_id))
        if label == 'Xor' : 
            first_element = next(successors)
            second_element = next(successors)
            return f'({RuleWarpper.build_expression_from_graph(G, first_element)} & ~({RuleWarpper.build_expression_from_graph(G, second_element)})) | ({RuleWarpper.build_expression_from_graph(G, second_element)} & ~({RuleWarpper.build_expression_from_graph(G, first_element)})) '
        
        if label == 'Or' :
             return f'({RuleWarpper.build_expression_from_graph(G,next(successors))} | {RuleWarpper.build_expression_from_graph(G,next(successors))})'
        
        if label == 'And' :
             return f'({RuleWarpper.build_expression_from_graph(G,next(successors))} & {RuleWarpper.build_expression_from_graph(G,next(successors))})'
        
        return str(label)
            
    @staticmethod
    def rename_terminals_dot_rule(rule_str) : 
        """Replace every terminal of the DOT rule by a symbol X0, X1, ...

        Returns:
            ``(renamed_rule, terminal_rename)`` where ``terminal_rename`` maps each
            symbol name to the terminal text it stands for.
        """
        rule_terminals = RuleWarpper.extract_terminals_from_dot(rule_str)
        renamed_rule, terminal_rename = RuleWarpper._rename_terminals(rule_str, rule_terminals)
        print(list(terminal_rename.values()))
        return renamed_rule, terminal_rename 

    @staticmethod
    def _rename_terminals(rule_str, terminals) :
        """Substitute each distinct terminal text in ``rule_str`` by X<i>.

        Terminals are processed longest first (ties broken alphabetically) so that
        a terminal which is a substring of another one (e.g. "a >= 1" inside
        "a >= 10") cannot corrupt the longer one, and so that the symbol numbering
        is the same on every run (iterating a plain set is not).
        """
        ordered_terminals = sorted(set(terminals), key=lambda terminal: (-len(terminal), terminal))
        terminal_rename = dict()
        renamed_rule = rule_str
        for terminal_index, terminal in enumerate(ordered_terminals) : 
            symbol_name = 'X'+str(terminal_index)
            terminal_rename[symbol_name] = terminal
            renamed_rule = renamed_rule.replace(terminal, symbol_name)
        return renamed_rule, terminal_rename

    @staticmethod
    def extract_terminals_from_dot(rule_dot) : 
        """Return the labels of all nodes that are not an operator or a True/False constant."""
        graphs = pydot.graph_from_dot_data('digraph {' + rule_dot + ' }')
        graph = graphs[0]
        G = nx.DiGraph(nx.nx_pydot.from_pydot(graph))
        terminals = []
        for node in G.nodes: 
            if str(G._node[str(node)]['label'].replace('"','')) in ["Or","Xor","And","True","False"]:
                continue 
            terminals.append(str(G._node[str(node)]['label']).replace('"',''))
        return terminals 
    
    @staticmethod
    def simplfy_dot_rule(rule_dot,non_negative_features = ['num_of_reviewers'], binary_features=['is_documentation','is_bug_fixing', 'is_feature']): 
        """Rewrite trivially true or degenerate comparisons in the DOT rule text.

        For non-negative features ``<= 0`` becomes ``= 0``. For binary features
        ``>= 0`` and ``<= 1`` become ``True``, ``<= 0`` becomes ``= 0`` and ``>= 1``
        becomes ``= 1``.

        NOTE(review): the substitutions are plain substring replacements, so a
        threshold that merely starts with the same digit (e.g. ``>= 0.5``) is also
        matched - presumably thresholds for these features are always written as
        bare ``0`` / ``1``; verify against the rule files.
        """
        simplified_rule = rule_dot
        for non_negative_feature in non_negative_features:
            simplified_rule = simplified_rule.replace(f'{non_negative_feature} <= 0',f'{non_negative_feature} = 0')

        for binary_feature in binary_features: 
            simplified_rule = simplified_rule.replace(f'{binary_feature} >= 0', 'True')
            simplified_rule = simplified_rule.replace(f'{binary_feature} <= 0', f'{binary_feature} = 0')
            simplified_rule = simplified_rule.replace(f'{binary_feature} <= 1', 'True')
            simplified_rule = simplified_rule.replace(f'{binary_feature} >= 1', f'{binary_feature} = 1')
        return simplified_rule

NameError: name 'FEATURES' is not defined

In [None]:
# RQ5 driver: keep the 'ibea' rows of the GP results and of the rules file,
# select the top rules per (file_id, run) and count how they use each feature.
multicr_rules = pd.read_csv('MULTICR_rules.csv')
# NOTE(review): GP_RESULTS_PATH is not assigned in any cell shown above - confirm
# where it is defined.
GP_DATA = prepare_GP_results(GP_RESULTS_PATH)
print(GP_DATA['project_name'].unique())
print(GP_DATA.columns)
# Rebinds STUDIED_COLUMNS to a shorter list than the one used in the RQ2 cell.
STUDIED_COLUMNS = [ 'project_name','file_id','fold','run','algorithm','model_id','train_or_test','MCC','f1_A','f1_M']
multicr_data = GP_DATA[GP_DATA["algorithm"] == 'ibea'][STUDIED_COLUMNS]
multicr_rules = multicr_rules[multicr_rules['algorithm'] == 'ibea']
# Same project-name / file_id capitalisation as applied to GP_DATA, so both frames
# can be matched on file_id.
multicr_rules.loc[multicr_rules["project_name"] == "eclipse",'project_name'] = 'Eclipse'
multicr_rules['file_id'] = multicr_rules.apply(lambda row: captilize_first_letter(row['file_id']),axis=1)
# Adds the 'MCC_times_f1_M' column, the default criteria of select_topn_per_run.
multicr_data = add_metrics_product(multicr_data,['MCC','f1_M'])
print(multicr_rules['project_name'].unique())
print(multicr_data['project_name'].unique())
selected_rules = select_topn_per_run(multicr_data, multicr_rules)
freq_per_project, freq_per_file_id, occ_project, occ_file_id = extract_features_data(selected_rules)

In [None]:
# Recomputes the feature statistics from the `selected_rules` built in the previous
# cell (the selection call itself is left commented out below).
#selected_rules = select_topn_per_run(multicr_data, multicr_rules)
freq_per_project, freq_per_file_id, occ_project, occ_file_id = extract_features_data(selected_rules)

In [None]:
class RuleWarpper: 
    """Wrap one rule given as DOT graph text and expose it as a boolean formula.

    Construction pipeline (see ``__init__``): trivial threshold tests are
    simplified, every distinct terminal label is renamed to a symbol ``X<i>``,
    the operator tree is turned into an infix expression, and both the rule and
    its negation are converted to disjunctive normal form (DNF).
    ``explain_instance`` then reports which DNF clauses hold for one instance.

    The class relies on ``pydot``, ``networkx`` (as ``nx``) and the sympy names
    ``to_dnf``, ``sympify`` and ``symbols`` being imported elsewhere in the notebook.
    """

    def __init__(self,rule_dot_str) -> None:
        """Parse ``rule_dot_str`` (DOT statements without the ``digraph {}`` wrapper).

        Prints the simplified rule, its simplified negation and the
        symbol -> terminal mapping.
        """
        self.rule_dot_str = rule_dot_str
        self.cleaned_dot_rule = RuleWarpper.simplfy_dot_rule(rule_dot_str)
        self.renamed_rule, self.rename_dict = RuleWarpper.rename_terminals_dot_rule(self.cleaned_dot_rule)
        self.sympy_rule_str = RuleWarpper.from_dot_to_sympy(self.renamed_rule)
        self.symbols_list = list(self.rename_dict)
        self.sympy_negatition_rule_str = '~ (' + self.sympy_rule_str +')'
        self.sympy_rule_sympified = to_dnf(self.sympy_rule_str,simplify=True,force=True)
        print(self.sympy_rule_sympified)
        self.sympy_negatition_rule_sympified = to_dnf(self.sympy_negatition_rule_str,simplify=True,force=True)
        print(self.sympy_negatition_rule_sympified)
        print(self.rename_dict)

    def _satisfied_clauses(self, dnf_rule, instance_to_explain):
        """Return the '|'-separated clauses of ``dnf_rule`` that hold for the instance."""
        satisfied = []
        for clause in str(dnf_rule).split('|'):
            clause_value = RuleWarpper.evaluate_sympy_rule(sympify(clause), self.rename_dict, instance_to_explain)
            if clause_value:
                satisfied.append(clause)
        return satisfied

    def explain_instance(self,instance_to_explain): 
        """Predict one instance and list the DNF clauses that justify the prediction.

        Parameters:
            instance_to_explain: mapping (dict, pandas row, ...) from feature name
                to value.

        Returns:
            ``(True, clauses)`` when at least one clause of the rule holds,
            otherwise ``(False, clauses)`` built from the clauses of the negated
            rule. ``clauses`` is the matching clause texts, each followed by a
            space. If no clause of either form holds, ``(None, '')`` is returned
            (previously the method fell through and returned a bare ``None``,
            which broke callers that unpack two values).
        """
        candidates = (
            (True, self.sympy_rule_sympified),
            (False, self.sympy_negatition_rule_sympified),
        )
        for prediction, dnf_rule in candidates:
            satisfied = self._satisfied_clauses(dnf_rule, instance_to_explain)
            if satisfied:
                return prediction, ''.join(clause + ' ' for clause in satisfied)
        return None, ''

    @staticmethod
    def evaluate_sympy_rule(sympy_rule,rename_dict,features_values) : 
        """Substitute every renamed terminal by its truth value for ``features_values``.

        ``rename_dict`` maps symbol names to terminal expressions; each terminal
        is evaluated with ``evaluate_terminal`` and substituted into ``sympy_rule``.
        Returns the result of ``sympy_rule.subs(...)``.
        """
        terminal_values = {
            variable: RuleWarpper.evaluate_terminal(rename_dict[variable], env=features_values)
            for variable in rename_dict
        }
        terminals = symbols(list(rename_dict))
        return sympy_rule.subs({var: terminal_values[str(var)] for var in terminals})

    @staticmethod
    def evaluate_terminal(terminal_expression, env) : 
        """Evaluate a terminal of the form ``'<feature> <op> <threshold>'``.

        Supported operators: ``=``, ``==``, ``>=``, ``<=``, ``>`` and ``<``
        (the strict forms were previously ignored and produced ``None``).

        Parameters:
            terminal_expression: three whitespace-separated tokens.
            env: mapping from feature name to its value.

        Returns:
            The result of comparing ``env[feature]`` with ``float(threshold)``.

        Raises:
            ValueError: if the expression does not have three tokens, the
                threshold is not a number, or the operator is not supported.
            KeyError: if the feature is missing from ``env`` (for dict-like envs).
        """
        terminal_name, op, threshold_value = terminal_expression.split()
        comparisons = {
            '=': lambda value, bound: value == bound,
            '==': lambda value, bound: value == bound,
            '>=': lambda value, bound: value >= bound,
            '<=': lambda value, bound: value <= bound,
            '>': lambda value, bound: value > bound,
            '<': lambda value, bound: value < bound,
        }
        if op not in comparisons:
            raise ValueError(f'Unsupported comparison operator {op!r} in terminal {terminal_expression!r}')
        return comparisons[op](env[terminal_name], float(threshold_value))

    @staticmethod
    def from_dot_to_sympy(rule_dot) : 
        """Convert DOT statements describing an operator tree into an infix expression string.

        The root is the first node without incoming edges.
        """
        graphs = pydot.graph_from_dot_data('digraph {' + rule_dot + ' }')
        graph = graphs[0]
        G = nx.DiGraph(nx.nx_pydot.from_pydot(graph))
        tree_root = [n for n,d in G.in_degree() if d==0][0]
        return RuleWarpper.build_expression_from_graph(G, tree_root)

    @staticmethod
    def build_expression_from_graph(G,node_id) : 
        """Recursively render the subtree rooted at ``node_id`` as an infix expression.

        ``Xor``, ``Or`` and ``And`` nodes combine their first two successors;
        any other label is returned as-is (a leaf). Every operator result is
        fully parenthesised; the Xor expansion used to lack its outer
        parentheses, which changed its meaning when nested under ``And`` or
        another ``Xor`` because ``&`` binds tighter than ``|``.
        """
        label = G._node[str(node_id)]['label'].replace('"','')
        if label not in ('Xor', 'Or', 'And'):
            return str(label)

        operands = list(G.successors(str(node_id)))[:2]
        left, right = [RuleWarpper.build_expression_from_graph(G, operand) for operand in operands]
        if label == 'Xor' : 
            # Xor is spelled out with &, | and ~ only.
            return f'(({left} & ~({right})) | ({right} & ~({left})))'
        if label == 'Or' :
            return f'({left} | {right})'
        return f'({left} & {right})'

    @staticmethod
    def rename_terminals_dot_rule(rule_str) : 
        """Replace every distinct terminal label by a symbol name ``X<i>``.

        Terminals are processed longest first (ties broken alphabetically) so a
        terminal that is a substring of a longer one (``'a <= 5'`` inside
        ``'a <= 58.35'``) can no longer corrupt it, and so the mapping is the
        same on every run.

        Returns:
            ``(renamed_rule, mapping)`` where ``mapping`` maps ``'X<i>'`` to the
            original terminal text.
        """
        terminal_rename = dict()
        renamed_rule = rule_str
        rule_terminals = sorted(
            set(RuleWarpper.extract_terminals_from_dot(rule_str)),
            key=lambda terminal: (-len(terminal), terminal),
        )
        print(rule_terminals)
        for terminal_index, terminal in enumerate(rule_terminals) : 
            symbol_name = 'X' + str(terminal_index)
            terminal_rename[symbol_name] = terminal
            renamed_rule = renamed_rule.replace(terminal, symbol_name)
        return renamed_rule, terminal_rename 

    @staticmethod
    def extract_terminals_from_dot(rule_dot) : 
        """Return the labels of all nodes that are neither operators nor boolean constants."""
        graphs = pydot.graph_from_dot_data('digraph {' + rule_dot + ' }')
        graph = graphs[0]
        G = nx.DiGraph(nx.nx_pydot.from_pydot(graph))
        non_terminal_labels = ("Or", "Xor", "And", "True", "False")
        terminals = []
        for node in G.nodes: 
            label = str(G._node[str(node)]['label']).replace('"','')
            if label not in non_terminal_labels:
                terminals.append(label)
        return terminals 

    @staticmethod
    def simplfy_dot_rule(rule_dot,non_negative_features = ['num_of_reviewers'], binary_features=['is_documentation','is_bug_fixing', 'is_feature']): 
        """Rewrite threshold tests that are trivial for the given feature kinds.

        * non-negative feature: ``f <= 0`` becomes ``f = 0``;
        * binary feature: ``f >= 0`` and ``f <= 1`` become ``True``,
          ``f <= 0`` becomes ``f = 0`` and ``f >= 1`` becomes ``f = 1``.

        Only whole thresholds are matched (``0`` / ``1``, optionally written
        ``0.0`` / ``1.0``). Plain substring replacement used to turn e.g.
        ``is_feature >= 0.5`` into ``True.5``; such tests are now left untouched.

        Returns:
            The rewritten DOT text.
        """
        import re  # function-scope import: `re` is not among the notebook's visible imports

        def rewrite(text, feature, op, bound, replacement):
            # Lookarounds keep the match from starting inside a longer feature
            # name or ending inside a longer number.
            pattern = rf'(?<![\w.]){re.escape(feature)} {re.escape(op)} {bound}(?:\.0+)?(?![\d.])'
            return re.sub(pattern, lambda _match: replacement, text)

        simplified_rule = rule_dot
        for non_negative_feature in non_negative_features:
            simplified_rule = rewrite(simplified_rule, non_negative_feature, '<=', '0', f'{non_negative_feature} = 0')

        for binary_feature in binary_features: 
            simplified_rule = rewrite(simplified_rule, binary_feature, '>=', '0', 'True')
            simplified_rule = rewrite(simplified_rule, binary_feature, '<=', '0', f'{binary_feature} = 0')
            simplified_rule = rewrite(simplified_rule, binary_feature, '<=', '1', 'True')
            simplified_rule = rewrite(simplified_rule, binary_feature, '>=', '1', f'{binary_feature} = 1')
        return simplified_rule

In [None]:
#main 
def prepare_data_GP(df) : 
	"""Return a cleaned copy of ``df``; the caller's frame is not modified.

	The metadata columns 'project', 'change_id', 'created' and 'subject' are
	dropped, 'status' is replaced by ``1 - status`` and the three flag columns
	are cast to int.
	"""
	metadata_columns = ['project','change_id','created','subject']
	flag_columns = ['is_bug_fixing','is_documentation','is_feature']
	# drop() returns a new frame, so the assignments below never touch `df`
	prepared = df.drop(columns = metadata_columns)
	prepared['status'] = 1 - prepared['status']
	return prepared.astype({flag: int for flag in flag_columns})

# Evaluate one hand-picked rule on the last fold of one project.
# NOTE: this reassigns PROJECT, which the globals cell at the top of the notebook
# sets to "rq1_results" and GET_DATA() reads; re-run the globals cell before
# calling GET_DATA() again.
PROJECT = 'Eclipse'
DATA_PATH = './data'
folds = 11 
fold = 10
project_data = pd.read_csv(os.path.join(DATA_PATH,PROJECT+'.csv'))
train_size = project_data.shape[0] * fold // folds
test_size = min(project_data.shape[0] * (fold + 1) // folds, project_data.shape[0])

# .loc slices are label based and include their end label, hence the "- 1".
x_train, y_train = project_data.loc[:train_size - 1, FEATURES], project_data.loc[:train_size - 1, TARGET]
x_test, y_test = project_data.loc[train_size:test_size - 1, FEATURES], project_data.loc[train_size:test_size - 1, TARGET]

clean_df = prepare_data_GP(project_data) 
# .iloc slices exclude their end position, so no "- 1" here: the previous
# `.iloc[:train_size - 1]` / `.iloc[train_size:test_size - 1]` silently dropped the
# last row of each split and left test_df one row shorter than x_test / y_test.
train_df = clean_df.iloc[:train_size]
test_df = clean_df.iloc[train_size:test_size]
SAMPLE_RULE = ' 1 [label = "Xor" ] 1->2; 2 [label = "Or" ] 2->4; 4 [label = "Or" ] 4->8; 8 [label = "lines_deleted >= 1318157" ] 4->9; 9 [label = "author_merge_ratio_in_project <= 0.63" ] 2->5; 5 [label = "Or" ] 5->10; 10 [label = "lines_added >= 7547947" ] 5->11; 11 [label = "changes_per_author <= 58.35" ] 1->3; 3 [label = "And" ] 3->6; 6 [label = "And" ] 6->12; 12 [label = "files_modified <= 10294" ] 6->13; 13 [label = "num_of_reviewers >= 1" ] 3->7; 7 [label = "Or" ] 7->14; 14 [label = "changes_per_author <= 57.19" ] 7->15; 15 [label = "total_change_num >= 460" ]'
myrule = RuleWarpper(SAMPLE_RULE)
predictions = []
explanations = []
# One (prediction, explanation) pair per test row, in row order.
for index, row in test_df.iterrows() : 
	prediction, explanation = myrule.explain_instance(row)
	predictions.append(int(prediction))
	explanations.append(explanation)

print(matthews_corrcoef(test_df[TARGET],predictions))

In [None]:
# lime 
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

def explain_instance(explainer, data, instances_idx, model, num_features=10, top_labels=1) :
    """Explain the rows of ``data`` at the given positions.

    Parameters:
        explainer: object exposing ``explain_instance(row, predict_fn, num_features=, top_labels=)``.
        data: table of instances; rows are looked up by position (``.iloc`` when
            available, plain indexing otherwise).
        instances_idx: iterable of row positions to explain.
        model: object exposing ``predict_proba``.
        num_features, top_labels: forwarded to the explainer.

    Returns:
        dict mapping each position in ``instances_idx`` to its explanation.
    """
    results = {}
    for instance_index in instances_idx : 
        # Use the loop variable: the body used to read a global `i`, so every
        # position was explained with the same (unrelated) row, or failed with
        # NameError when `i` was not defined.
        if hasattr(data, 'iloc'):
            instance = data.iloc[instance_index]
        else:
            instance = data[instance_index]
        results[instance_index] = explainer.explain_instance(instance, model.predict_proba, num_features=num_features, top_labels=top_labels)
    return results
# LIME driver: explain every test instance of every fold and report how well
# LIME's top label agrees with the ground truth (MCC).
PROJECTS = ['Eclipse', 'Libreoffice', 'Gerrithub']
PROJECT = 'Eclipse'
DATA_PATH = './data'
MODEL_NAME = 'RF'
# NOTE(review): machine-specific absolute path; move it to a configurable base directory.
MODEL_PATH = f'C:/Users/Moataz/Desktop/work/code_review_delay_prediction/islam_data/Results/{PROJECT}/{PROJECT}/{MODEL_NAME}/{PROJECT}_{MODEL_NAME}_cross_val_run_0.pk'
# pickle.load can execute arbitrary code: only load model files you produced yourself.
with open(MODEL_PATH,'rb') as model_file:
    PROJECT_CV_DATA = pickle.load(model_file)
MODEL = PROJECT_CV_DATA.best_estimator_
NB_PROCESS = 10 
FOLDS = 11 
fold = 10
# NOTE(review): MODEL is loaded once for PROJECT but reused for every project in
# PROJECTS below; confirm that this is intended.
for porject in PROJECTS:
    project_data = pd.read_csv(os.path.join(DATA_PATH,porject+'.csv'))
    for fold in range(1,FOLDS): 
        train_size = project_data.shape[0] * fold // FOLDS
        test_size = min(project_data.shape[0] * (fold + 1) // FOLDS, project_data.shape[0])
        scaler = StandardScaler()
        x_train, y_train = project_data.loc[:train_size - 1, FEATURES], project_data.loc[:train_size - 1, TARGET]
        x_test, y_test = project_data.loc[train_size:test_size - 1, FEATURES], project_data.loc[train_size:test_size - 1, TARGET]
        x_train=scaler.fit_transform(x_train)
        # The scaled test features used to be assigned to y_train, which overwrote the
        # training labels and left x_test unscaled. Kept as a DataFrame so that rows can
        # still be looked up with .iloc.
        # NOTE(review): assumes MODEL expects features scaled this way; confirm.
        x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)
        explainer = lime.lime_tabular.LimeTabularExplainer(x_train, feature_names=MODEL.feature_names_in_, class_names=[0, 1], discretize_continuous=True)
        new_row = {
            'Explained_model' :MODEL_NAME, 
            'Iteration' : fold - 1, 
        }
        
        idex_chunks = chunks(list(range(len(x_test))),NB_PROCESS) 
        fold_explanations = {}
        with Pool(NB_PROCESS) as p:
            arguments = [(explainer,x_test, chunk ,MODEL,10,1) for chunk in idex_chunks]
            # starmap unpacks each argument tuple into explain_instance(...). The old code
            # called an undefined `explain_instances`, ran the whole computation twice and
            # passed a list of dicts to dict.update.
            for chunk_explanations in p.starmap(explain_instance, arguments):
                fold_explanations.update(chunk_explanations)
        
        explanations = [None for _ in range(len(x_test))]
        fold_lime_preds = [None for _ in range(len(x_test))]
        for index, exp in fold_explanations.items() : 
            explanations[index] = exp
            fold_lime_preds[index] = exp.top_labels[0]
        

        print(matthews_corrcoef(y_test,fold_lime_preds))



In [None]:
# Attribute lookup on the explainer instance.
# NOTE(review): `BaseDiscretizer` looks like a class name rather than an attribute of the explainer; confirm this cell still runs.
explainer.BaseDiscretizer

In [None]:
# NOTE(review): `lime_preds` is not assigned in the visible cells (the loop above fills `fold_lime_preds`); confirm which list is meant.
print(matthews_corrcoef(y_test,lime_preds))

In [None]:
# Rebuild the LIME explainer from the current `x_train`, i.e. whatever the last executed cell left in that variable.
explainer = lime.lime_tabular.LimeTabularExplainer(x_train, feature_names=MODEL.feature_names_in_, class_names=[0, 1], discretize_continuous=True)

In [None]:
# Explain one randomly chosen test row. No seed is set, so the chosen row changes on every run.
# The global `i` assigned here is also read by later cells (e.g. `predictions[i]`).
i = np.random.randint(0, x_test.shape[0])
print(i)
exp = explainer.explain_instance(x_test.iloc[i], MODEL.predict_proba, num_features=20, top_labels=1)

In [None]:
# Display the repr of the current explainer object.
explainer

In [None]:
# Render the explanation of the randomly chosen instance inline.
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Labels kept by the explanation (top_labels=1 was requested when `exp` was created).
exp.top_labels

In [None]:
# Distinct explanations collected so far.
# NOTE(review): `explanations` is filled both by the rule-evaluation cell (strings) and by the LIME loop (explanation objects); the result depends on which cell ran last.
print(set(explanations))

In [None]:
# Rule prediction at position `i`, where `i` is the random index drawn in an earlier cell.
print(predictions[i])

In [None]:
# Preview the cleaned test split used for the rule evaluation.
test_df.head()

In [None]:
# Hand-written rule in DOT syntax (node declarations first, then edges); this
# reassigns SAMPLE_RULE from the earlier rule-evaluation cell.
# Its terminals use the strict operators '<' and '>' in addition to '<=', '>=' and '='.
SAMPLE_RULE = '''
  1 [label = "Or"];
    3 [label = "Or"];
    2 [label = "And"];
    4 [label = "And"];
    5 [label = "And"];
    6 [label = "Or"];
    7 [label = "Or"];
    8 [label = "num_of_reviewers <= 1"]
    9 [label = "num_of_directory < 54"]
    10 [label = "description_length < 2"]
    11 [label= "num_of_directory >= 2"]
    12 [label = "description_length < 2"]
    13 [label = "total_change_num < 43"]
    14 [label = "lines_added > 221"]
    15 [label  = "is_documentation = 0"]
    1 -> 2
    1->3
    3 -> 4
    3 -> 5
    2 -> 6 
    2 -> 7
    4->8
    4->9
    5->10
    5->11
    6->12
    6->13
    7->14
    7->15
'''


In [None]:
# Parse the hand-written rule; the constructor prints the simplified rule, its negation and the symbol mapping.
myrule = RuleWarpper(SAMPLE_RULE)

In [None]:
# Print each '|'-separated clause of the simplified rule after parsing it back with sympify.
for literal in str(myrule.sympy_rule_sympified).split('|'): 
    print(sympify(literal))

In [None]:
# NOTE(review): `symp_symbols` is not assigned in the visible cells; this cell presumably relies on leftover kernel state.
str(symp_symbols[0])

## Discussion: Performance against the other MOEAs

In [None]:
from pymoo.indicators.gd import GD
from pymoo.indicators.hv import HV


In [None]:
def compute_performance_indicator(df, train_or_test = "test") : 
    """Compute the HV and GD indicators for every (file_id, run, algorithm) combination.

    For each combination of the unique 'file_id', 'run' and 'algorithm' values
    of ``df`` the matching rows are selected through the notebook's ``select_*``
    helpers, restricted to ``train_or_test``, and scored on the two objectives
    'recall_A' and 'recall_M'. The objectives are negated before being passed
    to ``compute_hv`` and passed unchanged to ``compute_gd``.

    Combinations without any row are skipped; they used to raise IndexError on
    ``unique()[0]``.

    Returns:
        DataFrame with one row per scored combination and the columns
        'project_name', 'file_id', 'run', 'algorithm', 'HV' and 'GD'.
    """
    files_ids = df['file_id'].unique() 
    runs = df['run'].unique() 
    algorithms  = df['algorithm'].unique() 
    results = []
    for file_id in files_ids : 
        for run in runs: 
            for algo in algorithms: 
                print('file:',file_id)
                print('run:',run)
                print('algo:',algo)
                data = select_all_but_best(select_train_or_test(select_algorithms(select_by_run(select_by_filenames(df,[file_id]),[run]),[algo]),train_or_test))
                print(data)
                if len(data) == 0:
                    # The cartesian product of ids can contain combinations that
                    # were never run: there is nothing to score for them.
                    continue
                objs = data[['recall_A','recall_M']]
                hv = compute_hv(objs*-1)
                gd = compute_gd(objs)
                new_row = {
                    'project_name' : data['project_name'].unique()[0],
                    'file_id': file_id,
                    'run': run,
                    'algorithm': algo,
                    'HV': hv,
                    'GD' : gd
                }
                results.append(new_row)
    return pd.DataFrame(results)
def compute_hv(data,ref_point = np.array([0,0])) : 
    """Return the HV indicator of the rows of ``data`` with respect to ``ref_point``.

    ``data`` is printed before the indicator is evaluated.
    """
    indicator = HV(ref_point=ref_point)
    print(data)
    points = np.array(data.values.tolist())
    return indicator(points)

def compute_gd(data,pf =np.array([[1.0,1.0]])) : 
    """Return the GD indicator of the rows of ``data`` with respect to the front ``pf``."""
    distance_indicator = GD(pf)
    points = np.array(data.values.tolist())
    return distance_indicator(points)





In [None]:
#main
# Reload the GP results, keep the studied columns (including both recall
# objectives needed by the indicators) and add the metric product derived from MCC and f1_M.
# Note: STUDIED_COLUMNS is a notebook-wide name that other "main" cells reassign with different column lists.
GP_DATA = prepare_GP_results(GP_RESULTS_PATH)
print(GP_DATA['file_id'].unique())
STUDIED_COLUMNS = [ 'project_name','file_id','fold','run','algorithm','model_id','train_or_test','MCC','f1_A','f1_M','recall_A','recall_M']
FINAL_DATA = pd.concat([
    GP_DATA[STUDIED_COLUMNS]
])
FINAL_DATA = add_metrics_product(FINAL_DATA,['MCC','f1_M'])


In [None]:
# Display the frame built in the previous cell.
FINAL_DATA

In [None]:
#classification performance data
# Export the rows of the four MOEAs to CSV.
# NOTE(review): `FINAL_RESULT` is not assigned in this section (the cell above builds `FINAL_DATA`); it presumably comes from an earlier cell. Confirm it holds the intended data.
FINAL_RESULT[(FINAL_RESULT["algorithm"].isin(['ibea','nsga2','nsga3','spea2']))].to_csv('multicr_VS_other_moeas.csv',index=False)

In [None]:
# Compute HV / GD on the "train" rows of every (file_id, run, algorithm) combination.
indicators = compute_performance_indicator(FINAL_DATA,"train")

In [None]:
# List the columns available in FINAL_DATA.
FINAL_DATA.columns

In [None]:
# Inspect the last 50 indicator rows.
indicators.tail(50)

In [None]:
# Persist the train-split indicators computed above.
indicators.to_csv('all_moeas_indicators_train.csv',index=False)

## Discussion: Concept drift validation

In [None]:
#globals
# NOTE(review): machine-specific absolute paths. The user directory is spelled 'Motaz' here
# but 'Moataz' in the other hardcoded paths of this notebook; confirm which one is correct.
GP_CONCEPT_DRIFT_RESULTS = 'C:/Users/Motaz/Desktop/work/code_review_delay_prediction/early_abondon_prediction/GP_summeries/GP_RESULTS/concept_drift'
ML_CONCEPT_DRIFT_RESULTS = 'C:/Users/Motaz/Desktop/work/code_review_delay_prediction/early_abondon_prediction/GP_summeries/ML_RESULTS/concept_drift'


In [None]:
def prepare_concept_drift_ML_DATA(ML_DATA_PATH) : 
    """Load the per-project concept-drift ML result files and normalise them.

    Reads ``<project>_concept_drift_all_models.csv`` for Eclipse, Gerrithub and
    Libreoffice (in that order) from ``ML_DATA_PATH``, stacks them and returns
    the result of ``process_ml_concept_drift_data``.
    """
    project_names = ('Eclipse', 'Gerrithub', 'Libreoffice')
    per_project_frames = [
        pd.read_csv(os.path.join(ML_DATA_PATH, f'{project_name}_concept_drift_all_models.csv'))
        for project_name in project_names
    ]
    return process_ml_concept_drift_data(pd.concat(per_project_frames))

def prepare_concept_drift_GP_DATA(GP_DATA_PATH) : 
    """Load the GP concept-drift results from the CSV file at ``GP_DATA_PATH``.

    The tnr / tpr / f1 / precision columns are renamed to the notebook's
    per-class names, 'precision_M', 'f1_M' and the metric product derived from MCC and f1_M are
    added through the notebook helpers, and every row is tagged 'new' or 'old'
    depending on whether its algorithm name contains 'new'.
    """
    per_class_names = {"tnr": "recall_M", "tpr": "recall_A",'f1': 'f1_A','precision':"precision_A"}
    gp_results = pd.read_csv(GP_DATA_PATH).rename(columns=per_class_names)
    for metric_name, metric_function in (('precision_M', precision_M), ('f1_M', f1_M)):
        gp_results = add_metric(gp_results, metric_name, metric_function)
    gp_results = add_metrics_product(gp_results,['MCC','f1_M'])

    def new_or_old(row):
        if 'new' in row['algorithm']:
            return 'new'
        return 'old'

    gp_results['is_new_or_old'] = gp_results.apply(new_or_old, axis=1)
    return gp_results

def process_ml_concept_drift_data(ml_data) : 
    """Return a normalised copy of the ML concept-drift results.

    Columns are renamed to the notebook's per-class metric names, 'fold' is
    shifted down by one, and the 'file_id' and 'is_new_or_old' columns are
    derived from each row. ``ml_data`` itself is not modified.
    """
    shared_column_names = {
        "recall_m": "recall_M", "recall_a": "recall_A",
        'f1_score_a': 'f1_A', 'precision_a': "precision_A",
        'f1_score_m': 'f1_M', 'precision_m': 'precision_M',
        'project': 'project_name',
    }
    result = ml_data.copy().rename(columns=shared_column_names)
    result["fold"] = result['fold'] - 1
    # file_id is built after the shift, so it carries the decremented fold number
    result['file_id'] = result.apply(extract_file_name_concept_drift, axis=1)

    def new_or_old(row):
        if 'new' in row['algorithm']:
            return 'new'
        return 'old'

    result['is_new_or_old'] = result.apply(new_or_old, axis=1)
    return result

def extract_file_name_concept_drift(row) : 
    """Build ``<project_name>_<new|old>_<fold>`` for one result row.

    The middle part is 'new' when the row's algorithm name contains 'new',
    otherwise 'old'.
    """
    prefix = row['project_name'] + '_'
    age_tag = 'new_' if 'new' in row['algorithm'] else 'old_'
    return prefix + age_tag + str(row['fold'])

In [None]:
#main
# Combine the GP and ML concept-drift results on a shared set of columns, apply the
# selection criterion on 'MCC_times_f1_M' and export the outcome.
# NOTE(review): prepare_concept_drift_GP_DATA does not create 'file_id', so the GP CSV is assumed to provide it; confirm.
# add_metrics_product was already applied to the GP frame inside prepare_concept_drift_GP_DATA and is applied again to the combined frame below.
GP_CONCEPT_DRIFT_DATA = prepare_concept_drift_GP_DATA(os.path.join(GP_CONCEPT_DRIFT_RESULTS,"multicr_concept_drift.csv"))
ML_CONCEPT_DRIFT_DATA = prepare_concept_drift_ML_DATA(ML_CONCEPT_DRIFT_RESULTS)
print(GP_CONCEPT_DRIFT_DATA['file_id'].unique())
print(ML_CONCEPT_DRIFT_DATA['file_id'].unique())
STUDIED_COLUMNS_CONCEPT_DRIFT = [ 'project_name','file_id','fold','run','algorithm','model_id','train_or_test','MCC','f1_A','f1_M','is_new_or_old']
FINAL_DATA_CONCEPT_DRIFT = pd.concat([
    GP_CONCEPT_DRIFT_DATA[STUDIED_COLUMNS_CONCEPT_DRIFT], ML_CONCEPT_DRIFT_DATA[STUDIED_COLUMNS_CONCEPT_DRIFT]
])
FINAL_DATA_CONCEPT_DRIFT = add_metrics_product(FINAL_DATA_CONCEPT_DRIFT,['MCC','f1_M'])
FINAL_RESULT_CONCEPT_DRIFT = apply_selection_criteria(FINAL_DATA_CONCEPT_DRIFT,'MCC_times_f1_M',metrics = ['project_name','MCC','f1_M','f1_A','is_new_or_old'])
FINAL_RESULT_CONCEPT_DRIFT.to_csv("GP_VS_ML_CONCEPT_DRIFT.csv",index=False)


In [None]:
# First 50 rows of the selected concept-drift results.
FINAL_RESULT_CONCEPT_DRIFT.head(50)

In [None]:
# 50 rows starting at position 500.
FINAL_RESULT_CONCEPT_DRIFT[500:].head(50)

In [None]:
# Last 50 rows of the selected concept-drift results.
FINAL_RESULT_CONCEPT_DRIFT.tail(50)

In [None]:
# Algorithms present in the selected concept-drift results.
FINAL_RESULT_CONCEPT_DRIFT['algorithm'].unique()

## Discussion: Bias toward new developers

In [None]:
#globals
# NOTE(review): machine-specific absolute path; move it to a configurable base directory.
GP_NEW_DEVELOPERS_RESULTS = 'C:/Users/Moataz/Desktop/work/code_review_delay_prediction/early_abondon_prediction/GP_summeries/GP_RESULTS/new_developers'


In [None]:
def prepare_GP_new_developers_data_results(GP_DATA_PATH) : 
    """Load and normalise the GP results of the new-developers experiment.

    Reads 'new_developers_multicr.csv' from ``GP_DATA_PATH``, renames the
    tnr / tpr / f1 / precision columns to the notebook's per-class names, adds
    'precision_M' and 'f1_M', rewrites the project name 'eclipse' as 'Eclipse',
    builds 'file_id' as ``<Project>_<fold>`` and returns what
    ``select_all_but_best`` keeps.
    """
    results_file = os.path.join(GP_DATA_PATH,'new_developers_multicr.csv')
    gp_results = pd.concat([pd.read_csv(results_file)])
    per_class_names = {"tnr": "recall_M", "tpr": "recall_A",'f1': 'f1_A','precision':"precision_A"}
    gp_results = gp_results.rename(columns=per_class_names)
    gp_results = add_metric(gp_results,'precision_M',precision_M)
    gp_results = add_metric(gp_results,'f1_M',f1_M)

    is_lowercase_eclipse = gp_results["project_name"] == "eclipse"
    gp_results.loc[is_lowercase_eclipse,'project_name'] = 'Eclipse'

    def build_file_id(row):
        return captilize_first_letter(row['project_name']) + '_' + str(row['fold'] )

    gp_results['file_id'] = gp_results.apply(build_file_id, axis=1)
    return select_all_but_best(gp_results)


In [None]:
#main
# Load the new-developers GP results, keep the studied columns, add the metric product derived from MCC and f1_M
# and apply the selection criterion on 'MCC_times_f1_M'.
# Note: this reassigns STUDIED_COLUMNS without the recall columns used by the MOEA-comparison section.
GP_DATA_NEW_DEVELOPER = prepare_GP_new_developers_data_results(GP_NEW_DEVELOPERS_RESULTS)
STUDIED_COLUMNS = [ 'project_name','file_id','fold','run','algorithm','model_id','train_or_test','MCC','f1_A','f1_M']
FINAL_DATA_NEW_DEVELOPER = pd.concat([
    GP_DATA_NEW_DEVELOPER[STUDIED_COLUMNS]
])
FINAL_DATA_NEW_DEVELOPER = add_metrics_product(FINAL_DATA_NEW_DEVELOPER,['MCC','f1_M'])
FINAL_RESULTS_NEW_DEVELOPER = apply_selection_criteria(FINAL_DATA_NEW_DEVELOPER,'MCC_times_f1_M',metrics = ['project_name','MCC','f1_M','f1_A'])


In [None]:
# First 50 rows of the selected new-developers results.
FINAL_RESULTS_NEW_DEVELOPER.head(50)

In [None]:
# Persist the selected new-developers results.
FINAL_RESULTS_NEW_DEVELOPER.to_csv('NEW_DEVELOPERS_RESULTS.csv',index=False)

In [None]:
# Summary statistics of one metric for the 'ibea' rows of one project; edit `project` / `metric` to inspect another combination.
project = 'Gerrithub'
metric = 'f1_A'
FINAL_RESULTS_NEW_DEVELOPER[(FINAL_RESULTS_NEW_DEVELOPER['project_name'] == project) & (FINAL_RESULTS_NEW_DEVELOPER['algorithm'] == 'ibea')][metric].describe()