In [None]:
import os
import copy 
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from imblearn.metrics import geometric_mean_score
from imblearn.metrics import specificity_score
from imblearn.over_sampling import SMOTE
from  tqdm.notebook import tqdm

In [None]:
class DeepRandomForest: 
    """Cascade of estimator layers grown while validation accuracy improves.

    A layer is a list of ``param_M`` deep copies of the configured estimator,
    all fitted on the same data.  The class-probability outputs of every
    estimator in a layer are appended as extra columns to the feature matrix
    that is fed to the next layer.  ``fit`` holds out ``validation_size`` of
    the data and keeps adding layers until the validation accuracy stops
    improving or ``maximum_layer_number`` is reached.

    Recognised configuration keys (see ``default_configs``):
    ``estimator``, ``param_M``, ``standarize``, ``param_tolerance``,
    ``maximum_layer_number`` and ``validation_size``.
    """

    def __init__(self, configs=None) -> None:
        """Build the configuration from the defaults plus optional overrides.

        Parameters
        ----------
        configs : mapping or iterable of (name, value) pairs, optional
            Overrides for the default configuration.  ``None`` or an empty
            container keeps the defaults.
        """
        self.default_configs = {
            'estimator' : RandomForestClassifier(n_estimators=500, max_features='sqrt',n_jobs=-1),
            'param_M' : 4,
            'standarize' : False,
            'param_tolerance' : 0.0,
            'maximum_layer_number' : np.inf,
            'validation_size' : 0.2
        }
        self.actual_configs = copy.deepcopy(self.default_configs)
        if configs:
            # dict.update accepts both a mapping and an iterable of pairs, so
            # plain dicts work (iterating a dict directly yields keys only).
            self.actual_configs.update(configs)

    def fit(self, X, y):
        """Grow the cascade on ``(X, y)`` and return ``self``.

        The data is split once into a training and a validation part.  A new
        layer is kept only if it raises the validation accuracy by more than
        ``param_tolerance`` (0.0 by default, i.e. any strict improvement).
        """
        self.original_X = X
        self.final_X = np.array(X)
        self.original_y = y
        if self.actual_configs['standarize']:
            scaler = StandardScaler()
            # Fit on the plain array so that predict_proba can transform plain
            # arrays with the very same scaler.
            self.final_X = scaler.fit_transform(self.final_X)
            self.actual_configs['scaler'] = scaler

        X_train, X_val, y_train, y_val = train_test_split(
            self.final_X, self.original_y,
            test_size=self.actual_configs['validation_size'])

        first_layer = self.create_layer()
        self.fit_layer(first_layer, X_train, y_train)
        self.layers = [first_layer]

        # Stacked probabilities are computed once per layer and reused both as
        # next-layer features and (averaged) for the validation accuracy.
        proba_train = self.compute_layer_prediction(first_layer, X_train)
        proba_val = self.compute_layer_prediction(first_layer, X_val)
        ref_accuracy = self.cascade_accuracy(
            self._labels_from_stacked(first_layer, proba_val), y_val)

        # NOTE(review): 'param_tolerance' was configured but unused; it is
        # interpreted here as the minimum accuracy gain required to keep a new
        # layer.  The default 0.0 reproduces the previous stopping rule.
        tolerance = self.actual_configs['param_tolerance']

        while len(self.layers) < self.actual_configs['maximum_layer_number']:
            X_train = update_X(X_train, proba_train)
            X_val = update_X(X_val, proba_val)
            new_layer = self.create_layer()
            self.fit_layer(new_layer, X_train, y_train)

            proba_val = self.compute_layer_prediction(new_layer, X_val)
            new_accuracy = self.cascade_accuracy(
                self._labels_from_stacked(new_layer, proba_val), y_val)
            if new_accuracy <= ref_accuracy + tolerance:
                # No (sufficient) improvement: discard the candidate layer.
                break
            ref_accuracy = new_accuracy
            self.layers.append(new_layer)
            proba_train = self.compute_layer_prediction(new_layer, X_train)
        return self

    def create_layer(self):
        """Return a new layer: ``param_M`` independent copies of the estimator."""
        return [copy.deepcopy(self.actual_configs['estimator'])
                for _ in range(self.actual_configs['param_M'])]

    def fit_layer(self, layer, X, y):
        """Fit every estimator of ``layer`` on the same ``(X, y)``."""
        for estimator in layer:
            estimator.fit(X, y)

    def predict(self, X):
        """Predict a class label for every row of ``X``."""
        probs = self.predict_proba(X)
        return self._labels_from_proba(self.layers[-1], probs)

    def predict_proba(self, X):
        """Return the last layer's averaged class probabilities for ``X``.

        ``X`` is pushed through all but the last layer, each time appending
        that layer's stacked probabilities as new columns, exactly as in
        ``fit``.  When the model was fitted with ``standarize`` enabled the
        stored scaler is applied first.
        """
        X_for_prediction = np.array(X)
        scaler = self.actual_configs.get('scaler')
        if self.actual_configs.get('standarize') and scaler is not None:
            X_for_prediction = scaler.transform(X_for_prediction)
        for layer in self.layers[:-1]:
            layer_predictions = self.compute_layer_prediction(layer, X_for_prediction)
            X_for_prediction = update_X(X_for_prediction, layer_predictions)
        return self.layer_predict_proba(self.layers[-1], X_for_prediction)

    def compute_layer_prediction(self, layer, X):
        """Concatenate ``predict_proba`` of every estimator column-wise.

        The result has one row per sample; the columns of the first estimator
        come first, then those of the second, and so on.
        """
        return np.concatenate([estimator.predict_proba(X) for estimator in layer], axis=1)

    def layer_predict(self, layer, X):
        """Predict class labels for ``X`` using a single layer."""
        probs = self.layer_predict_proba(layer, X)
        return self._labels_from_proba(layer, probs)

    def layer_predict_proba(self, layer, X):
        """Average the class probabilities of all estimators in ``layer``."""
        return self._average_stacked(self.compute_layer_prediction(layer, X), len(layer))

    def cascade_accuracy(self, cascade_preds, y_test):
        """Accuracy of ``cascade_preds`` against the true labels ``y_test``."""
        return accuracy_score(y_true=y_test, y_pred=cascade_preds)

    @staticmethod
    def _average_stacked(stacked, n_estimators):
        """Turn column-stacked per-estimator probabilities into their mean."""
        # (n_samples, n_estimators * n_classes) -> (n_samples, n_estimators, n_classes)
        return stacked.reshape(len(stacked), n_estimators, -1).mean(axis=1)

    @staticmethod
    def _labels_from_proba(layer, probs):
        """Map the most probable column of ``probs`` to a class label.

        Uses the ``classes_`` attribute of the layer's first estimator when it
        exists; otherwise the column index itself is returned.
        """
        winners = probs.argmax(axis=1)
        classes = getattr(layer[0], 'classes_', None)
        if classes is None:
            return winners
        return np.asarray(classes)[winners]

    def _labels_from_stacked(self, layer, stacked):
        """Class labels derived from already computed stacked probabilities."""
        return self._labels_from_proba(layer, self._average_stacked(stacked, len(layer)))
        
def update_X(X,predictions) : 
    """Return ``X`` with ``predictions`` appended as extra columns.

    Both arguments must be 2-D with the same number of rows.  The inputs are
    left untouched: ``np.concatenate`` always allocates a new array, so no
    explicit copy of ``X`` is needed beforehand.
    """
    return np.concatenate((X, predictions), axis=1)


In [None]:
#main
# Experiment driver: for every dataset folder found under DATA_PATH, fit a
# DeepRandomForest on each "<...>train<...>" CSV, evaluate it on the matching
# "<...>test<...>" CSV, and write the metrics to CSV files.  The whole
# procedure is run twice: with and without SMOTE oversampling of the train set.
DATA_PATH = './CRDP_data'
RESULTS_PATH = "./Results"
os.makedirs(RESULTS_PATH,exist_ok=True)

# Per dataset family: the feature columns to use and the outcome column.
datasets = {
    'ambros' : {
        "features" : ["numberOfVersionsUntil:","numberOfFixesUntil:","numberOfRefactoringsUntil:","numberOfAuthorsUntil:",
                      "linesAddedUntil:","maxLinesAddedUntil:","avgLinesAddedUntil:","linesRemovedUntil:","maxLinesRemovedUntil:",
                      "avgLinesRemovedUntil:","codeChurnUntil:","maxCodeChurnUntil:","avgCodeChurnUntil:","ageWithRespectTo:",
                      "weightedAgeWithRespectTo:"],
        "outcome" : 'bugs'
    },
    "ck" : {
        'features' : ['wmc', 'dit', 'noc', 'cbo', 'rfc', 'lcom',
                      'ca', 'ce', 'npm', 'lcom3', 'loc', 'dam', 'moa', 'mfa', 'cam', 'ic',
                      'cbm', 'amc', 'max_cc', 'avg_cc'],
        'outcome' : 'bug'
    },
    'eclipse' : {
        'features' : ["pre","ACD","FOUT_avg","FOUT_max","FOUT_sum","MLOC_avg","MLOC_max","MLOC_sum","NBD_avg",
                      "NBD_max","NBD_sum","NOF_avg","NOF_max","NOF_sum","NOI","NOM_avg","NOM_max","NOM_sum","NOT",
                      "NSF_avg","NSF_max","NSF_sum","NSM_avg","NSM_max","NSM_sum","PAR_avg","PAR_max","PAR_sum",
                      "TLOC","VG_avg","VG_max","VG_sum"],
        'outcome' : 'post'
    }
}


def compute_metrics(y_true, y_pred):
    """Return the classification metrics reported for one set of predictions."""
    return {
        "f1" : f1_score(y_true=y_true, y_pred=y_pred),
        'MCC' : matthews_corrcoef(y_true=y_true, y_pred=y_pred),
        "G" : geometric_mean_score(y_true=y_true, y_pred=y_pred),
        "precision" : precision_score(y_true=y_true, y_pred=y_pred),
        "tpr" : recall_score(y_true=y_true, y_pred=y_pred),
        "tnr" : specificity_score(y_true=y_true, y_pred=y_pred),
    }


for APPLY_SMOTE in [True, False]:
    run_tag = "DFDP_CRDP_SMOTE" if APPLY_SMOTE else "DFDP_CRDP"
    all_rows = []
    for dataset, dataset_data in datasets.items():
        dataset_path = os.path.join(DATA_PATH, dataset)
        if not os.path.exists(dataset_path):
            continue
        print('working on dataset:', dataset)
        features = dataset_data['features']
        outcome = dataset_data['outcome']

        # Sorted so that the row order of the result files is reproducible
        # (os.listdir returns entries in arbitrary order).
        train_files = sorted(name for name in os.listdir(dataset_path) if 'train' in name)

        dataset_rows = []
        # Iterating through tqdm advances the bar on every path (including the
        # skipped files) and closes it when the loop ends.
        for file in tqdm(train_files, desc=dataset):
            train_df = pd.read_csv(os.path.join(dataset_path, file))
            test_df = pd.read_csv(os.path.join(dataset_path, file.replace('train', 'test')))
            X_train, y_train = train_df[features], train_df[outcome]
            X_test, y_test = test_df[features], test_df[outcome]

            if APPLY_SMOTE:
                sm = SMOTE(random_state=43)
                try:
                    X_train, y_train = sm.fit_resample(X=train_df.loc[:, features].values,
                                                       y=train_df.loc[:, outcome].values)
                except Exception as exc:
                    # Best effort: skip this file, but say why, and let
                    # KeyboardInterrupt / SystemExit propagate.
                    print(file)
                    print("resampling problem")
                    print(repr(exc))
                    continue

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            dfdp_model = DeepRandomForest()
            try:
                dfdp_model.fit(X_train, y_train)
            except Exception as exc:
                print(file)
                print("fitting problem")
                print(repr(exc))
                continue

            # With SMOTE enabled the "train" metrics are computed on the
            # resampled training set, since X_train / y_train were replaced.
            train_predictions = dfdp_model.predict(X_train)
            test_predictions = dfdp_model.predict(X_test)

            row_header = {
                "file_id" : file,
                "algorithm" : "DFDP",
                'model_id' : 'best_performance_model',
            }
            dataset_rows.append({**row_header, 'train_or_test' : 'train',
                                 **compute_metrics(y_train, train_predictions)})
            dataset_rows.append({**row_header, 'train_or_test' : 'test',
                                 **compute_metrics(y_test, test_predictions)})

        all_rows.extend(dataset_rows)
        # Per-dataset file holds only this dataset's rows.
        dataset_results = pd.DataFrame(dataset_rows)
        dataset_results.to_csv(os.path.join(RESULTS_PATH, f"{run_tag}_{dataset}.csv"), index=False)

    all_results = pd.DataFrame(all_rows)
    # NOTE(review): the combined file goes to the current working directory,
    # not RESULTS_PATH -- kept as is; confirm whether that is intended.
    all_results.to_csv(f"{run_tag}_all.csv", index=False)