In [2]:
import os
import copy 
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from imblearn.metrics import geometric_mean_score
from imblearn.metrics import specificity_score
from imblearn.over_sampling import SMOTE
from  tqdm.notebook import tqdm

In [3]:
class DeepRandomForest:
    """Cascade of layers, each layer being ``param_M`` copies of one base estimator.

    Every layer receives the input features concatenated with the class
    probabilities produced by all previous layers. Layers are added while the
    accuracy on a held-out validation split keeps improving.

    Recognised ``configs`` keys (all optional):

    * ``estimator`` -- prototype classifier, deep-copied for every slot of a layer.
    * ``param_M`` -- number of estimators per layer.
    * ``standarize`` -- if true, fit a ``StandardScaler`` in ``fit`` and apply it
      again in ``predict_proba``.
    * ``param_tolerance`` -- a new layer is kept only if it beats the best
      validation accuracy by more than this amount (default ``0.0``).
    * ``maximum_layer_number`` -- upper bound on the number of layers.
    * ``validation_size`` -- ``test_size`` passed to ``train_test_split``.
    * ``random_state`` -- ``random_state`` passed to ``train_test_split``.
    """

    def __init__(self, configs=None) -> None:
        """Build the effective configuration from the defaults plus ``configs``.

        ``configs`` may be ``None``, a mapping, or an iterable of
        ``(name, value)`` pairs.
        """
        self.default_configs = {
            'estimator' : RandomForestClassifier(n_estimators=500, max_features='sqrt',n_jobs=-1),
            'param_M' : 4,
            'standarize' : False,
            'param_tolerance' : 0.0,
            'maximum_layer_number' : np.inf,
            'validation_size' : 0.2,
            'random_state' : None,
        }
        self.actual_configs = copy.deepcopy(self.default_configs)
        # Iterating a dict directly yields only its keys, so the overrides have
        # to be read through .items(); dict(...) also accepts a list of pairs.
        for param_name, param_value in dict(configs or {}).items():
            self.actual_configs[param_name] = param_value

    def fit(self, X, y):
        """Grow the cascade on ``(X, y)`` and return ``self``.

        The data is split once into a training and a validation part. A layer
        is appended only while it improves the validation accuracy of the
        previous best layer by more than ``param_tolerance`` and while the
        layer count is below ``maximum_layer_number``.
        """
        self.original_X = X
        self.original_y = y
        self.final_X = np.array(X)
        if self.actual_configs['standarize']:
            scaler = StandardScaler()
            self.final_X = scaler.fit_transform(X)
            # Kept so that predict_proba can apply the same transformation.
            self.actual_configs['scaler'] = scaler

        X_train, X_val, y_train, y_val = train_test_split(
            self.final_X,
            self.original_y,
            test_size=self.actual_configs['validation_size'],
            random_state=self.actual_configs.get('random_state'),
        )

        first_layer = self.create_layer()
        self.fit_layer(first_layer, X_train, y_train)
        self.layers = [first_layer]
        # Probability columns follow the estimator's class order; remember it
        # so argmax indices can be translated back into labels.
        self.classes_ = getattr(first_layer[0], 'classes_', None)

        layer_proba_prediction_train = self.compute_layer_prediction(first_layer, X_train)
        layer_proba_prediction_val = self.compute_layer_prediction(first_layer, X_val)
        ref_accuracy = self.cascade_accuracy(
            self._labels_from_proba(
                self._average_estimators(layer_proba_prediction_val, len(first_layer))
            ),
            y_val,
        )

        tolerance = self.actual_configs.get('param_tolerance', 0.0)
        while len(self.layers) < self.actual_configs['maximum_layer_number']:
            # Augment the inputs with the probabilities of the last accepted layer.
            X_train = update_X(X_train, layer_proba_prediction_train)
            X_val = update_X(X_val, layer_proba_prediction_val)

            new_layer = self.create_layer()
            self.fit_layer(new_layer, X_train, y_train)

            layer_proba_prediction_val = self.compute_layer_prediction(new_layer, X_val)
            new_ref_accuracy = self.cascade_accuracy(
                self._labels_from_proba(
                    self._average_estimators(layer_proba_prediction_val, len(new_layer))
                ),
                y_val,
            )
            if new_ref_accuracy <= ref_accuracy + tolerance:
                # No sufficient gain: discard the candidate layer and stop.
                break

            ref_accuracy = new_ref_accuracy
            self.layers.append(new_layer)
            layer_proba_prediction_train = self.compute_layer_prediction(new_layer, X_train)
        return self

    def create_layer(self):
        """Return ``param_M`` independent deep copies of the configured estimator."""
        return [copy.deepcopy(self.actual_configs['estimator']) for _ in range(self.actual_configs['param_M'])]

    def fit_layer(self, layer, X, y):
        """Fit every estimator of ``layer`` on the same ``(X, y)``."""
        for estimator in layer:
            estimator.fit(X, y)

    def predict(self, X):
        """Return the predicted class label for every row of ``X``."""
        return self._labels_from_proba(self.predict_proba(X))

    def predict_proba(self, X):
        """Return the last layer's averaged class probabilities for ``X``.

        ``X`` is pushed through all layers but the last, each time appending
        that layer's probability outputs as extra columns.
        """
        X_for_prediction = np.array(X)
        if self.actual_configs.get('standarize'):
            # Mirror the scaling applied to the training data in fit.
            X_for_prediction = self.actual_configs['scaler'].transform(X_for_prediction)
        for layer in self.layers[:-1]:
            layer_predictions = self.compute_layer_prediction(layer, X_for_prediction)
            X_for_prediction = update_X(X_for_prediction, layer_predictions)
        return self.layer_predict_proba(self.layers[-1], X_for_prediction)

    def compute_layer_prediction(self, layer, X):
        """Return the ``predict_proba`` outputs of all estimators, stacked column-wise."""
        return np.concatenate([estimator.predict_proba(X) for estimator in layer], axis=1)

    def layer_predict(self, layer, X):
        """Return the class label predicted by a single ``layer`` for each row of ``X``."""
        return self._labels_from_proba(self.layer_predict_proba(layer, X))

    def layer_predict_proba(self, layer, X):
        """Return the class probabilities of ``layer`` averaged over its estimators."""
        return self._average_estimators(self.compute_layer_prediction(layer, X), len(layer))

    def cascade_accuracy(self, cascade_preds, y_test):
        """Return the accuracy of ``cascade_preds`` against ``y_test``."""
        return accuracy_score(y_true=y_test,y_pred=cascade_preds)

    @staticmethod
    def _average_estimators(stacked_proba, n_estimators):
        """Average column-stacked per-estimator probabilities into one block per row."""
        return stacked_proba.reshape(len(stacked_proba), n_estimators, -1).mean(axis=1)

    def _labels_from_proba(self, proba):
        """Translate the arg-max column of ``proba`` into a class label.

        Falls back to the raw column index when no class order was recorded.
        """
        winners = proba.argmax(axis=1)
        classes = getattr(self, 'classes_', None)
        if classes is None:
            return winners
        return np.asarray(classes)[winners]
        
def update_X(X, predictions):
    """Return a new array made of ``X`` with ``predictions`` appended as extra columns.

    Both inputs must have the same number of rows. ``np.concatenate`` always
    allocates a fresh result, so neither input is modified and no defensive
    copy of ``X`` is needed.
    """
    return np.concatenate((X, predictions), axis=1)


In [5]:
#main
# Directory holding the <name>_train.csv / <name>_test.csv pairs. The
# DFDP_DATA_PATH environment variable overrides the machine-specific default.
DATA_PATH = os.environ.get(
    'DFDP_DATA_PATH',
    'C:/Users/Motaz/Desktop/work/DP_performance_complexity/data/Dataset',
)
RESULTS_PATH = "./Results"
NRUNS = 31  # independent fits per dataset

os.makedirs(RESULTS_PATH,exist_ok=True)

FEATURES = [
    'CountDeclMethodPrivate', 'AvgLineCode', 'CountLine',
    'MaxCyclomatic', 'CountDeclMethodDefault', 'AvgEssential',
    'CountDeclClassVariable', 'SumCyclomaticStrict', 'AvgCyclomatic',
    'AvgLine', 'CountDeclClassMethod', 'AvgLineComment',
    'AvgCyclomaticModified', 'CountDeclFunction', 'CountLineComment',
    'CountDeclClass', 'CountDeclMethod', 'SumCyclomaticModified',
    'CountLineCodeDecl', 'CountDeclMethodProtected',
    'CountDeclInstanceVariable', 'MaxCyclomaticStrict',
    'CountDeclMethodPublic', 'CountLineCodeExe', 'SumCyclomatic',
    'SumEssential', 'CountStmtDecl', 'CountLineCode', 'CountStmtExe',
    'RatioCommentToCode', 'CountLineBlank', 'CountStmt',
    'MaxCyclomaticModified', 'CountSemicolon', 'AvgLineBlank',
    'CountDeclInstanceMethod', 'AvgCyclomaticStrict',
    'PercentLackOfCohesion', 'MaxInheritanceTree', 'CountClassDerived',
    'CountClassCoupled', 'CountClassBase', 'CountInput_Max',
    'CountInput_Mean', 'CountInput_Min', 'CountOutput_Max',
    'CountOutput_Mean', 'CountOutput_Min', 'CountPath_Max',
    'CountPath_Mean', 'CountPath_Min', 'MaxNesting_Max', 'MaxNesting_Mean',
    'MaxNesting_Min', 'COMM', 'ADEV', 'DDEV', 'Added_lines', 'Del_lines',
    'OWN_LINE', 'OWN_COMMIT', 'MINOR_COMMIT', 'MINOR_LINE', 'MAJOR_COMMIT',
    'MAJOR_LINE'
]
TARGET = 'RealBug'


def compute_metrics(y_true, y_pred):
    """Return the evaluation metrics stored for one prediction vector."""
    return {
        "f1" : f1_score(y_true=y_true, y_pred=y_pred),
        'MCC' : matthews_corrcoef(y_true=y_true, y_pred=y_pred),
        "G": geometric_mean_score(y_true=y_true, y_pred=y_pred),
        "precision": precision_score(y_true=y_true, y_pred=y_pred),
        "tpr" : recall_score(y_true=y_true, y_pred=y_pred),
        "tnr" : specificity_score(y_true=y_true, y_pred=y_pred),
    }


for APPLY_SMOTE in [True, False]:
    all_results = []
    results_prefix = "DFDP_CRDP_SMOTE" if APPLY_SMOTE else "DFDP_CRDP"

    for file in os.listdir(DATA_PATH):
        # Only training files drive the loop; the matching test file is derived from the name.
        if 'train' not in file:
            continue

        print("Working on:", file)
        dataset_results = []
        train_df = pd.read_csv(os.path.join(DATA_PATH,file))
        test_df = pd.read_csv(os.path.join(DATA_PATH,file.replace('train','test')))

        for run in range(NRUNS):
            X_train, y_train = train_df[FEATURES], train_df[TARGET]
            X_test, y_test = test_df[FEATURES], test_df[TARGET]
            print("*******run:", run)
            if APPLY_SMOTE:
                sm = SMOTE(random_state=42)
                try:
                    X_train, y_train = sm.fit_resample(X=X_train, y=y_train)
                except Exception as e:
                    print(e)
                    print(file)
                    print("resampling problem")
                    continue

            # Scale with statistics from the (possibly resampled) training data only.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            dfdp_model = DeepRandomForest()
            try:
                print("fitting DFDP")
                dfdp_model.fit(X_train,y_train)
                print("fitting done!")
            except Exception as e:
                # Catch Exception rather than everything, so a kernel interrupt
                # still stops the experiment, and report what actually failed.
                print(e)
                print(file)
                print("fitting problem")
                continue

            train_predictions = dfdp_model.predict(X_train)
            test_predictions = dfdp_model.predict(X_test)

            run_info = {
                "file_id" : file,
                "algorithm" : "DFDP",
                'model_id' : 'best_performance_model',
                "run": run
            }
            # Rebuild each row in the column order of the saved CSVs:
            # file_id, algorithm, model_id, train_or_test, run, then the metrics.
            for split_name, y_true, y_pred in (
                ('train', y_train, train_predictions),
                ('test', y_test, test_predictions),
            ):
                row = {
                    "file_id" : run_info["file_id"],
                    "algorithm" : run_info["algorithm"],
                    'model_id' : run_info['model_id'],
                    'train_or_test': split_name,
                    "run": run_info["run"],
                }
                row.update(compute_metrics(y_true, y_pred))
                dataset_results.append(row)

        all_results += dataset_results
        dataset_results = pd.DataFrame(dataset_results)
        dataset_results.to_csv(
            os.path.join(RESULTS_PATH, f"{results_prefix}_{file.replace('.csv', '')}.csv"),
            index=False,
        )

    # NOTE(review): the aggregate file is written to the working directory,
    # not to RESULTS_PATH like the per-dataset files -- confirm this is intended.
    all_results = pd.DataFrame(all_results)
    all_results.to_csv(f"{results_prefix}_all.csv",index=False)

Working on: activemq-5.0.0_train.csv
*******run: 0
fitting DFDP
fitting done!
*******run: 1
fitting DFDP
fitting done!
*******run: 2
fitting DFDP
fitting done!
*******run: 3
fitting DFDP
fitting done!
*******run: 4
fitting DFDP
fitting done!
*******run: 5
fitting DFDP
fitting done!
*******run: 6
fitting DFDP
fitting done!
*******run: 7
fitting DFDP
fitting done!
*******run: 8
fitting DFDP
fitting done!
*******run: 9
fitting DFDP
fitting done!
*******run: 10
fitting DFDP
fitting done!
*******run: 11
fitting DFDP
fitting done!
*******run: 12
fitting DFDP
fitting done!
*******run: 13
fitting DFDP
fitting done!
*******run: 14
fitting DFDP
fitting done!
*******run: 15
fitting DFDP
fitting done!
*******run: 16
fitting DFDP
fitting done!
*******run: 17
fitting DFDP
fitting done!
*******run: 18
fitting DFDP
fitting done!
*******run: 19
fitting DFDP
fitting done!
*******run: 20
fitting DFDP
fitting done!
*******run: 21
fitting DFDP
fitting done!
*******run: 22
fitting DFDP
fitting done!
*******

In [None]:
# Read one training CSV from the "eclipse" sub-folder of DATA_PATH for inspection.
dataset_path = os.path.join(DATA_PATH, "eclipse")
example_file = pd.read_csv(
    os.path.join(dataset_path, "eclipse-2.0_exp_2_train.csv")
)

In [10]:
# Bare last expression: renders the frame read into `example_file` as the cell output.
example_file

Unnamed: 0,post,pre,ACD,FOUT_avg,FOUT_max,FOUT_sum,MLOC_avg,MLOC_max,MLOC_sum,NBD_avg,...,NSM_avg,NSM_max,NSM_sum,PAR_avg,PAR_max,PAR_sum,TLOC,VG_avg,VG_max,VG_sum
0,0,0,0.0,0.000000,0.0,0.0,1.000000,1.0,1.0,1.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,8.0,1.000000,1.0,1.0
1,1,7,3.0,4.470588,15.0,76.0,8.117647,26.0,138.0,2.470588,...,2.0,2.0,2.0,0.588235,2.0,10.0,202.0,2.235294,9.0,38.0
2,0,0,1.0,7.285714,24.0,51.0,11.285714,34.0,79.0,1.571429,...,0.0,0.0,0.0,0.571429,1.0,4.0,111.0,2.857143,10.0,20.0
3,0,0,0.0,1.666667,8.0,35.0,4.095238,22.0,86.0,1.190476,...,0.0,0.0,0.0,0.380952,2.0,8.0,166.0,2.000000,5.0,42.0
4,0,0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,8.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6724,0,0,0.0,1.000000,3.0,10.0,5.900000,23.0,59.0,1.500000,...,0.0,0.0,0.0,1.000000,3.0,10.0,87.0,2.500000,8.0,25.0
6725,0,2,0.0,0.777778,2.0,7.0,1.000000,1.0,9.0,1.000000,...,0.0,0.0,0.0,2.333333,5.0,21.0,32.0,1.000000,1.0,9.0
6726,0,0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,3.000000,3.0,3.0,7.0,0.000000,0.0,0.0
6727,1,0,0.0,4.200000,13.0,21.0,9.000000,20.0,45.0,1.800000,...,0.0,0.0,0.0,1.400000,2.0,7.0,66.0,5.200000,12.0,26.0
