In [None]:
import pandas as pd 
import random 
import copy
import numpy as np
import math
from copy import deepcopy
import os
import traceback


from pymoo.core.problem import ElementwiseProblem
from pymoo.core.variable import Choice, Variable, Real, Binary, Integer

from pymoo.algorithms.base.genetic import GeneticAlgorithm
from pymoo.algorithms.soo.nonconvex.ga import FitnessSurvival
from pymoo.core.duplicate import ElementwiseDuplicateElimination
from pymoo.core.individual import Individual
from pymoo.core.infill import InfillCriterion
from pymoo.core.population import Population
from pymoo.core.problem import Problem
from pymoo.core.sampling import Sampling
from pymoo.core.variable import Choice, Real, Integer, Binary, BoundedVariable
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.crossover.ux import UX
from pymoo.operators.mutation.bitflip import BFM
from pymoo.operators.mutation.pm import PM
from pymoo.operators.mutation.rm import ChoiceRandomMutation
from pymoo.operators.repair.rounding import RoundingRepair
from pymoo.operators.selection.rnd import RandomSelection
from pymoo.util.display.single import SingleObjectiveOutput
from pymoo.operators.crossover.pntx import SinglePointCrossover
from pymoo.operators.mutation.bitflip import BitflipMutation
from pymoo.core.mixed import MixedVariableMating, MixedVariableGA, MixedVariableSampling, MixedVariableDuplicateElimination
from pymoo.optimize import minimize
from pymoo.algorithms.moo.nsga2 import NSGA2, RankAndCrowdingSurvival
from pymoo.core.crossover import Crossover
from pymoo.core.mutation import Mutation
from pymoo.core.duplicate import ElementwiseDuplicateElimination
from sklearn.neighbors import KernelDensity




from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

from multiprocessing.pool import ThreadPool
from sklearn.base import BaseEstimator, ClassifierMixin



#from mlxtend.classifier import EnsembleVoteClassifier
#from mlxtend.classifier import StackingClassifier

from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from imblearn.metrics import geometric_mean_score



In [None]:
# Directory scanned for the train/test CSV files; hard-coded to a local Windows path.
DATA_PATH = "C:/Users/Motaz/Desktop/work/TSE_R3/PyMEG/data/CV_data/ck"
# Columns selected from each CSV as model inputs.
FEATURES = [
    'wmc', 'dit', 'noc', 'cbo', 'rfc', 'lcom', 'ca', 'ce', 'npm', 'lcom3',
       'loc', 'dam', 'moa', 'mfa', 'cam', 'ic', 'cbm', 'amc', 'max_cc',
       'avg_cc'
]
# Column selected from each CSV as the label.
TARGET = 'bug'
# Prefix of the results CSV written by the experiment loop.
DATASET_NAME ='ck_data'

In [None]:
class KDENBClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on KDE

    One ``KernelDensity`` model is fitted per class; posteriors combine each
    model's log-density with the log of the class frequency.

    Parameters
    ----------
    bandwidth : float
        the kernel bandwidth within each class
    kernel : str
        the kernel name, passed to KernelDensity
    """
    def __init__(self, bandwidth=0.01, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel

    def fit(self, X, y):
        """Fit one density model and one log-prior per distinct label in ``y``.

        Returns ``self``.
        """
        self.classes_ = np.sort(np.unique(y))

        # Rows of X grouped by class, in the order of self.classes_.
        training_sets = [X[y == yi] for yi in self.classes_]

        self.models_ = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                        for Xi in training_sets]

        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self

    def predict_proba(self, X):
        """Return class posteriors, one row per sample and one column per class.

        The computation stays in log space until the largest log-joint of
        each row has been subtracted. Exponentiating the raw values underflows
        to 0 for every class when the densities are tiny, which turns the
        whole row into zeros.
        """
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        log_joint = logprobs + self.logpriors_

        # Shift each row by its maximum; rows without a finite maximum are
        # left unshifted so that no (-inf) - (-inf) is evaluated.
        row_max = log_joint.max(axis=1, keepdims=True)
        row_max = np.where(np.isfinite(row_max), row_max, 0.0)
        result = np.exp(log_joint - row_max)

        # Rows whose joint is zero for every class give 0/0; they are
        # reported as all-zero rows via nan_to_num.
        with np.errstate(invalid='ignore', divide='ignore'):
            proba = result / result.sum(1, keepdims=True)
        return np.nan_to_num(proba)

    def predict(self, X):
        """Return, for each sample, the class with the largest posterior."""
        return self.classes_[np.argmax(self.predict_proba(X), 1)]
    
class ArrayVaraiable(Variable):
    """Variable whose samples are fixed-length arrays of one element variable.

    Parameters
    ----------
    length : int
        Number of elements drawn for each sample.
    vtype : Variable instance or Variable subclass
        Element variable. A class is instantiated with no arguments; an
        instance is deep-copied so the caller's object is not shared.
    **kwargs
        Forwarded to ``Variable``.
    """
    def __init__(self, length, vtype, **kwargs) -> None:
        super().__init__(**kwargs)
        self.length = length
        # ``sample`` needs an instance, so a bare class is instantiated here.
        self.vtype = vtype() if isinstance(vtype, type) else copy.deepcopy(vtype)

    def _sample(self, n):
        """Draw ``n`` samples, each made of ``length`` element draws."""
        # ``n`` is a count, so it has to be wrapped in range() to be iterated.
        return np.array([[self.vtype.sample(1) for _ in range(self.length)]
                         for _ in range(n)])
        

class BinaryArrayVariable(ArrayVaraiable):
    """Array variable whose elements are ``Binary`` variables."""

    def __init__(self, length, **kwargs):
        # Pass an instance (not the class) so the element variable can be
        # sampled, and forward the keyword arguments instead of dropping them.
        super().__init__(length=length, vtype=Binary(), **kwargs)
    
class HyperParametersArray(Variable):
    """Variable that samples one value from each of a fixed list of variables.

    Parameters
    ----------
    **kwargs
        Forwarded to ``Variable``.
    """
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.variables = [
            #NB
            Binary(),
            Binary(),
            Binary(),
            #KNN
            Integer(bounds=[1, 17]),
            Integer(bounds=[1, 17]),
            Integer(bounds=[1, 17]),
            #SVM
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            #DT
            Real(bounds=[0, 0.3]),
        ]

    def _sample(self, n):
        """Return ``n`` rows, each holding one draw from every variable."""
        # The array was previously built and then discarded (no return).
        return np.array([
            [variable.sample(1) for variable in self.variables]
            for _ in range(n)
        ])

class MEGVariable(Variable):
    """Variable whose samples are ensemble-description dicts.

    Each sample has three keys: ``'ensemble'`` (a list of 0/1 flags, one per
    entry of ``bounds``), ``'parameters'`` (one draw from each entry of
    ``bounds``) and ``'voting_strategy'`` (one of four strategy names).

    Parameters
    ----------
    bounds : list of Variable
        Per-position variables used to draw the ``'parameters'`` values.
    **kwargs
        Forwarded to ``Variable``.
    """
    def __init__(self, bounds, **kwargs):
        # Initialise the base class so the keyword arguments take effect
        # instead of being silently discarded.
        super().__init__(**kwargs)
        self.bounds = bounds

    def _sample(self, n):
        """Return an object array of ``n`` freshly drawn description dicts."""
        new_vars = []
        for _ in range(n):
            var = {
                'ensemble': [random.choice([0, 1]) for _ in range(len(self.bounds))],
                'parameters': [v.sample(1)[0] for v in self.bounds],
                'voting_strategy': random.choice(['majority', 'stacking', 'average_voting', 'weighted_average_voting'])
            }
            new_vars.append(var)
        return np.array(new_vars)

class MEGCrossover(Crossover):
    """Single-point crossover over the 'ensemble' and 'parameters' lists.

    Takes two parents and yields two children. With probability ``prob`` the
    tails of both lists (from one shared cut position onwards) are exchanged
    between the children; otherwise the children are plain copies of their
    parents. The 'voting_strategy' entry is always inherited unchanged.
    """

    def __init__(self,prob=0.95):
        self.proba = prob
        # two parents in, two offspring out
        super().__init__(2, 2)

    def _do(self, problem, X, **kwargs):
        # X is indexed as (parent, mating, variable)
        _n_parents, n_matings, _n_var = X.shape

        # the result keeps X's shape since parents and offspring are equal in number
        Y = np.full_like(X, None, dtype=object)

        for k in range(n_matings):
            parent_a = X[0, k, 0]
            parent_b = X[1, k, 0]

            # children start out as independent copies of their parents
            child_a = copy.deepcopy(parent_a)
            child_b = copy.deepcopy(parent_b)

            if random.random() < self.proba:
                cut = random.choice(range(1, len(parent_a['ensemble'])))
                for gene in ('ensemble', 'parameters'):
                    child_a[gene][cut:] = parent_b[gene][cut:]
                    child_b[gene][cut:] = parent_a[gene][cut:]

            Y[0, k, 0] = child_a
            Y[1, k, 0] = child_b
        return Y

class MEGMutation(Mutation):
    """In-place mutation of ensemble-description dicts.

    For every gene position one random number is drawn; when it falls below
    ``prob`` the ensemble bit is flipped and the parameter at that position is
    redrawn from the matching entry of ``self.bounds``. Independently, the
    voting strategy is redrawn with probability ``self.stratgy_proba``.
    """

    def __init__(self, prob = 0.07):
        # positions 0-2: binary, 3-5: KNN, 6-9: SVM, 10-14: DT
        self.bounds = (
            [Binary() for _ in range(3)]
            + [Integer(bounds=[1, 17]) for _ in range(3)]
            + [Integer(bounds=[1, 51]) for _ in range(4)]
            + [Real(bounds=[0, 0.3]) for _ in range(5)]
        )
        self.proba = prob
        self.stratgy_proba = 0.25
        super().__init__()

    def _do(self, problem, X, **kwargs):
        for row in range(len(X)):
            genome = X[row, 0]

            for position in range(len(genome['ensemble'])):
                # one draw decides both the bit flip and the parameter redraw
                if random.random() < self.proba:
                    genome['ensemble'][position] = 1 - genome['ensemble'][position]
                    genome['parameters'][position] = self.bounds[position].sample(1)[0]

            if random.random() < self.stratgy_proba:
                genome['voting_strategy'] = random.choice(['majority', 'stacking', 'average_voting', 'weighted_average_voting'])
        return X

class MEGLearner(Problem):
    """Two-objective pymoo problem that scores candidate classifier ensembles.

    A solution is a dict with a single ``"MEGVar"`` entry holding an
    ``'ensemble'`` bit list (which base models are switched on), a
    ``'parameters'`` list (one value per base model) and a
    ``'voting_strategy'`` string. Gene positions 0-2 map to naive Bayes,
    3-5 to k-NN, 6-9 to SVM and 10 onwards to decision trees.

    Both objectives are minimised: ``1 - MCC`` of the fitted ensemble and
    ``1 - mean pairwise disagreement`` of its members. The single inequality
    constraint marks solutions with fewer than two active models infeasible.

    Parameters
    ----------
    X, y
        Features and labels used to fit and score the ensembles.
    test_size : float or None
        Forwarded to ``train_test_split``. When None, ensembles are fitted
        and scored on the same data.
    **kwargs
        Forwarded to ``Problem``.
    """

    def __init__(self, X, y, test_size, **kwargs):
        self.bounds = [
            # NB
            Binary(),
            Binary(),
            Binary(),
            # KNN
            Integer(bounds=[1, 17]),
            Integer(bounds=[1, 17]),
            Integer(bounds=[1, 17]),
            # SVM
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            # DT
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
        ]
        vars = {
            "MEGVar": MEGVariable(bounds=self.bounds),
        }
        self.X = X
        self.y = y
        self.test_size = test_size

        # Prototype estimator per family and the name of the parameter that
        # the solution's 'parameters' value is written to.
        self.default_models = {
            'DT': {
                'default': DecisionTreeClassifier(),
                'param': 'ccp_alpha'
            },
            # The NB parameter is never applied: build_ensemble leaves NB
            # models at their defaults.
            'NB': {
                'default': GaussianNB(),
                'param': 'kernel'
            },
            # NOTE(review): cache_size only sizes SVC's kernel cache; a model
            # hyper-parameter was presumably intended here - confirm.
            'SVM': {
                'default': SVC(probability=True),
                'param': 'cache_size'
            },
            'KNN': {
                'default': KNeighborsClassifier(n_jobs=-1),
                'param': 'n_neighbors'
            }
        }
        super().__init__(vars=vars, n_obj=2, n_ieq_constr=1, **kwargs)

    def _evaluate(self, x, out, *args, **kwargs):
        """Score every solution in ``x`` and store the stacked F and G arrays in ``out``."""
        # The context manager shuts the worker threads down once map() has
        # returned; otherwise every call would leave a pool behind.
        with ThreadPool(4) as pool:
            output = pool.map(self._evaluate_one_sol, x)
        out['F'] = np.array([o['F'] for o in output])
        out['G'] = np.array([o['G'] for o in output])

    def _evaluate_one_sol(self, x):
        """Return ``{'F': [1 - mcc, 1 - diversity], 'G': [g]}`` for one solution.

        Solutions with fewer than two active models get the penalty values
        written by ``build_models_from_solution``.
        """
        out = {'F': [], 'G': []}
        ready_models = self.build_models_from_solution(sol=x, out=out)
        if ready_models is None:
            print('oups None!!')
            return out

        if self.test_size is None:
            # No hold-out requested: fit and score on one private copy.
            X_train = X_val = copy.deepcopy(self.X)
            y_train = y_val = copy.deepcopy(self.y)
        else:
            X_train, X_val, y_train, y_val = train_test_split(self.X, self.y, test_size=self.test_size)

        ready_models.fit(X_train, y_train)

        mcc_score = matthews_corrcoef(y_val, ready_models.predict(X_val))
        models_predictions = self.compute_models_predictions(ready_models, X_val)
        diversity = self.compute_deversity_metric(models_predictions, np.array(y_val))
        out['F'].append(1 - mcc_score)
        out['F'].append(1 - diversity)
        out['G'].append(-1)

        return out

    def build_models_from_solution(self, sol, out):
        """Build the unfitted ensemble described by ``sol``.

        Returns None when fewer than two models are active; in that case the
        penalty objectives (2, 1) and constraint value 1 are appended to
        ``out``.
        """
        genome = sol["MEGVar"]
        models = []
        for index, bit in enumerate(genome['ensemble']):
            # Equality (not truthiness) is kept so only 1/True activates a model.
            if bit == True:
                if index < 3:
                    family, offset = 'NB', 0
                elif index < 6:
                    family, offset = 'KNN', 3
                elif index < 10:
                    family, offset = 'SVM', 6
                else:
                    family, offset = 'DT', 10
                models.append((family, family + '-' + str(index - offset),
                               self.default_models[family], genome['parameters'][index]))

        if len(models) <= 1:
            out["F"].append(2)
            out["F"].append(1)
            out["G"].append(1)
            return None
        return self.build_ensemble(models, genome['voting_strategy'])

    def build_ensemble(self, models, voting_strategy):
        """Assemble an unfitted ensemble from ``(family, name, data, value)`` tuples.

        ``'stacking'`` gives a ``StackingClassifier`` whose final estimator is
        the first base model; ``'average_voting'`` gives soft voting;
        ``'weighted_average_voting'`` gives hard voting with the first model
        entered twice; any other value gives plain hard voting.
        """
        ready_models = []
        for model_name, full_model_name, model_data, model_param_value in models:
            estimator = copy.deepcopy(model_data['default'])
            if model_name != 'NB':
                estimator.set_params(**{model_data['param']: model_param_value})
            ready_models.append((full_model_name, estimator))

        if voting_strategy == 'stacking':
            return StackingClassifier(estimators=ready_models, final_estimator=ready_models[0][1], n_jobs=-1)
        if voting_strategy == 'average_voting':
            return VotingClassifier(estimators=ready_models, voting='soft', flatten_transform=False, n_jobs=-1)

        estimators = ready_models
        if voting_strategy == 'weighted_average_voting':
            # The duplicate needs its own name: VotingClassifier rejects
            # estimator lists with repeated names.
            first_name, first_model = ready_models[0]
            estimators = [(first_name + "-weighted", copy.deepcopy(first_model))] + ready_models
        return VotingClassifier(estimators=estimators, voting='hard', flatten_transform=False, n_jobs=-1)

    def compute_deversity_metric(self, model_predictions, y):
        """Return the mean pairwise diversity over all pairs of rows of ``model_predictions``."""
        n_models = model_predictions.shape[0]
        diversities = []
        for model_1_index in range(n_models):
            for model_2_index in range(model_1_index + 1, n_models):
                diversities.append(self.compute_diversity(
                    model_predictions[model_1_index, :],
                    model_predictions[model_2_index, :],
                    y))
        return np.mean(diversities)

    def compute_diversity(self, classifier_1_predictions, classifier_2_predictions, y):
        """Return the fraction of instances that exactly one of the two classifiers gets right."""
        correct_1 = np.equal(classifier_1_predictions, y)
        correct_2 = np.equal(classifier_2_predictions, y)
        # N_1_0 + N_0_1: instances where the two correctness flags differ.
        disagreements = int(np.count_nonzero(correct_1 != correct_2))
        # N_1_0 + N_0_1 + N_1_1 + N_0_0: every instance falls in one cell.
        return disagreements / int(correct_1.size)

    def compute_models_predictions(self, ensemble, X):
        """Return the member predictions as an array of shape (n_models, n_samples).

        For a ``StackingClassifier`` the members are queried directly, because
        its ``transform`` yields the meta-features fed to the final estimator
        rather than class labels. Members of both ensemble types are trained
        on label-encoded targets, so their outputs are mapped back through
        ``ensemble.classes_`` before being compared with the true labels.
        """
        if isinstance(ensemble, StackingClassifier):
            encoded = np.array([member.predict(X) for member in ensemble.estimators_])
        else:
            model_predicion = ensemble.transform(X)
            if len(model_predicion.shape) == 2:
                # hard voting: (n_samples, n_models) label matrix
                encoded = model_predicion.T
            else:
                # soft voting: (n_models, n_samples, n_classes) probabilities
                encoded = np.argmax(model_predicion, axis=-1)
        return np.asarray(ensemble.classes_)[np.asarray(encoded).astype(int)]
                

In [None]:
class MEGVariable(Variable):
    """Variable whose samples are ensemble-description dicts.

    Each sample has three keys: ``'ensemble'`` (a list of 0/1 flags, one per
    entry of ``bounds``), ``'parameters'`` (one draw from each entry of
    ``bounds``) and ``'voting_strategy'`` (one of four strategy names).

    Parameters
    ----------
    bounds : list of Variable
        Per-position variables used to draw the ``'parameters'`` values.
    **kwargs
        Forwarded to ``Variable``.
    """
    def __init__(self, bounds, **kwargs):
        # Initialise the base class so the keyword arguments take effect
        # instead of being silently discarded.
        super().__init__(**kwargs)
        self.bounds = bounds

    def _sample(self, n):
        """Return an object array of ``n`` freshly drawn description dicts."""
        new_vars = []
        for _ in range(n):
            var = {
                'ensemble': [random.choice([0, 1]) for _ in range(len(self.bounds))],
                'parameters': [v.sample(1)[0] for v in self.bounds],
                'voting_strategy': random.choice(['majority',  'average_voting', 'stacking' , 'weighted_average_voting'])
            }
            new_vars.append(var)
        return np.array(new_vars)

class MEGCrossover(Crossover):
    """Single-point crossover over the 'ensemble' and 'parameters' lists.

    Takes two parents and yields two children. With probability ``prob`` the
    tails of both lists (from one shared cut position onwards) are exchanged
    between the children; otherwise the children are plain copies of their
    parents. The 'voting_strategy' entry is always inherited unchanged.
    """

    def __init__(self,prob=0.95):
        self.proba = prob
        # two parents in, two offspring out
        super().__init__(2, 2)

    def _do(self, problem, X, **kwargs):
        # X is indexed as (parent, mating, variable)
        _n_parents, n_matings, _n_var = X.shape

        # the result keeps X's shape since parents and offspring are equal in number
        Y = np.full_like(X, None, dtype=object)

        for k in range(n_matings):
            parent_a = X[0, k, 0]
            parent_b = X[1, k, 0]

            # children start out as independent copies of their parents
            child_a = copy.deepcopy(parent_a)
            child_b = copy.deepcopy(parent_b)

            if random.random() < self.proba:
                cut = random.choice(range(1, len(parent_a['ensemble'])))
                for gene in ('ensemble', 'parameters'):
                    child_a[gene][cut:] = parent_b[gene][cut:]
                    child_b[gene][cut:] = parent_a[gene][cut:]

            Y[0, k, 0] = child_a
            Y[1, k, 0] = child_b
        return Y

class MEGMutation(Mutation):
    """In-place mutation of ensemble-description dicts.

    For every gene position one random number is drawn; when it falls below
    ``prob`` the ensemble bit is flipped and the parameter at that position is
    redrawn from the matching entry of ``self.bounds``. Independently, the
    voting strategy is redrawn with probability ``self.stratgy_proba``.
    """

    def __init__(self, prob = 0.07):
        # positions 0-2: binary, 3-5: KNN, 6-9: SVM, 10-14: DT
        self.bounds = (
            [Binary() for _ in range(3)]
            + [Integer(bounds=[1, 17]) for _ in range(3)]
            + [Integer(bounds=[1, 51]) for _ in range(4)]
            + [Real(bounds=[0, 0.3]) for _ in range(5)]
        )
        self.proba = prob
        self.stratgy_proba = 0.25
        super().__init__()

    def _do(self, problem, X, **kwargs):
        for row in range(len(X)):
            genome = X[row, 0]

            for position in range(len(genome['ensemble'])):
                # one draw decides both the bit flip and the parameter redraw
                if random.random() < self.proba:
                    genome['ensemble'][position] = 1 - genome['ensemble'][position]
                    genome['parameters'][position] = self.bounds[position].sample(1)[0]

            if random.random() < self.stratgy_proba:
                genome['voting_strategy'] = random.choice(['majority',  'average_voting', 'stacking', 'weighted_average_voting'])
        return X


class MEGDuplicateElimination(ElementwiseDuplicateElimination):
    """Duplicate check for individuals carrying ensemble-description dicts.

    Two individuals are equal when their voting strategy, ensemble bits and
    parameter values all match.
    """

    @staticmethod
    def _genome(individual):
        """Return the description dict, unwrapping a ``{"MEGVar": ...}`` container if present."""
        x = individual.X
        if isinstance(x, dict) and 'MEGVar' in x:
            x = x['MEGVar']
        return x

    def is_equal(self, a, b):
        """Return True when ``a`` and ``b`` describe the same ensemble."""
        genome_a = self._genome(a)
        genome_b = self._genome(b)

        # The key written by the sampler and the mutation is
        # 'voting_strategy' (it was looked up as 'vote_strategy').
        if genome_a['voting_strategy'] != genome_b['voting_strategy']:
            return False
        if len(genome_a['ensemble']) != len(genome_b['ensemble']):
            return False
        for index in range(len(genome_a['ensemble'])):
            if genome_a['ensemble'][index] != genome_b['ensemble'][index]:
                return False
            # 'parameters' was misspelled as 'paraneters' on the b side.
            if genome_a['parameters'][index] != genome_b['parameters'][index]:
                return False
        return True
class MixedVariableMating(InfillCriterion):
    """Mating that applies a crossover and a mutation per variable type.

    The problem's variables are grouped by their Python type; each group is
    recombined with the operator registered for that type in ``crossover``
    and then mutated with the one registered in ``mutation``. ``Choice``
    variables are handled one variable at a time. When no operator mapping is
    supplied, defaults for Binary, Real, Integer and Choice are used.
    """

    def __init__(self,
                 selection=RandomSelection(),
                 crossover=None,
                 mutation=None,
                 repair=None,
                 eliminate_duplicates=True,
                 n_max_iterations=100,
                 **kwargs):

        super().__init__(repair, eliminate_duplicates, n_max_iterations, **kwargs)

        self.selection = selection

        # default crossover per variable type
        self.crossover = crossover if crossover is not None else {
            Binary: UX(),
            Real: SBX(),
            Integer: SBX(vtype=float, repair=RoundingRepair()),
            Choice: UX(),
        }

        # default mutation per variable type
        self.mutation = mutation if mutation is not None else {
            Binary: BFM(),
            Real: PM(),
            Integer: PM(vtype=float, repair=RoundingRepair()),
            Choice: ChoiceRandomMutation(),
        }

    def _do(self, problem, pop, n_offsprings, parents=False, **kwargs):

        # every crossover is assumed to take two parents and return two children
        n_parents_per_mating = 2
        n_children_per_mating = 2

        problem_vars = problem.vars

        # variable names bucketed by the type of their variable object
        names_by_type = {}
        for name, variable in problem_vars.items():
            names_by_type.setdefault(type(variable), []).append(name)

        # recombination groups; Choice variables are split into singletons
        # because of their differing data types
        groups = []
        for var_type, names in names_by_type.items():
            if var_type == Choice:
                groups.extend((var_type, [name]) for name in names)
            else:
                groups.append((var_type, names))

        # offspring start as empty dicts that are filled group by group
        off = Population.new(X=[{} for _ in range(n_offsprings)])

        if not parents:
            n_select = math.ceil(n_offsprings / n_children_per_mating)
            pop = self.selection(problem, pop, n_select, n_parents_per_mating, **kwargs)

        for var_type, names in groups:
            crossover = self.crossover[var_type]
            assert crossover.n_parents == n_parents_per_mating and crossover.n_offsprings == n_children_per_mating

            # parents restricted to the variables of this group
            sub_parents = [
                [Individual(X=np.array([member.X[name] for name in names])) for member in family]
                for family in pop
            ]

            # reduced problem describing only this group's variables
            sub_vars = {name: problem_vars[name] for name in names}
            lower = np.array([getattr(problem_vars[name], "lb", None) for name in names])
            upper = np.array([getattr(problem_vars[name], "ub", None) for name in names])
            sub_problem = Problem(vars=sub_vars, xl=lower, xu=upper)

            children = crossover(sub_problem, sub_parents, **kwargs)
            mutation = self.mutation[var_type]
            children = mutation(sub_problem, children, **kwargs)

            # write the group's values back under their variable names
            for k in range(n_offsprings):
                for position, name in enumerate(names):
                    off[k].X[name] = children[k].X[position]

        return off
class MEGLearner(Problem):
    """Two-objective pymoo problem that scores candidate classifier ensembles.

    A solution is a dict with a single ``"MEGVar"`` entry holding an
    ``'ensemble'`` bit list (which base models are switched on), a
    ``'parameters'`` list (one value per base model) and a
    ``'voting_strategy'`` string. Gene positions 0-2 map to naive Bayes,
    3-5 to k-NN, 6-9 to SVM and 10 onwards to decision trees.

    Both objectives are minimised: ``1 - MCC`` of the fitted ensemble and
    ``1 - mean pairwise disagreement`` of its members. The single inequality
    constraint marks solutions with fewer than two active models infeasible.

    Parameters
    ----------
    X, y
        Features and labels used to fit and score the ensembles.
    test_size : float or None
        Forwarded to ``train_test_split``. When None, ensembles are fitted
        and scored on the same data.
    **kwargs
        Forwarded to ``Problem``.
    """

    def __init__(self, X, y, test_size, **kwargs):
        self.bounds = [
            # NB
            Binary(),
            Binary(),
            Binary(),
            # KNN
            Integer(bounds=[1, 17]),
            Integer(bounds=[1, 17]),
            Integer(bounds=[1, 17]),
            # SVM
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            Integer(bounds=[1, 51]),
            # DT
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
            Real(bounds=[0, 0.3]),
        ]
        vars = {
            "MEGVar": MEGVariable(bounds=self.bounds),
        }
        self.X = X
        self.y = y
        self.test_size = test_size

        # Prototype estimator per family and the name of the parameter that
        # the solution's 'parameters' value is written to.
        self.default_models = {
            'DT': {
                'default': DecisionTreeClassifier(),
                'param': 'ccp_alpha'
            },
            # The NB parameter is never applied: build_ensemble leaves NB
            # models at their defaults.
            'NB': {
                'default': GaussianNB(),
                'param': 'kernel'
            },
            # NOTE(review): cache_size only sizes SVC's kernel cache; a model
            # hyper-parameter was presumably intended here - confirm.
            'SVM': {
                'default': SVC(probability=True),
                'param': 'cache_size'
            },
            'KNN': {
                'default': KNeighborsClassifier(n_jobs=-1),
                'param': 'n_neighbors'
            }
        }
        super().__init__(vars=vars, n_obj=2, n_ieq_constr=1, **kwargs)

    def _evaluate(self, x, out, *args, **kwargs):
        """Score every solution in ``x`` and store the stacked F and G arrays in ``out``."""
        # The context manager shuts the worker threads down once map() has
        # returned; otherwise every call would leave a pool behind.
        with ThreadPool(4) as pool:
            output = pool.map(self._evaluate_one_sol, x)
        out['F'] = np.array([o['F'] for o in output])
        out['G'] = np.array([o['G'] for o in output])

    def _evaluate_one_sol(self, x):
        """Return ``{'F': [1 - mcc, 1 - diversity], 'G': [g]}`` for one solution.

        Solutions with fewer than two active models get the penalty values
        written by ``build_models_from_solution``.
        """
        out = {'F': [], 'G': []}
        ready_models = self.build_models_from_solution(sol=x, out=out)
        if ready_models is None:
            print('oups None!!')
            return out

        if self.test_size is None:
            # No hold-out requested: fit and score on one private copy.
            X_train = X_val = copy.deepcopy(self.X)
            y_train = y_val = copy.deepcopy(self.y)
        else:
            X_train, X_val, y_train, y_val = train_test_split(self.X, self.y, test_size=self.test_size)

        ready_models.fit(X_train, y_train)

        mcc_score = matthews_corrcoef(y_val, ready_models.predict(X_val))
        models_predictions = self.compute_models_predictions(ready_models, X_val)
        diversity = self.compute_deversity_metric(models_predictions, np.array(y_val))
        out['F'].append(1 - mcc_score)
        out['F'].append(1 - diversity)
        out['G'].append(-1)

        return out

    def build_models_from_solution(self, sol, out):
        """Build the unfitted ensemble described by ``sol``.

        Returns None when fewer than two models are active; in that case the
        penalty objectives (2, 1) and constraint value 1 are appended to
        ``out``.
        """
        genome = sol["MEGVar"]
        models = []
        for index, bit in enumerate(genome['ensemble']):
            # Equality (not truthiness) is kept so only 1/True activates a model.
            if bit == True:
                if index < 3:
                    family, offset = 'NB', 0
                elif index < 6:
                    family, offset = 'KNN', 3
                elif index < 10:
                    family, offset = 'SVM', 6
                else:
                    family, offset = 'DT', 10
                models.append((family, family + '-' + str(index - offset),
                               self.default_models[family], genome['parameters'][index]))

        if len(models) <= 1:
            out["F"].append(2)
            out["F"].append(1)
            out["G"].append(1)
            return None
        return self.build_ensemble(models, genome['voting_strategy'])

    def build_ensemble(self, models, voting_strategy):
        """Assemble an unfitted ensemble from ``(family, name, data, value)`` tuples.

        ``'stacking'`` gives a ``StackingClassifier`` whose final estimator is
        the first base model; ``'average_voting'`` gives soft voting;
        ``'weighted_average_voting'`` gives hard voting with the first model
        entered twice (under a ``-weighted`` name); any other value gives
        plain hard voting.
        """
        ready_models = []
        for model_name, full_model_name, model_data, model_param_value in models:
            estimator = copy.deepcopy(model_data['default'])
            # NB models stay at their defaults whatever the parameter value is.
            if model_name != 'NB':
                estimator.set_params(**{model_data['param']: model_param_value})
            ready_models.append((full_model_name, estimator))

        if voting_strategy == 'stacking':
            return StackingClassifier(estimators=ready_models, final_estimator=ready_models[0][1], n_jobs=-1)
        if voting_strategy == 'average_voting':
            return VotingClassifier(estimators=ready_models, voting='soft', flatten_transform=False, n_jobs=-1)

        estimators = ready_models
        if voting_strategy == 'weighted_average_voting':
            # The duplicate gets its own name because estimator names must be unique.
            first_name, first_model = ready_models[0]
            estimators = [(first_name + "-weighted", copy.deepcopy(first_model))] + ready_models
        return VotingClassifier(estimators=estimators, voting='hard', flatten_transform=False, n_jobs=-1)

    def compute_deversity_metric(self, model_predictions, y):
        """Return the mean pairwise diversity over all pairs of rows of ``model_predictions``."""
        n_models = model_predictions.shape[0]
        diversities = []
        for model_1_index in range(n_models):
            for model_2_index in range(model_1_index + 1, n_models):
                diversities.append(self.compute_diversity(
                    model_predictions[model_1_index, :],
                    model_predictions[model_2_index, :],
                    y))
        return np.mean(diversities)

    def compute_diversity(self, classifier_1_predictions, classifier_2_predictions, y):
        """Return the fraction of instances that exactly one of the two classifiers gets right."""
        correct_1 = np.equal(classifier_1_predictions, y)
        correct_2 = np.equal(classifier_2_predictions, y)
        # N_1_0 + N_0_1: instances where the two correctness flags differ.
        disagreements = int(np.count_nonzero(correct_1 != correct_2))
        # N_1_0 + N_0_1 + N_1_1 + N_0_0: every instance falls in one cell.
        return disagreements / int(correct_1.size)

    def compute_models_predictions(self, ensemble, X):
        """Return the member predictions as an array of shape (n_models, n_samples).

        For a ``StackingClassifier`` the members are queried directly, because
        its ``transform`` yields the meta-features fed to the final estimator
        rather than class labels. Members of both ensemble types are trained
        on label-encoded targets, so their outputs are mapped back through
        ``ensemble.classes_`` before being compared with the true labels.
        """
        if isinstance(ensemble, StackingClassifier):
            encoded = np.array([member.predict(X) for member in ensemble.estimators_])
        else:
            model_predicion = ensemble.transform(X)
            if len(model_predicion.shape) == 2:
                # hard voting: (n_samples, n_models) label matrix
                encoded = model_predicion.T
            else:
                # soft voting: (n_models, n_samples, n_classes) probabilities
                encoded = np.argmax(model_predicion, axis=-1)
        return np.asarray(ensemble.classes_)[np.asarray(encoded).astype(int)]

class MixedVariableSampling(Sampling):
    """Sampling that returns one ``{variable name: value}`` dict per sample."""

    def _do(self, problem, n_samples, **kwargs):
        # draw all samples of each variable at once ...
        columns = {name: var.sample(n_samples) for name, var in problem.vars.items()}

        # ... then regroup them sample by sample
        return [
            {name: columns[name][row] for name in problem.vars.keys()}
            for row in range(n_samples)
        ]




In [13]:
#main
# For every run and every "train" CSV in DATA_PATH: evolve ensembles with
# NSGA-II, pick the returned solution with the best validation MCC, score it
# on the matching "test" CSV and append the scores to the results file.
idx = 0
all_results = []
train_size = 0.8
n_runs = 10
# NOTE(review): runs start at 5, so only run ids 5..n_runs-1 execute -
# presumably resuming an earlier batch; confirm.
# NOTE(review): minimize() is seeded, but the custom operators draw from the
# unseeded `random` module, so runs are not reproducible - confirm intent.
for i_run in range(5, n_runs):
    # sorted() makes the processing order independent of the directory listing order.
    for train_file_name in sorted(os.listdir(DATA_PATH)):
        if '.csv' not in train_file_name or 'train' not in train_file_name:
            continue

        idx += 1
        print(train_file_name)
        train_data_df = pd.read_csv(os.path.join(DATA_PATH, train_file_name))
        test_data_df = pd.read_csv(os.path.join(DATA_PATH, train_file_name.replace('train', 'test')))

        X_train, y_train = train_data_df[FEATURES], train_data_df[TARGET]
        X_test, y_test = test_data_df[FEATURES], test_data_df[TARGET]
        # Without a split the validation set is the training set itself.
        X_val, y_val = train_data_df[FEATURES], train_data_df[TARGET]
        if train_size is not None:
            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=train_size)
        print('train:', len(X_train))
        print('test:', len(X_test))
        print(y_train)

        MOLR_model = MEGLearner(X_train, y_train, test_size=None)
        algorithm = NSGA2(pop_size=500, sampling=MixedVariableSampling(),
                    mating=MixedVariableMating(
                            eliminate_duplicates=MixedVariableDuplicateElimination(),
                            crossover = {MEGVariable:MEGCrossover(prob=1.0)},
                            mutation = {MEGVariable:MEGMutation(prob=1.0)}
            ),
            eliminate_duplicates=MixedVariableDuplicateElimination())
        res = minimize(MOLR_model,
                    algorithm,
                    ('n_gen', 200),
                    seed=1,
                    verbose=True)

        sols, objes = res.X, res.F
        if sols is None:
            # No solution came back; skip this file instead of crashing the batch.
            print('no feasible solution found for', train_file_name)
            continue

        ready_ensembles = []
        mccs = []
        for sol in np.atleast_1d(sols):
            ensemble = MOLR_model.build_models_from_solution(sol=sol, out={
                "F": [],
                "G": []
            })
            if ensemble is None:
                # fewer than two active models: nothing to fit
                continue
            ensemble.fit(X_train, y_train)
            ready_ensembles.append(ensemble)
            mccs.append(matthews_corrcoef(y_val, ensemble.predict(X_val)))

        if not ready_ensembles:
            print('no usable ensemble found for', train_file_name)
            continue

        # Use the exact fitted model that earned the best validation score;
        # fitting it again could yield a different model than the one selected.
        best_ensemble_idx = int(np.argmax(np.array(mccs)))
        best_ensemble = ready_ensembles[best_ensemble_idx]
        y_test_pred = best_ensemble.predict(X_test)

        # Each test metric is computed once and reused for printing and storing.
        test_f1 = f1_score(y_test, y_test_pred)
        test_g = geometric_mean_score(y_test, y_test_pred)
        test_mcc = matthews_corrcoef(y_test, y_test_pred)
        print('f1:', test_f1)
        print('G:', test_g)
        print('MCC:', test_mcc)
        new_row = {
            'algorithm' : 'MEG',
            'file_id' : train_file_name,
            'model_id': 'best_performance_model',
            'run_id': i_run,
            'f1' : test_f1,
            'G' : test_g,
            'MCC': test_mcc,
            "project_name": train_file_name.split('_')[0]
        }
        all_results.append(new_row)
        # Rewrite the results file after every dataset so an interrupted run keeps its progress.
        pd.DataFrame(all_results).to_csv(f'{DATASET_NAME}_results.csv',index=False)
