In [7]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import queue
import math
from enum import Enum
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [12]:
%run D:\Projects\MCTS\src\BuildInMetrics.py
%run D:\Projects\MCTS\src\DefaultSettings.py

In [11]:
a = BuildInMetrics()

In [12]:
data_path = 'D:\\Projects\\MCTS\\datasets\\'
file_names = [#{'name':'blood-transfusion-service-center.csv', 'target_class': 'Class', 'pos_class': 'numerical'},
             {'name':'credit-g.csv', 'target_class': 'class', 'pos_class': 'good'},
             {'name':'kr-vs-kp.csv', 'target_class': 'class', 'pos_class': '\'won\''},
             {'name':'monks-problems-2.csv', 'target_class': 'class', 'pos_class': 'numerical'},
             {'name':'diabetes.csv', 'target_class': 'class', 'pos_class': 'tested_positive'},
             {'name':'qsar-biodeg.csv', 'target_class': 'Class', 'pos_class': 'numerical'},
             {'name':'steel-plates-fault.csv', 'target_class': 'Class', 'pos_class': 'numerical'},
             {'name':'tic-tac-toe.csv', 'target_class': 'Class', 'pos_class': 'positive'},
             {'name':'wdbc.csv', 'target_class': 'Class', 'pos_class': 'numerical'},
             {'name':'hill-valley.csv', 'target_class': 'Class', 'pos_class': 'numerical'},
             {'name':'pc1.csv', 'target_class': 'defects', 'pos_class': 'true'},
             {'name':'spambase.csv', 'target_class': 'class', 'pos_class': 'numerical'},
             {'name':'artificial.csv', 'target_class': 'class', 'pos_class': 'numerical'}]

## Method for preprocessing data

In [1]:
def get_data(file_info, pos_label = None):
    """Method for getting transformed data"""
    data = pd.read_csv(data_path + file_info['name'])  
    
    if pos_label is None:
        pos_label = max(data.loc[:,file_info['target_class']])

    data.loc[data.loc[:,file_info['target_class']] != pos_label, file_info['target_class']] = 0
    data.loc[data.loc[:,file_info['target_class']] == pos_label, file_info['target_class']] = 1
    
    labels = data.loc[:,file_info['target_class']]
    data = data.drop(columns = [file_info['target_class']])
    
    return (data, labels)

## Metrics class

In [22]:
class BuildInMetrics:
    """Class used for getting classic metrics (like accuracy, f1, etc.) to evaluate model's performance."""
    def __init__(self):
        """Initializes the metrics dictionary."""
        
        self._metrics = {
            'acc': accuracy_score,
            'f1': f1_score,
            'roc_auc': roc_auc_score
        }
        
    def get_metric(self, name):
        """
        Method for getting metric with selected name. If the proper name won't be provided the exception will be thrown.
        
        Parameters
        ----------
        name: str
            name of the metric
        """
        if name in self._metrics:
            return self._metrics[name]
        else:
            raise Exception('Error initializing MCTS object, \"' + name + '\" is not supported metric, available values are: ' + ', '.join(self._metrics.keys()))

## Scoring function

In [25]:
class ScoringFunctions():
    """Class used for getting scoring functions that are used to evaluate the node of the MCTS tree."""
    
    def __init__(self, name, params):
        """
        Parameters
        ----------
        name: str
            Name of the scoring function that will be used
        params: dict
            Dictionary containing parameters of the scoring functions, please see documentation for more info
        """
        self._params = params
        self._name = name
    
    def set_scoring_function(self, name, params = None):
        """
        Method for setting scoring function that will be returned from class instance.
        
        Parameters
        ----------
        name: str
            Name of the scoring function that will be used
        params: dict (default: None)
            Dictionary containing parameters of the scoring functions, please see documentation for more info
        """
        
        self._name = name
        if(params is not None):
            self._params = params 
    
    def get_score(self, node, scores):
        """
        Method for getting score for selected node. If the name set with init or set_scoring_function is not supported, 
        the exception will be thrown.
        
        Paramteres
        ----------
        node: Node
            Node which the score will be calculated for
        scores: dict
            Dictionary containing global scores values used for some of the scores, please see documentation for more info
        """
        if(self._name == 'default'):
            return self._default_scoring(node)
        elif(self._name == 'with_variance'):
            return self._var_scoring(node)
        elif(self._name == 'g_rave'):
            return self._g_rave_scoring(node, scores)
        else:
            raise Exception('Error initializing MCTS object, \"' + self._name + '\" is not supported.')
    
    def _default_scoring(self, node):
        if(node._parent_node == None or node.T == 0):
            return float("Inf")
        else:
            return node.get_score() + math.sqrt(self._params['c_e'] * math.log(node._parent_node.T)/node.T)
        
    def _var_scoring(self, node):
        if(node._parent_node == None or node.T == 0):
            return float("Inf")
        else:
            return node.get_score() + math.sqrt(self._params['c_e'] * math.log(node._parent_node.T)/node.T)
    
    def _g_rave_scoring(self, node, scores):
        g_scores = scores['g_rave']
        
        if(node._parent_node == None or node.T == 0):
            if(node.feature_name not in g_scores):
                return float("Inf")
            else:
                return g_scores[node.feature_name]['score']
        else:
            c = self._params['c']
            c_l = self._params['c_l']
            alpha = c/(c + node.T)
            beta = c_l/(c_l + node.T)
            
            return ((1 - alpha) * node.get_score() + 
                alpha * g_scores[node.feature_name]['score'] + 
                math.sqrt(self._params['c_e'] * math.log(node._parent_node.T)/node.T) *
                min(0.25, node.get_variance() + math.sqrt(2 * math.log(node._parent_node.T)/node.T)))

## Global scores

In [29]:
class GlobalScores:
    """Class used for getting global scores like g_rave."""
    
    def __init__(self):
        self.scores = {'g_rave': {}}
    
    def update_g_rave_score(self, name, score):
        """
        Method for updataing g_rave score. For more information about g_rave score please see documentation.
        Parameters
        ----------
        name: str
            Name of the feature
        score: numeric
            Value of the score
        """
        if(name not in self.scores['g_rave']):
            self.scores['g_rave'][name] = {'n': 1, 'score': score}
        else:
            n = self.scores['g_rave'][name]['n']
            t_score = (self.scores['g_rave'][name]['score'] * n + score)/(n + 1)
            self.scores['g_rave'][name] = {'n': n + 1, 'score': t_score}

## Node class

In [33]:
class Node:
    """Class representing node of MCTS tree."""
    def __init__(self, feature_name, parent_node = None, is_subtree_full = False):
        """
        Parameters
        ----------
        feature_name: str
            Name of the feature that this node will represent
        parent_node: Node (default: None)
            Reference to parent node of new node, if None that means that it's root
        is_subtree_full: boolean (default: False)
            Value indicating whether the node's subtree is fully searched, if so then this node won't be searched in the
            future. If parameter set to True then it's last node in the path from the root (all features used in this path)
        """
        
        self.feature_name = feature_name
        self.child_nodes = []
        self.T = 0
        self.score_sum = 0
        self._is_subtree_full = is_subtree_full
        self._parent_node = parent_node
        self._scores = []
        
    def add_child_node(self, node_name, is_subtree_full = False):
        """
        Method for adding child node to children of current node.
        Parameters
        ----------
        node_name: str
            Name (feature name) of the node that will be added
        is_subtree_full: boolean (default: False)
            Value indicating whether the node will be last feature not used in path from root to current node
        """
        
        new_node = Node(node_name, self, is_subtree_full)
        self.child_nodes.append(new_node)
        
    def add_child_nodes(self, node_names):
        """
        Method for adding child nodes to children of current node.
        Parameters
        ----------
        node_names: list
            List of names (feature names) of nodes that will be added to children of current node
        """
        
        for name in node_names:
            self.add_child_node(name, len(node_names) == 1)
      
    def update_node(self, score, scores):
        """
        Method for updating scores of current node.
        Parameters
        ----------
        score: numeric
            Value of the score that will added to score of the current node
        scores: GlobalScores
            Instance of GlobalScores
        """
        
        self.score_sum += score
        self.T += 1
        self._scores.append(score)
        scores.update_g_rave_score(self.feature_name, score)
        if(len(self.child_nodes) != 0):
            is_subtree_full = True
            for node in self.child_nodes:
                if(not node._is_subtree_full):
                    is_subtree_full = False
                    break
            self._is_subtree_full = is_subtree_full
    
    def get_score(self):
        """Method for getting score for current node, if node not visited then float(\'Inf\') is returned, 
        otherwise numeric is returned"""
        
        return float('Inf') if self.T == 0 else self.score_sum/self.T
    
    def get_variance(self):
        """Method for getting variance of scores of current node."""
        
        return np.var(self._scores)
    
    def update_scores_up(self, score, scores):
        """
        Method for updating scores of current node and updating scores of all nodes up to the root
        Parameters
        ----------
        score: numeric
            Value of the score that will added to score of the current node
        scores: GlobalScores
            Instance of GlobalScores
        """
        
        current_node = self
        
        while(current_node != None):
            current_node.update_node(score, scores)
            current_node = current_node._parent_node  

## Multiarm strategies

In [34]:
class MultiArmStrategies:
    """Class strategies used for selecting next node to visit in MCTS. 
    For more info please see documentation"""
    
    def __init__(self, name, all_node_names):
        """
        Parameters
        ----------
        name: str
            Name of the strategy that will be used
        all_node_names: list
            List containing all possible names of the features that MCTS will search through
        """
        
        self._name = name
        self._all_node_names = all_node_names
        
    def multiarm_strategy(self, node, used_features, scoring_function, other_scores):
        """
        Method for getting next node in current search.
        Parameters
        ----------
        node: Node
            Current node
        used_features: list
            List of feature names that are already used in search path
        scoring_function: ScoringFunctions
            Scoring function that is used in current search instance
        other_scores: dict
            Dictionary containing scores used in strategy
        """
        
        if(self._name == 'default'):
            return self._default_strategy(node, used_features, scoring_function, other_scores)
        else:
            raise Exception("Error getting multiarm strategy, strategy \'" + self._name + "\' is not supported.")   
    
    def _default_strategy(self, node, used_features, scoring_function, other_scores):
        #print("default strategy: " + node.feature_name)
        if(len(node.child_nodes) == 0):
            #print("first if")
            self._add_child_nodes(node, used_features)
            return node.child_nodes[0]
        else:
            #print("else")
            best_score = 0
            best_node = None
            tmp_score = 0
            
            for child_node in node.child_nodes:
                if(not child_node._is_subtree_full):
                    score = scoring_function(child_node, other_scores)
                    if(score > best_score):
                        best_score = score
                        best_node = child_node
            return best_node
  
    def _add_child_nodes(self, node, used_features):
        #print('adding nodes to ' + node.feature_name + ' :' + ' '.join(self._all_node_names - used_features))
        node.add_child_nodes(self._all_node_names - used_features)   

## EndStrategies

In [35]:
class EndStrategies:
    """Class for strategies of ending search."""
    
    def __init__(self, name):
        """
        Parameters
        ----------
        name: str
            Name of the end strategy
        """
        
        self._name = name
    
    def are_calculations_over(self, node, params):
        """
        Method for getting information whether the current iteration of search is over. 
        If the name set in the init is not correct then the exception is thrown.
        Parameters
        ----------
        node: Node
            Current node
        params: dict
            Dictionary containing parameters of MCTS algorithm
        """
        
        if(self._name == 'default'):
            return self._first_new_strategy(node, params)
        else:
            raise Exception("Error getting end strategy, end strategy \'" + self._name + "\'")
        
    def _first_new_strategy(self, node, params):
        #print("_first_new_strategy:")
        if(node.T > 0 and not node._is_subtree_full):
            #print("first if")
            return False
        else:
            if(node._parent_node == None):
                #print("second if, no parent node")
                return False
            else:
                #print("else")
                return True


## DefaultSettings

In [36]:
class DefaultSettings:
    """Class containing default settings for MCTS."""
    
    @staticmethod
    def get_default_params():
        """Method for getting default settings."""
        
        return {
            "c_e": 2,
            "c": 1,
            "c_l": 1
        }
    

## MCTS

In [11]:
class MCTS:
    def __init__(self, 
                 model,
                 task = 'classification',
                 calculactions_done_conditions = {'type': 'iterations', 'max_val': 10},
                 params = None,
                 metric = 'acc', 
                 scoring_function = 'g_rave', 
                 multiarm_strategy = 'default', 
                 end_strategy = 'default'):
        self._metric_name = metric
        self._scoring_function_name = scoring_function
        self._multiarm_strategy_name = multiarm_strategy
        self._end_strategy_name = end_strategy
        self._best_features = None
        self._best_score = 0
        self._task = task
        self._root = Node("")
        self._feature_names = None
        self._calculactions_done_conditions = calculactions_done_conditions
        self._model = model
        self._time = 0
        self._iterations = 0
        
        if(params is None):
            self._params = DefaultSettings.get_default_params()
        
    def fit(self, data, out_variable):
        data = data.reset_index(drop=True)
        out_variable = out_variable.reset_index(drop=True)
        if(self._task == 'classification'):
            self._classification_fit(data, out_variable)
        else:
            self._regression_fit(data, out_variable)
            
    def _classification_fit(self, data, out_variable):
        self._init_fitting_values(data)
        while not self._is_fitting_over():
            self._single_classification_iteration(data, out_variable)
        
        self._model.fit(data.loc[:, self._best_features], out_variable)
    
    def _regression_fit(self, data, out_variable):
        return None
    
    def _single_classification_iteration(self, data, out_variable):
        #print('classification iteration')
        used_features = set()
        node = self._root
        is_iteration_over = False
        while not is_iteration_over:
            node = self._multiarm_strategy.multiarm_strategy(node, used_features, self._scoring_function.get_score, self._global_scores.scores)
            #print("selected feature: " + node.feature_name)
            is_iteration_over = self._end_strategy.are_calculations_over(node, self._params)
            used_features.add(node.feature_name)
            #print("--------")
        
        score = self._cv_score(data.loc[:,used_features], out_variable)
        #print('score: ' + str(score))
        node.update_scores_up(score, self._global_scores)
        #print(used_features)
        if(score > self._best_score):
            self._best_score = score
            self._best_features = used_features
        
        if(self._longest_tree_branch < len(used_features)):
            self._longest_tree_branch = len(used_features)
        
        #print("----------END OF ITERATION----------")
    
    def _single_regression_iteration(self):
        return None
    
    def _init_fitting_values(self, data):
        self._feature_names = set(data.columns)
        self._multiarm_strategy = MultiArmStrategies(self._multiarm_strategy_name, self._feature_names)
        self._end_strategy = EndStrategies(self._end_strategy_name)
        self._scoring_function = ScoringFunctions(self._scoring_function_name, self._params)
        self._metric = BuildInMetrics().get_metric(self._metric_name)
        self._best_features = None
        self._best_score = 0
        self._iterations = 0
        self._longest_tree_branch = 0
        self._time = time.time()
        self._global_scores = GlobalScores()
    
    def _is_fitting_over(self):
        if(self._root._is_subtree_full):
            print('Whole tree searched, finishing prematurely')
            return True
        
        if(self._calculactions_done_conditions['type'] == 'iterations'):
            self._iterations += 1
            return self._iterations > self._calculactions_done_conditions['max_val'] 
        else:
            #print('Time ellapsed: ' + str(time.time() - self._time))
            return (time.time() - self._time) > self._calculactions_done_conditions['max_val'] 
        
    def _cv_score(self, data, labels):
        kf = KFold(n_splits = 5, random_state = 123, shuffle=True)
        score = 0
        for train, test in kf.split(data):
            self._model.fit(data.loc[train,:], labels[train])
            predicted = self._model.predict(data.loc[test,:])
            score += self._metric(labels[test], predicted)
        return score/5
    
    def predict(self, data):
        return self._model.predict(data.loc[:, self._best_features])
    
    def predict_proba(self, data):
        return self._model.predict_proba(data.loc[:, self._best_features])
    
    def get_features_importances(self):
        return self._global_scores.scores['g_rave']

In [20]:
data = pd.read_csv(data_path + file_names[7]['name'])
labels = data.loc[:,file_names[7]['target_class']]
data = data.drop(columns = [file_names[7]['target_class']])
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.25, random_state = 123)

In [44]:
a = get_data(file_names[7])

TypeError: list indices must be integers or slices, not Series

In [21]:
kf = KFold(n_splits = 5, random_state = 123, shuffle=True)
model = LogisticRegression(solver='liblinear')
auc_score = 0
acc_score = 0
f1_score_ = 0
for train, test in kf.split(data):
    model.fit(data.loc[train,:], labels[train])
    predicted = model.predict(data.loc[test,:])
    p_proba = model.predict_proba(data.loc[test,:])[:,1]
    auc_score += roc_auc_score(labels[test] == 1, p_proba)
    acc_score += accuracy_score(labels[test], predicted)
    f1_score_ += f1_score(labels[test], predicted, pos_label = 1)

print('auc: ' + str(auc_score/5))
print('acc: ' + str(acc_score/5))
print('f1: ' + str(f1_score_/5))

auc: 0.010293381766029912
acc: 0.9524607980127309
f1: 0.9626405561889433


In [15]:
model = LogisticRegression(solver='liblinear')
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.25, random_state = 123)
mcts = MCTS(model, calculactions_done_conditions = {'type': 'time', 'max_val': 3600}, metric='acc')
mcts.fit(X_train, y_train)

In [18]:
predicted = mcts.predict(X_test)
p_proba = mcts.predict_proba(X_test)
print(roc_auc_score(y_test == 1, p_proba[:,1]))
print(accuracy_score(y_test, predicted))
print(f1_score(y_test, predicted, pos_label = 1))

0.6513511782786885
0.618
0.6156941649899397


In [19]:
mcts._best_features

{'x476'}