In [22]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import queue
import math
from enum import Enum
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [78]:
%run D:\Projects\MCTS\src\BuildInMetrics.py
%run D:\Projects\MCTS\src\DefaultSettings.py
%run D:\Projects\MCTS\src\EndStrategies.py
%run D:\Projects\MCTS\src\GlobalScores.py
%run D:\Projects\MCTS\src\MultiArmStrategies.py
%run D:\Projects\MCTS\src\Node.py
%run D:\Projects\MCTS\src\ScoringFunctions.py
%run D:\Projects\MCTS\src\Preprocessing.py
%run D:\Projects\MCTS\src\CV.py
%run D:\Projects\MCTS\utils\EvaluationUtils.py

In [39]:
data_path = 'D:\\Projects\\MCTS\\datasets\\'
file_names = [#{'name':'blood-transfusion-service-center.csv', 'target_class': 'Class', 'pos_class': 'numerical'},
             {'name':'credit-g.csv', 'target_class': 'class', 'pos_class': 'good'},
             {'name':'kr-vs-kp.csv', 'target_class': 'class', 'pos_class': '\'won\''},
             {'name':'monks-problems-2.csv', 'target_class': 'class', 'pos_class': 'numeric'},
             {'name':'diabetes.csv', 'target_class': 'class', 'pos_class': 'tested_positive'},
             {'name':'qsar-biodeg.csv', 'target_class': 'Class', 'pos_class': 'numeric'},
             {'name':'steel-plates-fault.csv', 'target_class': 'Class', 'pos_class': 'numeric'},
             {'name':'tic-tac-toe.csv', 'target_class': 'Class', 'pos_class': 'positive'},
             {'name':'wdbc.csv', 'target_class': 'Class', 'pos_class': 'numeric'},
             {'name':'hill-valley.csv', 'target_class': 'Class', 'pos_class': 'numeric'},
             {'name':'pc1.csv', 'target_class': 'defects', 'pos_class': True},
             {'name':'spambase.csv', 'target_class': 'class', 'pos_class': 'numeric'},
             {'name':'artificial.csv', 'target_class': 'class', 'pos_class': 'numeric'}]

## MCTS

In [72]:
class MCTS:
    """Class for MCTS"""
    def __init__(self, 
                 model,
                 task = 'classification',
                 calculactions_done_conditions = {'type': 'iterations', 'max_val': 10},
                 params = None,
                 metric = 'acc', 
                 scoring_function = 'g_rave', 
                 multiarm_strategy = 'default', 
                 end_strategy = 'default'):
        
        self._metric_name = metric
        self._scoring_function_name = scoring_function
        self._multiarm_strategy_name = multiarm_strategy
        self._end_strategy_name = end_strategy
        self._best_features = None
        self._best_score = 0
        self._task = task
        self._root = Node("")
        self._feature_names = None
        self._calculactions_done_conditions = calculactions_done_conditions
        self._model = model
        self._time = 0
        self._iterations = 0
        
        if(params is None):
            self._params = DefaultSettings.get_default_params()
        
    def fit(self, data, out_variable, preprocess = True, pos_class = 'numeric'):
        data = data.reset_index(drop=True)
        out_variable = out_variable.reset_index(drop=True)
        
        if preprocess:
            out_variable = self._preprocess_labels(out_variable, pos_class)
        
        if self._task == 'classification':
            self._classification_fit(data, out_variable)
        else:
            self._regression_fit(data, out_variable)
            
    def _classification_fit(self, data, out_variable):
        self._init_fitting_values(data)
        while not self._is_fitting_over():
            self._single_classification_iteration(data, out_variable)
        
        self._model.fit(data.loc[:, self._best_features], out_variable)
    
    def _regression_fit(self, data, out_variable):
        return None
    
    def _single_classification_iteration(self, data, out_variable):
        #print('classification iteration')
        used_features = set()
        node = self._root
        is_iteration_over = False
        while not is_iteration_over:
            node = self._multiarm_strategy.multiarm_strategy(node, used_features, self._scoring_function.get_score, self._global_scores.scores)
            #print("selected feature: " + node.feature_name)
            is_iteration_over = self._end_strategy.are_calculations_over(node, self._params)
            used_features.add(node.feature_name)
            #print("--------")
        
        score = CV.cv(self._metric, self._metric_name, self._model, data, out_variable, self._params['cv'])
        #score = self._cv_score(data.loc[:,used_features], out_variable)
        #print('score: ' + str(score))
        node.update_scores_up(score, self._global_scores)
        #print(used_features)
        if(score > self._best_score):
            self._best_score = score
            self._best_features = used_features
        
        if(self._longest_tree_branch < len(used_features)):
            self._longest_tree_branch = len(used_features)
        
        #print("----------END OF ITERATION----------")
    
    def _single_regression_iteration(self):
        return None
    
    def _init_fitting_values(self, data):
        self._feature_names = set(data.columns)
        self._multiarm_strategy = MultiArmStrategies(self._multiarm_strategy_name, self._feature_names)
        self._end_strategy = EndStrategies(self._end_strategy_name)
        self._scoring_function = ScoringFunctions(self._scoring_function_name, self._params)
        self._metric = BuildInMetrics().get_metric(self._metric_name)
        self._best_features = None
        self._best_score = 0
        self._iterations = 0
        self._longest_tree_branch = 0
        self._time = time.time()
        self._global_scores = GlobalScores()
    
    def _is_fitting_over(self):
        if(self._root._is_subtree_full):
            print('Whole tree searched, finishing prematurely')
            return True
        
        if(self._calculactions_done_conditions['type'] == 'iterations'):
            self._iterations += 1
            return self._iterations > self._calculactions_done_conditions['max_val'] 
        else:
            #print('Time ellapsed: ' + str(time.time() - self._time))
            return (time.time() - self._time) > self._calculactions_done_conditions['max_val'] 
    
    def _preprocess_labels(self, labels, pos_class):
        if pos_class == 'numeric':
            pos_class = 1
            
        return Preprocessing.relabel_data(labels, pos_class)
    
    def predict(self, data):
        return self._model.predict(data.loc[:, self._best_features])
    
    def predict_proba(self, data):
        return self._model.predict_proba(data.loc[:, self._best_features])
    
    def get_features_importances(self):
        return self._global_scores.scores['g_rave']
    
    def one_hot_encode(self, data):
        return Preprocessing.one_hot_encode(data)

In [76]:
EvaluationUtils.eval(LogisticRegression(solver='liblinear'), 
                     file_names, 
                     data_path, 
                     'D:\\Projects\\MCTS\\',
                     'scores', 
                     10, 
                     'roc_auc')

Using credit-g.csv
Using kr-vs-kp.csv
Using monks-problems-2.csv
Using diabetes.csv
Using qsar-biodeg.csv
Using steel-plates-fault.csv
Using tic-tac-toe.csv
Using wdbc.csv
Using hill-valley.csv








Using pc1.csv












Using spambase.csv




Using artificial.csv


Unnamed: 0,name,model_roc_auc,model_acc,MCTS_roc_auc,MCTS_acc
0,credit-g.csv,0.7845523809523809,0.7469999999999999,0.5622096399535425,0.676
1,kr-vs-kp.csv,0.993950751002084,0.9637002952076688,0.5159473551953763,0.5256570713391739
2,monks-problems-2.csv,0.5376497141203023,0.6572440997377662,0.5211782826338083,0.6490066225165563
3,diabetes.csv,0.8258507462686567,0.76953125,0.8512720156555773,0.8177083333333334
4,qsar-biodeg.csv,0.9266674969096512,0.8654186830279986,0.7955429347490586,0.678030303030303
5,steel-plates-fault.csv,0.6580308088740507,0.6522411013533579,0.6252354048964219,0.6358024691358025
6,tic-tac-toe.csv,0.9906214765972268,0.9822611576011158,0.6424940191387559,0.6541666666666667
7,wdbc.csv,0.9917061220644006,0.9472939032798188,0.9879317519766958,0.9370629370629372
8,hill-valley.csv,0.9725973854683136,0.962873605785988,0.5458293342640258,0.5115511551155115
9,pc1.csv,0.658876478988168,0.9287620445160116,0.7040793328825783,0.9388489208633094


In [68]:
scores = pd.DataFrame(columns=['name','LR_roc_auc','LR_acc','MCTS_roc_auc','MCTS_acc'])

for file_info in file_names:
    print(file_info['name'])
    mcts = MCTS(model, calculactions_done_conditions = {'type': 'time', 'max_val': 10}, metric='roc_auc')
    data = pd.read_csv(data_path + file_info['name'])
    labels = mcts._preprocess_labels(data.loc[:,file_info['target_class']], file_info['pos_class'])
    data = data.drop(columns = [file_info['target_class']])
    data = Preprocessing.one_hot_encode(data)
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.25, random_state = 123)
    model = LogisticRegression(solver='liblinear')

    # model with whole dataset
    scores = scores.append({
        'name': file_info['name'],
        'LR_roc_auc': str(CV.cv(roc_auc_score, 'roc_auc', model, data, labels, 4)),
        'LR_acc': str(CV.cv(accuracy_score, 'acc', model, data, labels, 4))
    }, ignore_index = True)
    
    #MCTS
    #continue
    mcts.fit(X_train, y_train, preprocess = False)
    predicted = mcts.predict(X_test)
    predicted_proba = mcts.predict_proba(X_test)[:,1]
    scores.loc[scores.loc[:,'name'] == file_info['name'], 'MCTS_roc_auc'] = str(roc_auc_score(y_test, predicted_proba))
    scores.loc[scores.loc[:,'name'] == file_info['name'], 'MCTS_acc'] = str(accuracy_score(y_test, predicted))

credit-g.csv
kr-vs-kp.csv
monks-problems-2.csv
diabetes.csv
qsar-biodeg.csv
steel-plates-fault.csv
tic-tac-toe.csv
wdbc.csv
hill-valley.csv










pc1.csv












spambase.csv




artificial.csv


In [69]:
scores

Unnamed: 0,name,LR_roc_auc,LR_acc,MCTS_roc_auc,MCTS_acc
0,credit-g.csv,0.7926285714285715,0.744,0.5548054587688733,0.672
1,kr-vs-kp.csv,0.9938589030442664,0.9633893606434736,0.5034426435481845,0.5256570713391739
2,monks-problems-2.csv,0.487119323253777,0.6555884705986933,0.5435117443203696,0.6490066225165563
3,diabetes.csv,0.8233731343283581,0.7617187499999999,0.8410268216875791,0.78125
4,qsar-biodeg.csv,0.9257375325178504,0.8653826765756423,0.8601765541082782,0.7575757575757576
5,steel-plates-fault.csv,0.6613684054156634,0.6635791862882355,0.9487868648638764,0.8621399176954733
6,tic-tac-toe.csv,0.9913143509855198,0.9812194909344492,0.6789398923444976,0.6708333333333333
7,wdbc.csv,0.9901838503757096,0.952514035260514,0.9284228048272992,0.8531468531468531
8,hill-valley.csv,0.975271376985124,0.9661412948762634,0.5484905331122938,0.504950495049505
9,pc1.csv,0.6762698898408813,0.9287685375165572,0.6673427991886409,0.9388489208633094
