## Import packages

In [3]:
import numpy as np 
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import os
import re
import glob
import datetime
from sklearn.grid_search import GridSearchCV
import pickle
import warnings
warnings.filterwarnings('ignore')
import itertools
import random

## Relevant files paths

In [4]:
# Root folders for the raw datasets and for the pickled experiment results.
# The rest of the notebook builds file names by plain string concatenation, so both
# values must end with a path separator (os.path.join(..., '') guarantees that).
# The defaults are the original local folders; set the UNITREE_DATASETS_PATH /
# UNITREE_PICKLES_PATH environment variables to relocate them without editing the code.
DATASETS_PATH=os.path.join(os.environ.get('UNITREE_DATASETS_PATH','/home/sagio/Unitree forest/datasets/'),'')
PICKLES_PATH=os.path.join(os.environ.get('UNITREE_PICKLES_PATH','/home/sagio/Unitree forest/pickles/'),'')

## Branch class
A branch represents a rules-set (a conjunction of conditions) that results in a class vector output. The output of the first stage of our method is a set of Branch instances.


**Important note:** Conditions are implemented as feature thresholds. Each threshold type (upper and lower) is stored as a vector v, where |v| equals the number of features; adding a condition updates the corresponding entry of that vector.

In [6]:
class Branch:
    """A single rule: per-feature lower/upper bounds plus a class-score vector.

    For every feature index in range(NUMBEROFFEATURES) the branch keeps an upper bound
    (features_upper, np.inf when unconstrained) and a lower bound (features_lower,
    -np.inf when unconstrained). `labels` maps a class index to its score.
    The module-level globals NUMBEROFFEATURES and NUMOFLABELS must be set before a
    Branch is created.
    """
    def __init__(self,labels_probas=None,number_of_samples=None):
        """Branch instance can be initialized in 2 ways. One option is to initialize an empty branch
        (only with a global number of features and number of class labels) and gradually add
        conditions - this option is relevant for the merge implementation.
        Second option is to get the number of samples in branch and the labels
        probability vector - relevant for creating a branch out of an existing tree leaf.
        """
        self.features_upper={k:np.inf for k in range(NUMBEROFFEATURES)} #upper bound of the feature for the given rule
        self.features_lower={k:-np.inf for k in range(NUMBEROFFEATURES)} #lower bound of the feature for the given rule
        #always define both attributes so that an empty branch can still be inspected/printed
        self.labels={}
        self.number_of_samples=number_of_samples #number of samples in leaf (not relevant for the current model)
        if labels_probas is not None:
            total=np.sum(labels_probas) #computed once instead of once per label
            self.labels={k:v/total for k,v in zip(range(NUMOFLABELS),labels_probas)} #normalized result vector
    def addCondition(self,feature,threshold,bound):
        """
        This function gets feature index, its threshold for the condition and whether
        it is upper or lower bound. It updates the features thresholds for the given rule.
        A bound is only ever tightened. Thresholds are rounded to 2 decimals; lower bounds
        get an extra 0.001 so that they sit just above the rounded threshold.
        Any `bound` value other than 'lower' is treated as an upper bound.
        """
        if bound=='lower':
            if self.features_lower[feature]<=threshold:
                self.features_lower[feature]=np.round(threshold,2)+0.001
        else:
            if self.features_upper[feature]>=threshold:
                self.features_upper[feature]=np.round(threshold,2)
    def contradictBranch(self,b):
        """
        Check whether Branch b contradicts the "self" Branch, i.e. whether for some feature the
        two intervals do not overlap. Returns True when the branches can NOT be merged.
        """
        for i in range(NUMBEROFFEATURES):
            if b.features_upper[i]<=self.features_lower[i] or b.features_lower[i]>=self.features_upper[i]:
                return True
        return False
    def mergeBranch(self,b):
        """
        This method gets Branch b and creates a new branch which is a merge of the "self" object
        with b, as described in the algorithm: the bounds are intersected, the label scores are
        summed per class and the number of samples is the geometric mean of both branches.
        """
        new_b=Branch()
        new_b.features_upper=dict(self.features_upper)
        new_b.features_lower=dict(self.features_lower)
        for feature in range(NUMBEROFFEATURES):
            new_b.addCondition(feature,b.features_upper[feature],'upper')
            new_b.addCondition(feature,b.features_lower[feature],'lower')
        #sum per class key - does not depend on the iteration order of the two dictionaries
        new_b.labels={k:v+b.labels[k] for k,v in self.labels.items()}
        new_b.number_of_samples=np.sqrt(self.number_of_samples*b.number_of_samples)
        return new_b
    def toString(self):
        """
        This function creates a string representation of the branch (only for demonstration purposes)
        """
        s=""
        for feature,threshold in self.features_lower.items():
            if threshold!=(-np.inf):
                s+='['+str(feature)+'] >'+str(threshold)+", "
        for feature,threshold in self.features_upper.items():
            if threshold!=np.inf:
                s+='['+str(feature)+'] <'+str(threshold)+", "
        s+=str(self.labels)
        return s
    def printBranch(self):
        #print the branch by using toString()
        print(self.toString())
    def getLabel(self):
        #Return the predicted label according to the branch: the position of the highest score
        #when the scores are ordered by class index. An explicit list is built because
        #np.argmax cannot consume a dict view.
        return np.argmax([self.labels[k] for k in sorted(self.labels)])
    def containsInstance(self,v):
        """This function gets an observation as an input. It returns True if the set of rules
        that is represented by the branch matches the instance and False otherwise.
        """
        for index,item in enumerate(v):
            if self.features_upper[index]<item or self.features_lower[index]>item:
                return False
        return True
    def containsCondition(self,tup):
        """
        This function gets a three items tuple with the following structure: (feature index, threshold,
        upper or lower bound). The tuple represents a rule. Function returns True if this rule is already
        contained in the branch and False otherwise.
        """
        feature,threshold,bound=tup[0],tup[1],tup[2]
        if bound=='upper':
            if self.features_upper[feature]<=threshold:
                return True
        else:
            if self.features_lower[feature]>=threshold:
                return True
        return False
    def contradictCondition(self,tup):
        """
        A helper function for contradictBranch. Gets a tuple (same as in containsCondition), returns
        True if this condition cannot be merged within the branch.
        """
        feature,threshold,bound=tup[0],tup[1],tup[2]
        if bound=='upper':
            if self.features_lower[feature]>=threshold:
                return True
        else:
            if self.features_upper[feature]<=threshold:
                return True
        return False

## Stage 1: functions for building the rules-sets
The following functions enable the execution of the first-stage algorithm.

In [22]:
def get_branch(x,node_id):
    """
    Build the Branch that corresponds to a leaf of an sklearn tree structure.
    x is the tree structure and node_id the index of the leaf. The branch starts from the
    leaf's value vector and sample count; then the path is walked upwards until the root
    (node 0), adding the split of every ancestor as an 'upper' condition when the path
    goes through its left child and as a 'lower' condition otherwise.
    """
    branch=Branch(x.value[node_id][0],x.n_node_samples[node_id])
    current=node_id
    while current!=0:
        is_left_child=current in x.children_left
        if is_left_child:
            children,bound=x.children_left,'upper'
        else:
            children,bound=x.children_right,'lower'
        parent=np.where(children==current)[0][0]
        branch.addCondition(x.feature[parent],x.threshold[parent],bound)
        current=parent
    return branch
def print_branches(branches):
    """Print every branch of the given collection (the rules-sets of the method description)."""
    for rule in branches:
        rule.printBranch()
#Matches the "[<feature index>] <sign>" fragments produced by Branch.toString, capturing the
#feature index as 'num' and the comparison sign as 'sign'. A raw string is used so that the
#regex escapes (\D, \d, \]) are not treated as (invalid) string escape sequences.
branch_exp=re.compile(r'\D+(?P<num>\d+)\] (?P<sign>\D)')
def fit_decision_tree_model(train_x,train_y):
    """
    Fit the baseline decision tree for the given train data.
    A 3-fold grid search over the split criterion, the maximal depth and the minimal number
    of samples per leaf selects the best configuration; the selected estimator is then
    fitted on the train data and returned.
    """
    search_space = {'criterion': ['entropy','gini'],
                    'max_depth': [10,20,50],
                    'min_samples_leaf': [1,2,5,10]}
    grid_search = GridSearchCV(DecisionTreeClassifier(), search_space, cv=3)
    grid_search.fit(train_x,train_y)
    best_tree=grid_search.best_estimator_
    best_tree.fit(train_x,train_y)
    return best_tree
def divide_to_train_test(X,y):
    """
    Divide X and y into train and test sets: the first 70% of the rows (in the given order)
    are the train set and the remaining rows are the test set. No shuffling is done here.
    X is converted to a numpy array (DataFrame.as_matrix is not available in current
    pandas); y is sliced as is.
    Returns train_x,train_y,test_x,test_y.
    """
    train_threshold=int(len(y)*0.7)
    features=np.asarray(X)
    train_x=features[:train_threshold]
    train_y=y[:train_threshold]
    test_x=features[train_threshold:]
    test_y=y[train_threshold:]
    return train_x,train_y,test_x,test_y
def create_output_dict(OUTPUT_PATH,train_x,train_y,test_x,test_y,ensemble_model,
                       decision_tree_model,new_tree_model,comparison_df):
    """
    This function gets the results of a single experiment. It stores the results in the
    module-level dictionary `output_dict` (which must already exist) and saves that
    dictionary as a pickle file at OUTPUT_PATH for serialization.
    Reads the globals NUMBEROFFEATURES and NUMOFLABELS. Returns None.
    """
    ensemble_trees=[x.tree_ for x in ensemble_model.estimators_]
    output_dict['train_X']=train_x
    output_dict['train_Y']=train_y
    output_dict['test_X']=test_x
    output_dict['test_Y']=test_y
    output_dict['ensemble_model']=ensemble_model
    output_dict['decision_tree_model']=decision_tree_model
    output_dict['ensemble_max_depth']=np.sum([t.max_depth for t in ensemble_trees])
    output_dict['new_tree_max_depth']=new_tree_model.tree_.max_depth
    output_dict['decision_tree_max_depth']=decision_tree_model.tree_.max_depth
    output_dict['number_of_features']=NUMBEROFFEATURES
    output_dict['number_of_labels']=NUMOFLABELS
    output_dict['number_of_instances']=len(train_y)+len(test_y)
    output_dict['ensemble_number_of_nodes']=np.sum([t.node_count for t in ensemble_trees])
    output_dict['new_tree_number_of_nodes']=new_tree_model.tree_.node_count
    output_dict['comparison_data_Set']=comparison_df
    #the context manager closes the file even if pickling fails
    with open(OUTPUT_PATH,'wb') as output_file:
        pickle.dump(output_dict,output_file)
def fit_ensemble_model(train_x,train_y,n_estimators,max_depth=3,min_leaf_samples=10):
    """
    Fit the ensemble (an entropy-based random forest) for the given train data.
    n_estimators is the number of trees, max_depth the maximal depth of every tree and
    min_leaf_samples the minimal number of samples in a leaf. Returns the fitted forest.
    """
    forest_settings=dict(n_estimators=n_estimators,
                         criterion='entropy',
                         max_depth=max_depth,
                         min_samples_leaf=min_leaf_samples)
    forest=RandomForestClassifier(**forest_settings)
    forest.fit(train_x,train_y)
    return forest
def _leaf_branches(x):
    #Create a branch object for every leaf of the sklearn tree structure x
    #(a leaf is a node whose left and right child are both -1)
    leafs_indexes=[i for i in range(x.node_count) if x.children_left[i]==-1 and x.children_right[i]==-1]
    return [get_branch(x,i) for i in leafs_indexes]
def build_rules_set(ensemble_model):
    """
    This is an implementation of the first stage of our method, here we create
    the rules-sets out of the given ensemble.
    The branches of the first tree are merged, tree after tree, with every non-contradicting
    leaf branch of the following trees, so the number of rules can grow multiplicatively
    with the number of trees. Prints the final number of branches and returns them as a list.
    """
    estimators=ensemble_model.estimators_
    branches=_leaf_branches(estimators[0].tree_) #start from the leaves of the first tree
    for estimator in estimators[1:]: #iterate over the remaining trees
        temp_branches=_leaf_branches(estimator.tree_) #current tree leaves\branches
        #keep the merge of every (branch, leaf) pair that can be merged
        branches=[b.mergeBranch(tb) for b in branches for tb in temp_branches
                  if not b.contradictBranch(tb)]
    print("Total number of branches (rules-sets): " + str(len(branches)))
    return branches
def get_branches_predictions(branches,test_x):
    """
    This function enables the evaluation of the retrieved rules-sets\\branches list.
    It gets a list of branches and a test set as an input and returns the branches predictions
    for the given test set. Predictions are made by iterating over the rules-sets and taking
    the label of the first rule that matches the instance; None is returned for an instance
    that no rule matches.
    """
    branches_predictions=[]
    for i in range(len(test_x)):
        instance=test_x[i]
        #first matching branch, or None when the instance is not covered by any branch
        matching=next((b for b in branches if b.containsInstance(instance)),None)
        branches_predictions.append(None if matching is None else matching.getLabel())
    return branches_predictions

## Stage 2: Functions for building the trees out of the rules sets

In [23]:
def create_thresholds_vector_space(features_upper_thresholds,features_lower_thresholds):
    """
    Helper function for generating the new model dataset. Gets, per feature, all the upper
    and all the lower thresholds that appear in the branches and creates the features-space
    as a set of all relevant columns (except from weight and class).
    Returns (upper_vectors,lower_vectors); each maps feature -> threshold v -> record of
    binary columns. In the upper record the column "[x<feature>] < <v1>" is 1 when v1 >= v;
    in the lower record the column "[x<feature>] > <v1>" is 1 when v1 <= v.
    """
    upper_vectors={k:{} for k in features_upper_thresholds}
    lower_vectors={k:{} for k in features_lower_thresholds}
    for feature,threshold_values in features_upper_thresholds.items():
        for v in threshold_values:
            upper_vectors[feature][v]={"[x"+str(feature)+"] < "+str(v1):(1 if v1 >= v else 0) for v1 in threshold_values}
    for feature,threshold_values in features_lower_thresholds.items():
        for v in threshold_values:
            lower_vectors[feature][v]={"[x"+str(feature)+"] > "+str(v1):(1 if v1 <= v else 0) for v1 in threshold_values}
    return upper_vectors,lower_vectors
def create_new_model_input(branches):
    """
    Create the new model dataset out of the given rules-sets (branches).
    Every branch contributes one record per class label: the binary threshold columns of
    the branch, the label ('assigned_label') and the score of that label ('weight').
    Reads the global NUMBEROFFEATURES. Returns a pandas DataFrame.
    """
    features_upper_thresholds={k:set() for k in range(NUMBEROFFEATURES)} #possible upper thresholds for each feature
    features_lower_thresholds={k:set() for k in range(NUMBEROFFEATURES)} #possible lower thresholds for each feature
    for b in branches: #This loop updates the features space according to the given features in each branch
        for k,v in b.features_upper.items():
            features_upper_thresholds[k].add(v)
        for k,v in b.features_lower.items():
            features_lower_thresholds[k].add(v)
    #upper_vectors and lower_vectors maps an input of feature and value into its relevant vector
    #For example: x1< 2 will return 1 for (x1<2) and also 1 for (x1<3)
    upper_vectors,lower_vectors=create_thresholds_vector_space(features_upper_thresholds,features_lower_thresholds)
    records=[] #This list will include all the records
    for b in branches:
        d={} #d represents a record within the dataset
        for k,v in b.features_upper.items():
            d.update(upper_vectors[k][v])
        for k,v in b.features_lower.items():
            d.update(lower_vectors[k][v])
        for label,weight in b.labels.items():
            #Notice!!! we create a record for every possible class but with different weights
            records.append(dict(d,assigned_label=label,weight=weight))
    ensemble_df=pd.DataFrame(records)
    #remove the lower bound ('>') columns, so only the upper bound ('<') columns stay,
    #and also remove the columns of the infinity thresholds
    redundant_columns=[col for col in ensemble_df.columns if '>' in col or 'inf' in col]
    return ensemble_df.drop(redundant_columns,axis=1)
def fit_new_model(new_model_input_data):
    """
    Function gets the new model input data and fits an sklearn decision tree which is returned
    to the user. The features are all the columns whose name contains '[x' (in their
    DataFrame order), the target is 'assigned_label' and every record is weighted by 'weight'.
    """
    #min_samples_split=2 is the smallest value that current scikit-learn accepts
    #(the former value 1 is rejected); a node needs at least two samples to be split anyway
    new_tree_model=DecisionTreeClassifier(criterion='entropy',min_samples_split=2)
    f_names=[col for col in new_model_input_data.columns if '[x' in col.lower()]
    X=new_model_input_data[f_names].values #DataFrame.as_matrix is not available in current pandas
    y=new_model_input_data['assigned_label']
    new_tree_model.fit(X,y,sample_weight=new_model_input_data['weight'].values)
    return new_tree_model
def create_output_df(test_x,test_y,ensemble_model,new_tree_model,decision_tree_model,new_model_input_data,branches_predictions):
    """
    In this function we build a comparison dataframe. We document the results for the test set:
    the predictions of the ensemble, of the new tree, of the baseline decision tree and
    (when one is given per instance) of the branches, next to the actual label.
    Side effect: stores 'new_model_probas', 'ensemble_probas' and 'decision_tree_depth' in the
    module-level dictionary `output_dict`, which must already exist.
    """
    comparison_df=pd.DataFrame(test_x)
    comparison_df['ensmble_predictions']=ensemble_model.predict(test_x)
    new_model_predictions,new_model_depth,new_model_probas = get_new_model_predictions(new_tree_model,test_x,new_model_input_data)
    output_dict['new_model_probas']=np.array([list(i) for i in new_model_probas])
    output_dict['ensemble_probas']=ensemble_model.predict_proba(test_x)
    comparison_df['new_tree_predictions']=new_model_predictions
    comparison_df['new_model_depth']=new_model_depth
    output_dict['decision_tree_depth']=np.log2(len(decision_tree_model.tree_.n_node_samples))
    comparison_df['actual']=list(test_y)
    if len(branches_predictions)==len(comparison_df):
        comparison_df['branches_predictions']=branches_predictions
    comparison_df['decision_tree_predictions']=decision_tree_model.predict(test_x)
    #Per instance: sum over the ensemble trees of round(log2(index of the reached leaf)).
    #apply() is called once per tree on the whole 2-D test set, because current scikit-learn
    #rejects a single 1-D sample.
    #NOTE(review): log2 of the leaf index is only a proxy for the depth of that leaf - confirm.
    per_tree_depth=[np.round(np.log2(m.apply(test_x))) for m in ensemble_model.estimators_]
    comparison_df['ensemble_depth']=np.sum(per_tree_depth,axis=0)
    return comparison_df

## Tree conversion functions
These functions execute a procedure that is not included in the paper, in which we convert the tree nodes from binary features to numeric thresholds on the original features.

In [9]:
def get_new_features_and_new_thresholds(t,new_model_input_data):
    """
    Translate the nodes of the new tree from the binary threshold columns back to the
    original features. t is the sklearn tree structure of the new model and
    new_model_input_data the dataset it was fitted on.
    Returns (new_features,new_thresholds) with one entry per node of t: the original feature
    index and the numeric threshold that are encoded in the name of the column the node
    splits on. Nodes whose feature is -2 (treated as leaves by the prediction loop) keep -2
    in both lists.
    """
    #must select exactly the columns (and order) that fit_new_model trained on, because
    #t.feature holds indexes into that list
    f_names=[col for col in new_model_input_data.columns if '[x' in col.lower()]
    new_features=[]
    new_thresholds=[]
    for feature in t.feature:
        if feature==-2:
            new_features.append(feature)
            new_thresholds.append(-2)
        else:
            new_f,new_t=get_new_node(feature,f_names)
            new_features.append(new_f)
            new_thresholds.append(new_t)
    return new_features,new_thresholds
def get_new_model_predictions(new_model,test_x,new_model_input_data):
    """
    Predict the test instances with the new tree, using the original numeric features.
    Every instance is routed from the root: at a node it moves to the left child when its
    value of the node's original feature is greater than the node's threshold and to the
    right child otherwise, until a node marked with feature -2 is reached.
    Returns three lists with one entry per instance: the predicted class (argmax of the
    reached node's value), the number of visited split nodes and the reached node's value vector.
    """
    tree=new_model.tree_
    node_features,node_thresholds=get_new_features_and_new_thresholds(tree,new_model_input_data)
    predictions,depths,probas=[],[],[]
    for instance in test_x:
        node=0
        depth=0
        while node_features[node]!=-2:
            depth+=1
            goes_left=instance[node_features[node]]>node_thresholds[node]
            node=tree.children_left[node] if goes_left else tree.children_right[node]
        depths.append(depth)
        predictions.append(np.argmax(tree.value[node]))
        probas.append(tree.value[node][0])
    return predictions,depths,probas
def get_new_node(feature,f_names):
    """
    Decode the binary column f_names[feature], whose name looks like "[x<index>] < <threshold>".
    Returns the original feature index (int) and the threshold (float) taken from that name.
    """
    column_name=f_names[feature]
    index_text=column_name.split("x")[1].split(']')[0] #text between the first 'x' and the following ']'
    threshold_text=column_name.split(' ')[2] #third space-separated token
    return int(index_text),float(threshold_text)

## Run the experiment
The following code runs the experiment that is described in the paper

In [10]:
def _one_hot_encode_features(data):
    #Replace every column except the last one (the class column) with its indicator columns.
    #Every new column is named "<original column>_<category value>".
    for col in data.columns[:-1]:
        temp_df=pd.get_dummies(data[col])
        temp_df.columns=[col+"_"+val for val in temp_df.columns]
        data=data.join(temp_df)
        data=data.drop([col],axis=1)
    return data
def _encode_class_labels(data):
    #Replace the values of data['class'] with integer codes (0..number of classes-1).
    #The code given to each value follows the iteration order of a set, i.e. it is arbitrary.
    class_map={v:k for k,v in enumerate(set(data['class']))}
    data['class']=[class_map[i] for i in data['class']]
    return data
def _load_monk(problem_number):
    #Read the train and the test file of one MONK's problem into a single re-indexed
    #DataFrame; the last column ('x7') is dropped.
    names=['class']
    names.extend(['x'+str(i) for i in range(1,8)])
    file_prefix=DATASETS_PATH+"monks-"+str(problem_number)
    data=pd.concat([pd.read_csv(file_prefix+".train",sep=" ",names=names),
                    pd.read_csv(file_prefix+".test",sep=" ",names=names)])
    data.index=np.arange(len(data))
    return data.drop(['x7'],axis=1)
def create_data_set(s):
    """
    This is sort of configuration function
    The function is called for every dataset that we want to test in our experiment. 's' is the
    name of the dataset. According to 's' we create a dataset (manually configured); except
    for 'iris', the data is read from files under DATASETS_PATH.
    Supported names: breast_cancer, iris, winery, vehicle, car, glass, aust_credit, nurse,
    diabetes, monk1, monk2, monk3, zoo, tic_tac_toe, letter, balance_scale, ecoli, transfusion,
    user_modelling, kohkiloyeh, haberman.
    The rows are shuffled (data.sample(frac=1), no explicit random_state is passed).
    It returns the features matrix X (DataFrame) and the class vector y (numpy array).
    Raises ValueError for an unknown dataset name.
    """
    if s=='breast_cancer':
        names=['code_number','Clump_thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size'
            ,'Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','class']
        data=pd.read_csv(DATASETS_PATH+'breast-cancer-wisconsin.data',names=names)
        data=data[data['Bare Nuclei']!='?'] #rows with a missing value are removed
        data['Bare Nuclei']=[int(i) for i in data['Bare Nuclei']]
        data['class']=[0 if i==2 else 1 for i in data['class']]
    elif s=='iris':
        iris = load_iris()
        data = pd.DataFrame(iris.data[:],columns=iris.feature_names)
        data['class'] = iris.target
    elif s=='winery':
        data=pd.read_csv(DATASETS_PATH+"wine.data",names=['class','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium',
                                     'Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins',
                                    'Color intensity','Hue','OD280/OD315 of diluted wines','Proline'])
        data['class']=[i-1 for i in data['class']]
    elif s=='vehicle':
        column_names=[str(i) for i in range(19)]
        column_names.append('class')
        pathes=glob.glob(DATASETS_PATH+"vehicle_data/xa*.dat")
        #one concatenation of all the parts instead of growing the frame file by file
        data=pd.concat([pd.read_csv(p,sep=" ",names=column_names) for p in pathes],ignore_index=True)
        data['class']=data['18']
        data=data.drop(['18'],axis=1)
        data=_encode_class_labels(data)
    elif s=='car':
        data=pd.read_csv(DATASETS_PATH+"car.data",names=['buying','maint','doors','persons','lug_boot','safety','class'])
        data=_one_hot_encode_features(data)
        data=_encode_class_labels(data)
    elif s=='glass':
        data=pd.read_csv(DATASETS_PATH+"glass.data",names=['RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','class'])
        data=_encode_class_labels(data)
    elif s=='aust_credit':
        names=["A"+str(i) for i in range(1,15)]
        names.append('class')
        data=pd.read_csv(DATASETS_PATH+"australian.dat",sep=" ",names=names)
    elif s=='nurse':
        names=['x'+str(i) for i in range(1,9)]
        names.append('class')
        data=pd.read_csv(DATASETS_PATH+"post-operative.data",names=names)
        data=_one_hot_encode_features(data)
        data=_encode_class_labels(data)
    elif s=='diabetes':
        names=['x'+str(i) for i in range(1,9)]
        names.append('class')
        data=pd.read_csv(DATASETS_PATH+"pima-indians-diabetes.data",names=names)
    elif s=='monk1':
        data=_load_monk(1)
    elif s=='monk2':
        data=_load_monk(2)
    elif s=='monk3':
        data=_load_monk(3)
    elif s=='zoo':
        names=['x'+str(i) for i in range(0,17)]
        names.append('class')
        data=pd.read_csv(DATASETS_PATH+'zoo.data',names=names)
        data=data.drop(['x0'],axis=1)
        data['class']=[i-1 for i in data['class']]
    elif s=='tic_tac_toe':
        names=["x"+str(i) for i in range(1,10)]
        names.append('class')
        #read from DATASETS_PATH like every other dataset (was a hard-coded absolute path)
        data=pd.read_csv(DATASETS_PATH+"tic-tac-toe.data",names=names)
        data=_one_hot_encode_features(data)
        data['class']=[1 if i=='positive' else 0 for i in data['class']]
    elif s=='letter':
        names=['class']
        names.extend(['x'+str(i) for i in range(1,17)])
        data=pd.read_csv(DATASETS_PATH+"letter-recognition.data",names=names)
        data=_encode_class_labels(data)
    elif s=='balance_scale':
        data=pd.read_csv(DATASETS_PATH+"balance-scale.data",names=['class','x1','x2','x3','x4'])
        data=_encode_class_labels(data)
    elif s=='ecoli':
        names=['x'+str(i) for i in range(1,9)]
        names.append('class')
        l=[]
        with open(DATASETS_PATH+"ecoli.data") as f: #the file is closed when the block ends
            for line in f:
                #collapse the runs of spaces that separate the values before splitting
                line=line.replace("\n","").replace("    ","   ").replace("   ","  ").replace("  "," ").split(" ")
                l.append({k:v for k,v in zip(names,line)})
        data=pd.DataFrame(l)
        data=data.drop(['x1'],axis=1)
        data=_encode_class_labels(data)
    elif s=='transfusion':
        data=pd.read_csv(DATASETS_PATH+"transfusion.data")
        data['class']=data['whether he/she donated blood in March 2007']
        data=data.drop(['whether he/she donated blood in March 2007'], axis=1)
    elif s=='user_modelling':
        data=pd.read_csv(DATASETS_PATH+"User_Modeling.csv")
        data['class']=data[' UNS']
        data=_encode_class_labels(data)
        data=data.drop([' UNS'],axis=1)
    elif s=='kohkiloyeh':
        data=pd.read_csv(DATASETS_PATH+"kohkiloyeh.csv")
        data['class']=data['pb']
        data=data.drop(['pb'],axis=1)
        data=_one_hot_encode_features(data)
        data=_encode_class_labels(data)
    elif s=='haberman':
        data=pd.read_csv(DATASETS_PATH+"haberman.data",names=['x1','x2','x3','class'])
        data['class']=[i-1 for i in data['class']]
    else:
        raise ValueError("Unknown dataset name: "+repr(s))
    data=data.sample(frac=1)
    X = data.drop(['class'],axis=1)
    y = data['class'].values
    return X,y
def run_experiment(s,number_of_trees,day_string,max_depth=3,min_leaf_samples=10):
    """
    Run the experiment repeatedly for the given dataset and the given configuration: one run
    for every random seed in range(1,100), i.e. seeds 1..99 (99 runs).
    NOTE(review): the original description said 100 runs - confirm which one is intended.
    The configuration also includes the number of trees and the other hyper-parameters of
    the ensemble (max_depth, min_leaf_samples).
    The result of every run is pickled to
    PICKLES_PATH/<s>/<day_string>_<number_of_trees>trees/seed_<seed>.pkl; a seed whose file
    already exists is skipped. For every executed run the test accuracy of the ensemble, of
    the new tree and of the baseline decision tree is printed (in that order).
    Side effects: sets the globals output_dict, NUMOFLABELS and NUMBEROFFEATURES.
    """
    global output_dict
    global NUMOFLABELS
    global NUMBEROFFEATURES
    #the destination folder does not depend on the seed, so it is prepared once
    dest=PICKLES_PATH+s+"/"+day_string+"_"+str(number_of_trees)+"trees"
    if not os.path.isdir(dest):
        os.makedirs(dest)
    for RANDOM_SEED in range(1,100):
        output_dict={}
        np.random.seed(RANDOM_SEED)
        OUTPUT_PATH=dest+"/seed_"+str(RANDOM_SEED)+".pkl"
        if os.path.isfile(OUTPUT_PATH):
            continue
        X,y=create_data_set(s)
        NUMOFLABELS=len(set(y))
        NUMBEROFFEATURES=len(X.columns)
        train_x,train_y,test_x,test_y=divide_to_train_test(X,y)
        ensemble_model=fit_ensemble_model(train_x,train_y,number_of_trees,max_depth=max_depth,min_leaf_samples=min_leaf_samples)
        decision_tree_model=fit_decision_tree_model(train_x,train_y)
        branches=build_rules_set(ensemble_model)
        branches_predictions=get_branches_predictions(branches,test_x)
        new_model_input_data=create_new_model_input(branches)
        new_tree_model=fit_new_model(new_model_input_data)
        comparison_df=create_output_df(test_x,test_y,ensemble_model,new_tree_model,decision_tree_model,new_model_input_data,branches_predictions)
        create_output_dict(OUTPUT_PATH,train_x,train_y,test_x,test_y,ensemble_model,
                       decision_tree_model,new_tree_model,comparison_df)
        #accuracy = share of the test instances whose prediction equals the actual label
        for predictions_column in ('ensmble_predictions','new_tree_predictions','decision_tree_predictions'):
            print(float(len(comparison_df[comparison_df['actual']==comparison_df[predictions_column]]))/len(comparison_df))
#The experiment driver below is disabled: the calls are wrapped in a bare string literal, so
#they are not executed. The opening delimiter has four quote characters, hence the string
#itself starts with a double quote. Copy the calls out of the string in order to run them.
""""
To use for running the experiment
day_string='10_4_17'
run_experiment('iris',10,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('iris',20,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('breast_cancer',7,day_string)
run_experiment('breast_cancer',10,day_string)
run_experiment('winery',7,day_string)
run_experiment('winery',10,day_string)
run_experiment('aust_credit',7,day_string)
run_experiment('aust_credit',10,day_string)
run_experiment('nurse',7,day_string)
run_experiment('nurse',10,day_string)
run_experiment('diabetes',7,day_string)
run_experiment('diabetes',10,day_string)
run_experiment('zoo',10,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('zoo',20,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('balance_scale',10,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('balance_scale',20,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('transfusion',10,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('transfusion',20,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('kohkiloyeh',10,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('kohkiloyeh',7,day_string,max_depth=10,min_leaf_samples=1)
run_experiment('haberman',7,day_string)
run_experiment('haberman',10,day_string)
run_experiment('user_modelling',10,day_string)
run_experiment('user_modelling',20,day_string,max_depth=10,min_leaf_samples=1)
"""

'"\nTo use for running the experiment\nday_string=\'10_4_17\'\nrun_experiment(\'iris\',10,day_string,max_depth=10,min_leaf_samples=1)\nrun_experiment(\'iris\',20,day_string,max_depth=10,min_leaf_samples=1)\nrun_experiment(\'breast_cancer\',7,day_string)\nrun_experiment(\'breast_cancer\',10,day_string)\nrun_experiment(\'winery\',7,day_string)\nrun_experiment(\'winery\',10,day_string)\nrun_experiment(\'aust_credit\',7,day_string)\nrun_experiment(\'aust_credit\',10,day_string)\nrun_experiment(\'nurse\',7,day_string)\nrun_experiment(\'nurse\',10,day_string)\nrun_experiment(\'diabetes\',7,day_string)\nrun_experiment(\'diabetes\',10,day_string)\nrun_experiment(\'zoo\',10,day_string,max_depth=10,min_leaf_samples=1)\nrun_experiment(\'zoo\',20,day_string,max_depth=10,min_leaf_samples=1)\nrun_experiment(\'balance_scale\',10,day_string,max_depth=10,min_leaf_samples=1)\nrun_experiment(\'balance_scale\',20,day_string,max_depth=10,min_leaf_samples=1)\nrun_experiment(\'transfusion\',10,day_string,max

## An illustrative example - Iris dataset
The following cells present a single run for the Iris dataset. We build an ensemble of 20 trees.

### Load the data

In [24]:
# Load the (shuffled) Iris data and set the two module-level globals that the Branch class
# and the model-input functions read: the number of classes and the number of feature columns.
s='iris'
X,y=create_data_set(s)
NUMOFLABELS=len(set(y))
NUMBEROFFEATURES=len(X.columns)

In [25]:
# Display the feature table; the rows are in the shuffled order returned by create_data_set
# and keep their original index labels.
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
131,7.9,3.8,6.4,2.0
85,6.0,3.4,4.5,1.6
97,6.2,2.9,4.3,1.3
113,5.7,2.5,5.0,2.0
40,5.0,3.5,1.3,0.3
73,6.1,2.8,4.7,1.2
147,6.5,3.0,5.2,2.0
22,4.6,3.6,1.0,0.2
138,6.0,3.0,4.8,1.8
88,5.6,3.0,4.1,1.3


In [26]:
# Display the class labels, in the same row order as X.
y

array([2, 1, 1, 2, 0, 1, 2, 0, 2, 1, 2, 1, 2, 0, 0, 1, 1, 1, 0, 2, 0, 0, 1,
       0, 0, 0, 2, 2, 2, 2, 2, 0, 1, 2, 1, 1, 2, 1, 0, 2, 2, 2, 1, 0, 0, 2,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 2, 2, 0, 2, 0, 0, 1, 1, 0, 0, 1, 2,
       1, 0, 0, 1, 1, 2, 2, 1, 1, 1, 0, 0, 2, 0, 1, 1, 2, 2, 1, 1, 0, 0, 0,
       1, 2, 2, 2, 0, 1, 2, 1, 0, 2, 2, 0, 2, 0, 2, 1, 0, 2, 1, 0, 0, 2, 0,
       2, 1, 1, 2, 1, 0, 0, 1, 2, 2, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 2, 0, 2, 1, 2, 1, 0, 0, 0, 0, 1])

### Fit ensemble model

In [27]:
# First 70% of the rows are the train set, the rest is the test set.
train_x,train_y,test_x,test_y=divide_to_train_test(X,y)
# Random forest of 20 trees with the defaults of fit_ensemble_model (max_depth=3, min_leaf_samples=10).
ensemble_model=fit_ensemble_model(train_x,train_y,20)

### Create the branches

In [28]:
# Stage 1: merge the leaves of all the ensemble trees into one list of branches
# (build_rules_set also prints how many branches were created).
branches=build_rules_set(ensemble_model)

Total number of branches (rules-sets): 2192


In [29]:
# Each printed line lists the bounds of a branch followed by its per-class scores.
print_branches(branches[:20]) # print first 20 rules

[0] <4.95, [2] <1.45, [3] <0.75, {0: 19.687840290381125, 1: 0.20689655172413793, 2: 0.10526315789473684}
[3] >0.751, [0] <4.95, [2] <1.45, [3] <0.8, {0: 18.687840290381125, 1: 1.2068965517241379, 2: 0.10526315789473684}
[3] >0.801, [0] <4.95, [2] <1.45, [3] <1.35, {0: 12.687840290381125, 1: 7.1443965517241379, 2: 0.16776315789473684}
[3] >1.351, [0] <4.95, [2] <1.45, [3] <1.45, {0: 12.687840290381125, 1: 6.909102434077079, 2: 0.40305727554179566}
[3] >1.451, [0] <4.95, [2] <1.45, [3] <1.55, {0: 12.687840290381125, 1: 6.6233881483627934, 2: 0.68877156125608141}
[3] >1.551, [0] <4.95, [2] <1.45, [3] <1.65, {0: 12.687840290381125, 1: 5.973388148362794, 2: 1.3387715612560815}
[3] >1.651, [0] <4.95, [2] <1.45, [3] <1.75, {0: 12.687840290381125, 1: 4.9733881483627931, 2: 2.3387715612560815}
[3] >1.751, [0] <4.95, [2] <1.45, {0: 12.687840290381125, 1: 3.3179259634888436, 2: 3.994233746130031}
[0] >4.951, [0] <5.45, [2] <1.45, [3] <0.75, {0: 19.737547892720308, 1: 0.26245210727969348, 2: 0.0}


### Compare branches predictions with ensemble predictions:

In [30]:
# Predict every test instance with the first branch that contains it and count how many of
# these predictions are equal to the prediction of the ensemble itself.
branches_predictions=get_branches_predictions(branches,test_x)
agreed_predictions_counter=0
for ensemble_pred,branches_pred in zip(ensemble_model.predict(test_x),branches_predictions):
    if ensemble_pred==branches_pred:
        agreed_predictions_counter+=1
# print(...) with a single argument gives the same output on Python 2 and on Python 3
print("Number of observations in test set: "+str(len(branches_predictions)))
print("Number of agreed predictions: "+str(agreed_predictions_counter))

Number of observations in test set: 45
Number of agreed predictions: 45


In [40]:
# create_output_df stores its probability/depth summaries in the module-level output_dict,
# so the dictionary has to exist before the call below.
output_dict={}
# Stage 2: build the dataset out of the branches, fit the new single tree on it and fit the
# baseline decision tree on the train data.
new_model_input_data=create_new_model_input(branches)
decision_tree_model=fit_decision_tree_model(train_x,train_y)
new_tree_model=fit_new_model(new_model_input_data)
comparison_df=create_output_df(test_x,test_y,ensemble_model,new_tree_model,decision_tree_model,new_model_input_data,branches_predictions)
# Show the actual label next to the predictions of the ensemble and of the new tree.
comparison_df[['actual','ensmble_predictions','new_tree_predictions']]

Unnamed: 0,actual,ensmble_predictions,new_tree_predictions
0,0,0,0
1,2,2,2
2,1,1,1
3,0,0,0
4,2,2,2
5,1,1,1
6,0,0,0
7,0,0,0
8,2,2,2
9,0,0,0
