In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

def I(flag):
    """Indicator function: 1 when `flag` is truthy, otherwise 0."""
    if flag:
        return 1
    return 0

def sign(x):
    """Return the sign of `x`: `abs(x)/x` (i.e. +/-1.0) for non-zero input.

    Zero has no sign, so it is mapped to the integer 1; a zero score therefore
    counts as positive.
    """
    if x == 0:
        return 1
    return abs(x) / x

class AdaBoost:
    """Discrete AdaBoost for binary labels in {-1, +1}, built on depth-1 decision trees.

    Each boosting round fits a decision stump on the re-weighted training set,
    gives it the vote weight ``alpha = log((1 - err) / err)`` and increases the
    weight of the samples that stump got wrong.  Prediction is the sign of the
    alpha-weighted sum of the stump outputs.

    Attributes
    ----------
    n_estimators : int
        Number of boosting rounds (stumps).
    models : list
        After `fit`, one ``(alpha, predict_fn)`` pair per round; ``None``
        placeholders before fitting.
    """

    def __init__(self,n_estimators=50):
        self.n_estimators = n_estimators
        self.models = [None]*n_estimators

    def fit(self,X,y):
        """Fit `n_estimators` weighted decision stumps on `(X, y)`.

        Parameters
        ----------
        X : array-like of shape (N, n_features)
            Training samples; converted to float64.
        y : array-like of length N
            Labels; `predict` assumes they are -1 / +1.

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y).ravel()
        N = len(y)
        w = np.full(N, 1.0 / N)

        # Smallest error we allow: keeps alpha finite when a stump is perfect
        # (err == 0) or always wrong (err == 1).
        eps = np.finfo(np.float64).eps

        # Start from a clean slate so a re-fit never mixes old and new stumps.
        self.models = [None] * self.n_estimators

        for m in range(self.n_estimators):

            Gm = DecisionTreeClassifier(max_depth=1).fit(X, y, sample_weight=w).predict

            # One vectorised prediction per round instead of one call per sample.
            miss = Gm(X) != y

            errM = np.sum(w[miss]) / np.sum(w)
            errM = np.clip(errM, eps, 1.0 - eps)

            AlphaM = np.log((1.0 - errM) / errM)

            # Up-weight only the misclassified samples, then renormalise so the
            # weights stay a distribution and cannot drift towards overflow.
            w = w * np.exp(AlphaM * miss)
            w = w / np.sum(w)

            self.models[m] = (AlphaM, Gm)

        return self

    def predict(self,X):
        """Return the predicted label (-1 or +1) for each row of `X`.

        The score is the alpha-weighted sum of all stump predictions; negative
        scores map to -1, everything else (including an exact tie at 0) to +1.
        """
        score = 0
        for AlphaM, Gm in self.models:
            score = score + AlphaM * Gm(X)
        # Direct threshold instead of abs(x)/x, which yields nan for +/-inf scores.
        return np.where(score < 0, -1, 1)

from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix as CM

# Synthetic binary classification problem (no seed: a new data set on every run).
x, y = make_classification(n_samples=217)

# Our implementation of AdaBoost needs the labels y to be in {-1, 1},
# so class 0 is remapped to -1.
y = np.where(y == 0, -1, 1)

# Try 5, 10 or 50 estimators and press Run over and over again.
clf = AdaBoost(n_estimators=5)
clf.fit(x, y)
y_pred = clf.predict(x)

# Scored on the same samples the model was fitted on (training accuracy, in percent).
n_correct = sum(y_pred == y)
print("Performance: ", 100 * n_correct / len(y))
print("Confusion Matrix: \n", CM(y, y_pred))

Performance:  90.78341013824885
Confusion Matrix: 
 [[99 11]
 [ 9 98]]


In [2]:
import numpy as np 
import matplotlib.pyplot as plt
from numpy import *
from sklearn import datasets 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
import random
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import roc_curve, auc


def read_data(run_num, prob, normalise=False):
    """Load the data set for `prob` and split it 60/40 into train and test parts.

    Parameters
    ----------
    run_num : int
        Experiment run index; passed as `random_state` to the split so each
        run gets its own reproducible partition.
    prob : str
        'classifification' (the spelling used in this notebook; the correct
        'classification' is accepted too) reads "datasets/pima.csv" and uses
        the last column as target.  'regression' reads
        'datasets/energy/ENB2012_data.csv' and uses column 8 (heating load)
        as target.  In both cases columns 0-7 are the features.
    normalise : bool, default False
        When True, rescale every sample (row) with sklearn's `Normalizer`
        before splitting.

    Returns
    -------
    x_train, x_test, y_train, y_test

    Raises
    ------
    ValueError
        If `prob` is not a supported problem name.
    """
    if prob in ('classifification', 'classification'):
        # Source: Pima-Indian diabetes dataset: https://www.kaggle.com/kumargh/pimaindiansdiabetescsv
        data_path = "datasets/pima.csv"
        target_col = -1   # target is the last column
    elif prob == 'regression':
        # energy - regression prob
        data_path = 'datasets/energy/ENB2012_data.csv'
        target_col = 8    # just the heating load
    else:
        raise ValueError("unknown problem %r; expected 'classifification' or 'regression'" % (prob,))

    data_in = np.genfromtxt(data_path, delimiter=",")
    data_inputx = data_in[:, 0:8]          # all features 0 - 7
    data_inputy = data_in[:, target_col]

    if normalise:
        # Normalizer is stateless, so fit() learns nothing; fit_transform just rescales rows.
        data_inputx = Normalizer().fit_transform(data_inputx)

    x_train, x_test, y_train, y_test = train_test_split(data_inputx, data_inputy, test_size=0.40, random_state=run_num)

    return x_train, x_test, y_train, y_test
 
    
def scipy_models(x_train, x_test, y_train, y_test, type_model, hidden, learn_rate, run_num, problem):
    """Fit one scikit-learn model and return its score on the test split.

    Despite the name, every model built here comes from scikit-learn.

    Parameters
    ----------
    x_train, x_test, y_train, y_test : array-like
        Train / test features and targets.
    type_model : int
        0 = MLP, 1 = decision tree, 2 = random forest, 3 = AdaBoost,
        4 = gradient boosting.
    hidden : int
        Hidden-layer width of the MLP (the regressor uses ``hidden * 3``).
    learn_rate : float
        Initial learning rate of the MLP; ignored by the other models.
    run_num : int
        Experiment run index; printed, and used as `random_state` for every
        model except the decision tree (which is fixed at 0).
    problem : str
        'classifification' (notebook spelling; 'classification' also accepted)
        or 'regression'.

    Returns
    -------
    float
        Test accuracy for classification, test RMSE for regression.

    Raises
    ------
    ValueError
        If `problem` or `type_model` is not one of the supported values.
    """
    is_classification = problem in ('classifification', 'classification')
    if not is_classification and problem != 'regression':
        raise ValueError("unknown problem %r; expected 'classifification' or 'regression'" % (problem,))
    if type_model not in (0, 1, 2, 3, 4):
        raise ValueError("unknown type_model %r; expected an integer from 0 to 4" % (type_model,))

    print(run_num, ' is our exp run')

    tree_depth = 2

    if is_classification:
        if type_model == 0:    # MLP trained with SGD
            model = MLPClassifier(hidden_layer_sizes=(hidden,), random_state=run_num,
                                  max_iter=100, solver='sgd', learning_rate_init=learn_rate)
        elif type_model == 1:  # https://scikit-learn.org/stable/modules/tree.html  (see how tree can be visualised)
            model = DecisionTreeClassifier(random_state=0, max_depth=tree_depth)
        elif type_model == 2:
            model = RandomForestClassifier(n_estimators=100, max_depth=tree_depth, random_state=run_num)
        elif type_model == 3:
            model = AdaBoostClassifier(n_estimators=100, random_state=run_num)
        else:
            model = GradientBoostingClassifier(n_estimators=10, random_state=run_num)
    else:
        if type_model == 0:    # MLP trained with Adam (not SGD)
            model = MLPRegressor(hidden_layer_sizes=(hidden*3,), random_state=run_num,
                                 max_iter=500, solver='adam', learning_rate_init=learn_rate)
        elif type_model == 1:
            model = DecisionTreeRegressor(random_state=0, max_depth=tree_depth)
        elif type_model == 2:
            model = RandomForestRegressor(n_estimators=100, max_depth=tree_depth, random_state=run_num)
        elif type_model == 3:
            model = AdaBoostRegressor(n_estimators=100, random_state=run_num)
        else:
            model = GradientBoostingRegressor(n_estimators=10, random_state=run_num)

    # Train the model using the training set
    model.fit(x_train, y_train)

    if type_model == 1:
        # Text dump of the fitted tree's split rules
        print(export_text(model))

    # Make predictions using the testing set
    y_pred_test = model.predict(x_test)

    if is_classification:
        return accuracy_score(y_test, y_pred_test)
    return np.sqrt(mean_squared_error(y_test, y_pred_test))


def main(prob='regression', max_expruns=5):
    """Run every model over several train/test splits and print a score summary.

    Parameters
    ----------
    prob : str, default 'regression'
        'classifification' (notebook spelling) or 'regression'.  Classification
        accuracy is reported for classification and RMSE for regression.
    max_expruns : int, default 5
        Number of experiment runs; run `i` uses `i` as the split seed.

    For each model the per-run scores, their mean and their standard deviation
    are printed.
    """
    learn_rate = 0.01
    hidden = 8

    # (label used in the printed summary, type_model code passed to scipy_models)
    model_codes = (
        ('nn', 0),        # MLP
        ('tree', 1),      # decision tree
        ('forest', 2),    # random forest
        ('adaboost', 3),  # AdaBoost
        ('gb', 4),        # gradient boosting
    )
    scores = {label: np.zeros(max_expruns) for label, _ in model_codes}

    print(prob, ' is our problem')

    for run_num in range(max_expruns):

        x_train, x_test, y_train, y_test = read_data(run_num, prob)

        for label, type_model in model_codes:
            scores[label][run_num] = scipy_models(x_train, x_test, y_train, y_test,
                                                  type_model, hidden, learn_rate, run_num, prob)

    for label, _ in model_codes:
        run_scores = scores[label]
        print(run_scores, ' ' + label + '_all')
        print(np.mean(run_scores), ' mean ' + label + '_all')
        print(np.std(run_scores), ' std ' + label + '_all')

    
# Run the experiment when this cell is executed (inside a notebook kernel
# __name__ is '__main__', so the guard is true).
if __name__ == '__main__':
     main() 

regression  is our problem
0  is our exp run
0  is our exp run
|--- feature_0 <= 0.75
|   |--- feature_6 <= 0.18
|   |   |--- value: [11.32]
|   |--- feature_6 >  0.18
|   |   |--- value: [14.82]
|--- feature_0 >  0.75
|   |--- feature_1 <= 624.75
|   |   |--- value: [28.77]
|   |--- feature_1 >  624.75
|   |   |--- value: [37.14]

0  is our exp run
0  is our exp run
0  is our exp run
1  is our exp run
1  is our exp run
|--- feature_1 <= 673.75
|   |--- feature_1 <= 624.75
|   |   |--- value: [28.51]
|   |--- feature_1 >  624.75
|   |   |--- value: [37.22]
|--- feature_1 >  673.75
|   |--- feature_2 <= 330.75
|   |   |--- value: [12.12]
|   |--- feature_2 >  330.75
|   |   |--- value: [15.55]

1  is our exp run
1  is our exp run
1  is our exp run
2  is our exp run
2  is our exp run
|--- feature_1 <= 673.75
|   |--- feature_1 <= 624.75
|   |   |--- value: [28.57]
|   |--- feature_1 >  624.75
|   |   |--- value: [37.18]
|--- feature_1 >  673.75
|   |--- feature_6 <= 0.18
|   |   |--- val

In [3]:
import numpy as np 
import matplotlib.pyplot as plt
from numpy import *
from sklearn import datasets 
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.neural_network import MLPClassifier
import random
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb
from sklearn.metrics import roc_curve, auc

def read_data(run_num, prob, normalise=False):
    """Load the data set for `prob` and split it 60/40 into train and test parts.

    Parameters
    ----------
    run_num : int
        Experiment run index; passed as `random_state` to the split so each
        run gets its own reproducible partition.
    prob : str
        'classifification' (the spelling used in this notebook; the correct
        'classification' is accepted too) reads "datasets/pima.csv" and uses
        the last column as target.  'regression' reads
        'datasets/energy/ENB2012_data.csv' and uses column 8 (heating load)
        as target.  In both cases columns 0-7 are the features.
    normalise : bool, default False
        When True, rescale every sample (row) with sklearn's `Normalizer`
        before splitting.

    Returns
    -------
    x_train, x_test, y_train, y_test

    Raises
    ------
    ValueError
        If `prob` is not a supported problem name.
    """
    if prob in ('classifification', 'classification'):
        # Source: Pima-Indian diabetes dataset: https://www.kaggle.com/kumargh/pimaindiansdiabetescsv
        data_path = "datasets/pima.csv"
        target_col = -1   # target is the last column
    elif prob == 'regression':
        # energy - regression prob
        data_path = 'datasets/energy/ENB2012_data.csv'
        target_col = 8    # just the heating load
    else:
        raise ValueError("unknown problem %r; expected 'classifification' or 'regression'" % (prob,))

    data_in = np.genfromtxt(data_path, delimiter=",")
    data_inputx = data_in[:, 0:8]          # all features 0 - 7
    data_inputy = data_in[:, target_col]

    if normalise:
        # Normalizer is stateless, so fit() learns nothing; fit_transform just rescales rows.
        data_inputx = Normalizer().fit_transform(data_inputx)

    x_train, x_test, y_train, y_test = train_test_split(data_inputx, data_inputy, test_size=0.40, random_state=run_num)

    return x_train, x_test, y_train, y_test
 
    
def scipy_models(x_train, x_test, y_train, y_test, type_model, hidden, learn_rate, run_num, problem):
    """Fit one scikit-learn model and return its score on the test split.

    Despite the name, every model built here comes from scikit-learn.

    Parameters
    ----------
    x_train, x_test, y_train, y_test : array-like
        Train / test features and targets.
    type_model : int
        0 = MLP, 1 = decision tree, 2 = random forest, 3 = AdaBoost,
        4 = gradient boosting.
    hidden : int
        Hidden-layer width of the MLP (the regressor uses ``hidden * 3``).
    learn_rate : float
        Initial learning rate of the MLP; ignored by the other models.
    run_num : int
        Experiment run index; printed, and used as `random_state` for every
        model except the decision tree (which is fixed at 0).
    problem : str
        'classifification' (notebook spelling; 'classification' also accepted)
        or 'regression'.

    Returns
    -------
    float
        Test accuracy for classification, test RMSE for regression.

    Raises
    ------
    ValueError
        If `problem` or `type_model` is not one of the supported values.
    """
    is_classification = problem in ('classifification', 'classification')
    if not is_classification and problem != 'regression':
        raise ValueError("unknown problem %r; expected 'classifification' or 'regression'" % (problem,))
    if type_model not in (0, 1, 2, 3, 4):
        raise ValueError("unknown type_model %r; expected an integer from 0 to 4" % (type_model,))

    print(run_num, ' is our exp run')

    tree_depth = 2

    if is_classification:
        if type_model == 0:    # MLP trained with SGD
            model = MLPClassifier(hidden_layer_sizes=(hidden,), random_state=run_num,
                                  max_iter=100, solver='sgd', learning_rate_init=learn_rate)
        elif type_model == 1:  # https://scikit-learn.org/stable/modules/tree.html (see how tree can be visualised)
            model = DecisionTreeClassifier(random_state=0, max_depth=tree_depth)
        elif type_model == 2:
            model = RandomForestClassifier(n_estimators=100, max_depth=tree_depth, random_state=run_num)
        elif type_model == 3:
            model = AdaBoostClassifier(n_estimators=100, random_state=run_num)
        else:
            model = GradientBoostingClassifier(n_estimators=10, random_state=run_num)
    else:
        if type_model == 0:    # MLP trained with Adam (not SGD)
            model = MLPRegressor(hidden_layer_sizes=(hidden*3,), random_state=run_num,
                                 max_iter=500, solver='adam', learning_rate_init=learn_rate)
        elif type_model == 1:
            model = DecisionTreeRegressor(random_state=0, max_depth=tree_depth)
        elif type_model == 2:
            model = RandomForestRegressor(n_estimators=100, max_depth=tree_depth, random_state=run_num)
        elif type_model == 3:
            model = AdaBoostRegressor(n_estimators=100, random_state=run_num)
        else:
            model = GradientBoostingRegressor(n_estimators=10, random_state=run_num)

    # Train the model using the training set
    model.fit(x_train, y_train)

    if type_model == 1:
        # Text dump of the fitted tree's split rules
        print(export_text(model))

    # Make predictions using the testing set
    y_pred_test = model.predict(x_test)

    if is_classification:
        return accuracy_score(y_test, y_pred_test)
    return np.sqrt(mean_squared_error(y_test, y_pred_test))


def xgboost_models(x_train, x_test, y_train, y_test, type_model, hidden, learn_rate, run_num, problem):
    """Fit an XGBoost model and return its score on the test split.

    Parameters
    ----------
    x_train, x_test, y_train, y_test : array-like
        Train / test features and targets.
    type_model : int
        Only 0 is supported (the single XGBoost configuration defined here).
    hidden, learn_rate :
        Unused; the XGBoost learning rate is fixed at 0.1 below.
    run_num : int
        Experiment run index; only printed (it is not used as a seed here).
    problem : str
        'classifification' (notebook spelling; 'classification' also accepted)
        or 'regression'.

    Returns
    -------
    float
        Test accuracy for classification, test RMSE for regression.

    Raises
    ------
    ValueError
        If `problem` or `type_model` is not one of the supported values.
    """
    is_classification = problem in ('classifification', 'classification')
    if not is_classification and problem != 'regression':
        raise ValueError("unknown problem %r; expected 'classifification' or 'regression'" % (problem,))
    if type_model != 0:
        raise ValueError("unknown type_model %r; only 0 is supported" % (type_model,))

    print(run_num, ' is our exp run')

    if is_classification:
        model = xgb.XGBClassifier(colsample_bytree=0.3, learning_rate=0.1,
                                  max_depth=5, alpha=5, n_estimators=100)
    else:
        # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
        model = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                                 max_depth=5, alpha=5, n_estimators=100)

    # Train the model using the training set
    model.fit(x_train, y_train)

    # Make predictions using the testing set
    y_pred_test = model.predict(x_test)

    if is_classification:
        return accuracy_score(y_test, y_pred_test)
    return np.sqrt(mean_squared_error(y_test, y_pred_test))


def main(prob='classifification', max_expruns=5):
    """Run every model over several train/test splits and print a score summary.

    Parameters
    ----------
    prob : str, default 'classifification'
        'classifification' (notebook spelling) or 'regression'.  Classification
        accuracy is reported for classification and RMSE for regression.
    max_expruns : int, default 5
        Number of experiment runs; run `i` uses `i` as the split seed.

    For each model the per-run scores, their mean and their standard deviation
    are printed.
    """
    learn_rate = 0.01
    hidden = 8

    # (label used in the printed summary, function that fits and scores the
    #  model, type_model code passed to that function)
    model_specs = (
        ('nn', scipy_models, 0),        # MLP
        ('tree', scipy_models, 1),      # decision tree
        ('forest', scipy_models, 2),    # random forest
        ('adaboost', scipy_models, 3),  # AdaBoost
        ('gb', scipy_models, 4),        # gradient boosting
        ('xg', xgboost_models, 0),      # XGBoost
    )
    scores = {label: np.zeros(max_expruns) for label, _, _ in model_specs}

    print(prob, ' is our problem')

    for run_num in range(max_expruns):

        x_train, x_test, y_train, y_test = read_data(run_num, prob)

        for label, fit_and_score, type_model in model_specs:
            scores[label][run_num] = fit_and_score(x_train, x_test, y_train, y_test,
                                                   type_model, hidden, learn_rate, run_num, prob)

    for label, _, _ in model_specs:
        run_scores = scores[label]
        print(run_scores, ' ' + label + '_all')
        print(np.mean(run_scores), ' mean ' + label + '_all')
        print(np.std(run_scores), ' std ' + label + '_all')


# Run the experiment when this cell is executed (inside a notebook kernel
# __name__ is '__main__', so the guard is true).
if __name__ == '__main__':
     main() 

classifification  is our problem
0  is our exp run
0  is our exp run
|--- feature_1 <= 154.50
|   |--- feature_7 <= 28.50
|   |   |--- class: 0.0
|   |--- feature_7 >  28.50
|   |   |--- class: 0.0
|--- feature_1 >  154.50
|   |--- feature_5 <= 29.95
|   |   |--- class: 0.0
|   |--- feature_5 >  29.95
|   |   |--- class: 1.0

0  is our exp run
0  is our exp run
0  is our exp run
0  is our exp run




1  is our exp run
1  is our exp run
|--- feature_1 <= 130.50
|   |--- feature_7 <= 27.50
|   |   |--- class: 0.0
|   |--- feature_7 >  27.50
|   |   |--- class: 0.0
|--- feature_1 >  130.50
|   |--- feature_5 <= 33.25
|   |   |--- class: 0.0
|   |--- feature_5 >  33.25
|   |   |--- class: 1.0

1  is our exp run
1  is our exp run
1  is our exp run
1  is our exp run
2  is our exp run




2  is our exp run
|--- feature_1 <= 127.50
|   |--- feature_0 <= 4.50
|   |   |--- class: 0.0
|   |--- feature_0 >  4.50
|   |   |--- class: 0.0
|--- feature_1 >  127.50
|   |--- feature_1 <= 165.50
|   |   |--- class: 1.0
|   |--- feature_1 >  165.50
|   |   |--- class: 1.0

2  is our exp run
2  is our exp run
2  is our exp run
2  is our exp run
3  is our exp run




3  is our exp run
|--- feature_1 <= 144.50
|   |--- feature_7 <= 28.50
|   |   |--- class: 0.0
|   |--- feature_7 >  28.50
|   |   |--- class: 0.0
|--- feature_1 >  144.50
|   |--- feature_1 <= 166.50
|   |   |--- class: 1.0
|   |--- feature_1 >  166.50
|   |   |--- class: 1.0

3  is our exp run
3  is our exp run
3  is our exp run
3  is our exp run
4  is our exp run




4  is our exp run
|--- feature_1 <= 127.50
|   |--- feature_7 <= 28.50
|   |   |--- class: 0.0
|   |--- feature_7 >  28.50
|   |   |--- class: 0.0
|--- feature_1 >  127.50
|   |--- feature_5 <= 29.95
|   |   |--- class: 0.0
|   |--- feature_5 >  29.95
|   |   |--- class: 1.0

4  is our exp run
4  is our exp run
4  is our exp run
4  is our exp run
[0.66558442 0.6461039  0.66558442 0.59415584 0.66558442]  nn_all
0.6474025974025974  mean nn_all
0.02767178669176953  std nn_all
[0.72402597 0.70454545 0.73051948 0.71428571 0.78896104]  tree_all
0.7324675324675325  tree _all
0.029586456493232507  tree _all
[0.74025974 0.75       0.74025974 0.71753247 0.78896104] 8  forest_all




0.7474025974025974  forest _all
0.02335857889429213  forest _all
[0.76298701 0.77597403 0.74025974 0.74675325 0.77272727] adaboost_all
0.7597402597402597  adaboost _all
0.014077586616025192  adaboost_all
[0.74675325 0.74675325 0.74675325 0.72402597 0.80519481] gb_all
0.753896103896104  gb _all
0.02711767924392353  gb_all
[0.75974026 0.76948052 0.76623377 0.74025974 0.78571429] xg_all
0.7642857142857142  xg _all
0.014750411287792906  xg_all


In [3]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.0-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0
Note: you may need to restart the kernel to use updated packages.
