In [1]:
#Disable warning of Ripper implementation
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedShuffleSplit
from sklearn import tree, svm
from sklearn.model_selection import cross_validate
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn_evaluation import plot
import seaborn as sns
import time

In [2]:
data = arff.loadarff('ionosphere.arff')
df_iono = pd.DataFrame(data[0])

data = arff.loadarff('diabetes.arff')
df_diabe = pd.DataFrame(data[0])

data = arff.loadarff('vehicle.arff')
df_Vehicle = pd.DataFrame(data[0])

data = arff.loadarff('vowel.arff')
df_vowel = pd.DataFrame(data[0])

data = arff.loadarff('iris.arff')
df_iris = pd.DataFrame(data[0])

data = arff.loadarff('letter.arff')
df_letter = pd.DataFrame(data[0])

In [3]:
# Changing the last categorical class value into a numerical value
df_iono['class'] = pd.factorize(df_iono['class'])[0]

# Changing the last categorical class value into a numerical value
df_diabe['class'] = pd.factorize(df_diabe['class'])[0]

# Changing the last categorical class value into a numerical value
df_Vehicle['Class'] = pd.factorize(df_Vehicle['Class'])[0]

# Changing the last categorical class value into a numerical value
df_vowel['Class'] = pd.factorize(df_vowel['Class'])[0]

# Changing the last categorical class value into a numerical value
df_iris['class'] = pd.factorize(df_iris['class'])[0]

# Changing the last categorical class value into a numerical value
df_letter['class'] = pd.factorize(df_letter['class'])[0]


In [4]:
trainIono, testIono = train_test_split(df_iono, test_size=.25)
X_trainIono = trainIono.drop('class', axis=1)
y_trainIono = trainIono['class']
X_testIono = testIono.drop('class', axis=1)
y_testIono = testIono['class']

trainDiabe, testDiabe = train_test_split(df_diabe, test_size=.25)
X_trainDiabe = trainDiabe.drop('class', axis=1)
y_trainDiabe = trainDiabe['class']
X_testDiabe = testDiabe.drop('class', axis=1)
y_testDiabe = testDiabe['class']

trainVehicle, testVehicle = train_test_split(df_Vehicle, test_size=.25)
X_trainVehicle = trainVehicle.drop('Class', axis=1)
y_trainVehicle = trainVehicle['Class']
X_testVehicle = testVehicle.drop('Class', axis=1)
y_testVehicle = testVehicle['Class']

trainVowel, testVowel = train_test_split(df_vowel, test_size=.25)
X_trainVowel = trainVowel.drop('Class', axis=1)
y_trainVowel = trainVowel['Class']
X_testVowel = testVowel.drop('Class', axis=1)
y_testVowel = testVowel['Class']

trainIris, testIris = train_test_split(df_iris, test_size=.25)
X_trainIris = trainIris.drop('class', axis=1)
y_trainIris = trainIris['class']
X_testIris = testIris.drop('class', axis=1)
y_testIris = testIris['class']

trainLetter, testLetter = train_test_split(df_letter, test_size=.25)
X_trainLetter = trainLetter.drop('class', axis=1)
y_trainLetter = trainLetter['class']
X_testLetter = testLetter.drop('class', axis=1)
y_testLetter = testLetter['class']

In [5]:
data = [
    ('Ionosphere', X_trainIono, y_trainIono, X_testIono, y_testIono),
    ('Diabetes', X_trainDiabe, y_trainDiabe,X_testDiabe, y_testDiabe),
    ('Vehicle', X_trainVehicle, y_trainVehicle, X_testVehicle, y_testVehicle),
    ('Vowel', X_trainVowel, y_trainVowel, X_testVowel, y_testVowel),
    ('Iris', X_trainIris, y_trainIris, X_testIris, y_testIris),
    ('Letter', X_trainLetter, y_trainLetter, X_testLetter, y_testLetter)
]

# Comparison without using GridSearch

In [6]:
clf_tree = tree.DecisionTreeClassifier()
clf_svm = svm.SVC()
clf_BaggingTree = BaggingClassifier(estimator=clf_tree)
clf_AdaBoostSAMMETree = AdaBoostClassifier(estimator=clf_tree,algorithm='SAMME')
clf_AdaBoostSAMMERTree = AdaBoostClassifier(estimator=clf_tree,algorithm='SAMME.R')
clf_BaggingSVM = BaggingClassifier(estimator=clf_svm)
clf_AdaBoostSAMMESVM = AdaBoostClassifier(estimator=clf_svm,algorithm='SAMME')
clf_AdaBoostSAMMERSVM = AdaBoostClassifier(estimator=clf_svm,algorithm='SAMME.R')
clf_GradBoost = GradientBoostingClassifier()

In [8]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreSVM = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_svm,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreSVM.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreSVM)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.9272079772079772 +- 0.03655976731195557
Tiempo medio en ejecutarse el método (train): 0.006222295761108399 +- 0.006222295761108399s
Tiempo medio en ejecutarse el método (score): 0.002826356887817383 +- 0.002826356887817383s
Tiempo en ejecutarse la búsqueda 3.044736623764038s, (0.0507456103960673 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.7551119177253478 +- 0.06743984851311845
Tiempo medio en ejecutarse el método (train): 0.014174675941467286 +- 0.014174675941467286s
Tiempo medio en ejecutarse el método (score): 0.0041184663772583004 +- 0.0041184663772583004s
Tiempo en ejecutarse la búsqueda 1.0964512825012207s, (0.01827418804168701 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.8232638888888889 +- 0.03830704516341843
Tiempo medio en ejecutarse el método (train): 0.01577138900756836 +- 0.01577138900756836s
Tiempo medio en ejecutarse el método

In [9]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreTree = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_tree,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreTree.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreTree)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.8703703703703702 +- 0.04587248486639492
Tiempo medio en ejecutarse el método (train): 0.013461613655090332 +- 0.013461613655090332s
Tiempo medio en ejecutarse el método (score): 0.0026299476623535155 +- 0.0026299476623535155s
Tiempo en ejecutarse la búsqueda 0.13106179237365723s, (0.0021843632062276204 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.6803992740471869 +- 0.0436210209538901
Tiempo medio en ejecutarse el método (train): 0.007199478149414062 +- 0.007199478149414062s
Tiempo medio en ejecutarse el método (score): 0.0022942066192626954 +- 0.0022942066192626954s
Tiempo en ejecutarse la búsqueda 0.058751821517944336s, (0.0009791970252990723 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.927405753968254 +- 0.016273716056452166
Tiempo medio en ejecutarse el método (train): 0.00684504508972168 +- 0.00684504508972168s
Tiempo medio en ejecutarse

In [10]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreBaggingSVM = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_BaggingSVM,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreBaggingSVM.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreBaggingSVM)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.9272079772079772 +- 0.03655976731195557
Tiempo medio en ejecutarse el método (train): 0.044843220710754396 +- 0.044843220710754396s
Tiempo medio en ejecutarse el método (score): 0.007536458969116211 +- 0.007536458969116211s
Tiempo en ejecutarse la búsqueda 0.10985589027404785s, (0.0018309315045674643 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.7602843315184512 +- 0.060405141892844516
Tiempo medio en ejecutarse el método (train): 0.06925463676452637 +- 0.06925463676452637s
Tiempo medio en ejecutarse el método (score): 0.01099848747253418 +- 0.01099848747253418s
Tiempo en ejecutarse la búsqueda 0.1377706527709961s, (0.0022961775461832683 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.8169394841269841 +- 0.03453561142295125
Tiempo medio en ejecutarse el método (train): 0.07515547275543213 +- 0.07515547275543213s
Tiempo medio en ejecutarse el méto

In [11]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreBaggingTree = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_BaggingTree,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreBaggingTree.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreBaggingTree)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.9122507122507123 +- 0.04213828352019221
Tiempo medio en ejecutarse el método (train): 0.0751305341720581 +- 0.0751305341720581s
Tiempo medio en ejecutarse el método (score): 0.0045659542083740234 +- 0.0045659542083740234s
Tiempo en ejecutarse la búsqueda 0.11945891380310059s, (0.001990981896718343 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.7535390199637023 +- 0.05073341056061811
Tiempo medio en ejecutarse el método (train): 0.04737944602966308 +- 0.04737944602966308s
Tiempo medio en ejecutarse el método (score): 0.004109096527099609 +- 0.004109096527099609s
Tiempo en ejecutarse la búsqueda 0.07030749320983887s, (0.0011717915534973145 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.9509920634920634 +- 0.024100507774603035
Tiempo medio en ejecutarse el método (train): 0.057846474647521975 +- 0.057846474647521975s
Tiempo medio en ejecutarse el mé

In [12]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreBoosting1_SVM = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_AdaBoostSAMMESVM,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreBoosting1_SVM.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreBoosting1_SVM)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.6159544159544159 +- 0.009602449883961674
Tiempo medio en ejecutarse el método (train): 0.14536588191986083 +- 0.14536588191986083s
Tiempo medio en ejecutarse el método (score): 0.010915851593017578 +- 0.010915851593017578s
Tiempo en ejecutarse la búsqueda 0.5188491344451904s, (0.008647485574086507 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.6458560193587417 +- 0.006837393933965337
Tiempo medio en ejecutarse el método (train): 0.29889588356018065 +- 0.29889588356018065s
Tiempo medio en ejecutarse el método (score): 0.019411635398864747 +- 0.019411635398864747s
Tiempo en ejecutarse la búsqueda 1.3182523250579834s, (0.021970872084299722 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.7602678571428572 +- 0.005332283591419313
Tiempo medio en ejecutarse el método (train): 0.2613798141479492 +- 0.2613798141479492s
Tiempo medio en ejecutarse el método 

In [13]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreBoosting1_Tree = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_AdaBoostSAMMETree,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreBoosting1_Tree.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreBoosting1_Tree)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.8628205128205128 +- 0.04613735592295677
Tiempo medio en ejecutarse el método (train): 0.00831308364868164 +- 0.00831308364868164s
Tiempo medio en ejecutarse el método (score): 0.001977705955505371 +- 0.001977705955505371s
Tiempo en ejecutarse la búsqueda 0.11420869827270508s, (0.0019034783045450846 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.6804900181488203 +- 0.04820611422491956
Tiempo medio en ejecutarse el método (train): 0.006333398818969727 +- 0.006333398818969727s
Tiempo medio en ejecutarse el método (score): 0.0016691446304321288 +- 0.0016691446304321288s
Tiempo en ejecutarse la búsqueda 0.025470256805419922s, (0.00042450428009033203 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.9353422619047619 +- 0.010934687667086433
Tiempo medio en ejecutarse el método (train): 0.005640053749084472 +- 0.005640053749084472s
Tiempo medio en ejecutars

AdaBoost with SVM and SAMMER_R is not supported

In [14]:
"""
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreBoosting2_SVM = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_AdaBoostSAMMERSVM,i[1], y=i[2],cv=10, n_jobs=-1, error_score='raise')
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreBoosting2_SVM.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=Score)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())
"""

'\nDatasets = []\nTimeTrain = []\nTimeScore = []\nTimeCV = []\nScoreBoosting2_SVM = []\nfor i in data:\n    init = time.time()\n    cv = cross_validate(clf_AdaBoostSAMMERSVM,i[1], y=i[2],cv=10, n_jobs=-1, error_score=\'raise\')\n    end = time.time()\n    timeCV = end - init\n    print(f"\n--------- {i[0]} ---------")\n    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv[\'test_score\'])} +- {np.std(cv[\'test_score\'])}")\n    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv[\'fit_time\'])} +- {np.mean(np.mean(cv[\'fit_time\']))}s")\n    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv[\'score_time\'])} +- {np.mean(np.mean(cv[\'score_time\']))}s")\n    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")\n    Datasets.append(i[0])\n    TimeTrain.append(np.mean(cv[\'fit_time\']))\n    TimeScore.append(np.mean(cv[\'score_time\']))\n    TimeCV.append(timeCV)\n    ScoreBoosting2_SVM.append(np.mean(cv[\'test_score\']))\n\nm

In [15]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreBoosting2_Tree = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_AdaBoostSAMMERTree,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreBoosting2_Tree.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreBoosting2_Tree)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.8665242165242166 +- 0.05491282085574618
Tiempo medio en ejecutarse el método (train): 0.007663512229919433 +- 0.007663512229919433s
Tiempo medio en ejecutarse el método (score): 0.0013726472854614258 +- 0.0013726472854614258s
Tiempo en ejecutarse la búsqueda 0.43686938285827637s, (0.007281156380971273 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.6752571082879613 +- 0.05043957376081613
Tiempo medio en ejecutarse el método (train): 0.004450702667236328 +- 0.004450702667236328s
Tiempo medio en ejecutarse el método (score): 0.001280808448791504 +- 0.001280808448791504s
Tiempo en ejecutarse la búsqueda 0.5096859931945801s, (0.008494766553243 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.9353670634920634 +- 0.012828535622418924
Tiempo medio en ejecutarse el método (train): 0.004993653297424317 +- 0.004993653297424317s
Tiempo medio en ejecutarse el m

In [16]:
Datasets = []
TimeTrain = []
TimeScore = []
TimeCV = []
ScoreGradBoost = []
for i in data:
    init = time.time()
    cv = cross_validate(clf_GradBoost,i[1], y=i[2],cv=10, n_jobs=-1)
    end = time.time()
    timeCV = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"Score de la Validación Cruzada:\n   score = {np.mean(cv['test_score'])} +- {np.std(cv['test_score'])}")
    print(f"Tiempo medio en ejecutarse el método (train): {np.mean(cv['fit_time'])} +- {np.mean(np.mean(cv['fit_time']))}s")
    print(f"Tiempo medio en ejecutarse el método (score): {np.mean(cv['score_time'])} +- {np.mean(np.mean(cv['score_time']))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeCV}s, ({timeCV/60} min)")
    Datasets.append(i[0])
    TimeTrain.append(np.mean(cv['fit_time']))
    TimeScore.append(np.mean(cv['score_time']))
    TimeCV.append(timeCV)
    ScoreGradBoost.append(np.mean(cv['test_score']))

my_dict = dict(Dataset=Datasets,TimeTrain=TimeTrain, TimeScore=TimeScore,TimeCV=TimeCV, Score=ScoreGradBoost)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
Score de la Validación Cruzada:
   score = 0.9235042735042736 +- 0.04227218477942589
Tiempo medio en ejecutarse el método (train): 0.18839969635009765 +- 0.18839969635009765s
Tiempo medio en ejecutarse el método (score): 0.0014230012893676758 +- 0.0014230012893676758s
Tiempo en ejecutarse la búsqueda 0.21954035758972168s, (0.0036590059598286947 min)

--------- Diabetes ---------
Score de la Validación Cruzada:
   score = 0.7654869933454325 +- 0.05773715833974873
Tiempo medio en ejecutarse el método (train): 0.10426268577575684 +- 0.10426268577575684s
Tiempo medio en ejecutarse el método (score): 0.0012633562088012694 +- 0.0012633562088012694s
Tiempo en ejecutarse la búsqueda 0.1313488483428955s, (0.002189147472381592 min)

--------- Vehicle ---------
Score de la Validación Cruzada:
   score = 0.9542410714285714 +- 0.021733111734585763
Tiempo medio en ejecutarse el método (train): 0.1734703302383423 +- 0.1734703302383423s
Tiempo medio en ejecutarse el mét

In [17]:
compareDF = pd.DataFrame()
compareDF['SVM'] = ScoreSVM
compareDF['Tree'] = ScoreTree
compareDF['BaggingSVM'] = ScoreBaggingSVM
compareDF['BaggingTree'] = ScoreBaggingTree
compareDF['Boosting1_SVM'] = ScoreBoosting1_SVM
compareDF['Boosting1_Tree'] = ScoreBoosting1_Tree
compareDF['Boosting2_Tree'] = ScoreBoosting2_Tree
compareDF['GradBoost'] = ScoreGradBoost
compareDF.to_csv("performance.csv", index=False)
print(compareDF)
print(compareDF.to_latex())

        SVM      Tree  BaggingSVM  BaggingTree  Boosting1_SVM  Boosting1_Tree  \
0  0.927208  0.870370    0.927208     0.912251       0.615954        0.862821   
1  0.755112  0.680399    0.760284     0.753539       0.645856        0.680490   
2  0.823264  0.927406    0.816939     0.950992       0.760268        0.935342   
3  0.946054  0.975712    0.950072     0.983820       0.916342        0.978414   
4  0.981818  0.954545    0.972727     0.945455       0.366667        0.954545   
5  0.920600  0.862200    0.920600     0.922200       0.080467        0.864867   

   Boosting2_Tree  GradBoost  
0        0.866524   0.923504  
1        0.675257   0.765487  
2        0.935367   0.954241  
3        0.979766   0.987874  
4        0.954545   0.954545  
5        0.863333   0.917333  
\begin{tabular}{lrrrrrrrr}
\toprule
{} &       SVM &      Tree &  BaggingSVM &  BaggingTree &  Boosting1\_SVM &  Boosting1\_Tree &  Boosting2\_Tree &  GradBoost \\
\midrule
0 &  0.927208 &  0.870370 &    0.927208 & 

In [18]:
import numpy as np
from scipy.stats import chi2

def compute_iman_davenport_statistic(performance_matrix):
  # Compute the ranks of the model performance on each dataset
  ranks = np.apply_along_axis(lambda x: len(x) - np.argsort(np.argsort(x)), 1, performance_matrix)

  # Sum the ranks for each model across all datasets
  rank_sums = np.sum(ranks, axis=1)

  # Compute the iman Davenport statistic
  iman_davenport_statistic = (np.max(rank_sums) - np.min(rank_sums)) / performance_matrix.shape[1]

  return iman_davenport_statistic

def compute_p_value(iman_davenport_statistic, num_models, num_datasets):
  # Compute the degrees of freedom for the iman Davenport test
  df = num_models - 1

  # Compute the p-value using the chi-squared distribution
  p_value = 1 - chi2.cdf(iman_davenport_statistic, df)

  return p_value

def iman_davenport_test(performance_matrix, significance_level):
  # Compute the iman Davenport statistic and p-value
  iman_davenport_statistic = compute_iman_davenport_statistic(performance_matrix)
  p_value = compute_p_value(iman_davenport_statistic, performance_matrix.shape[0], performance_matrix.shape[1])

  # Determine whether the difference in performance between the models is statistically significant
  if p_value < significance_level:
    print(f"The difference in performance between the models is statistically significant (p = {p_value:.3f})")
  else:
    print(f"The difference in performance between the models is not statistically significant (p = {p_value:.3f})")
  return p_value

In [19]:
performance_matrix = pd.read_csv("performance.csv")
# Run the iman Davenport test
p_value = iman_davenport_test(performance_matrix, 0.05)

The difference in performance between the models is not statistically significant (p = 1.000)


In [20]:
# If there are significant differences between the models, then apply the wilcoxon test to determine which models are significantly different
if p_value < 0.05:
  # Compute the pairwise differences between the models
  pairwise_differences = np.apply_along_axis(lambda x: x - x[:, None], 1, performance_matrix)

  # Compute the p-values for the pairwise differences using the Wilcoxon signed-rank test
  p_values = np.apply_along_axis(lambda x: wilcoxon(x, zero_method="wilcox")[1], 1, pairwise_differences)

  # Compute the Bonferroni correction
  bonferroni_correction = 0.05 / (p_values.shape[0] * (p_values.shape[0] - 1) / 2)

  # Determine which models are significantly different
  significant_differences = np.argwhere(p_values < bonferroni_correction)

  # Print the significant differences
  for difference in significant_differences:
    print(f"Model {difference[0]} is significantly different from model {difference[1]}")

# Comparison using GridSearch

In [27]:
parametersSVM = [
    {"kernel": ["rbf"], "C": [1, 10, 100, 1000], 'gamma': [0.01, 0.1, 1]},
    {"kernel": ["linear"], "C": [1, 10, 100, 1000]}
]
parametersTree = {
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random']
}
parametersSVMBagging = [
    {
    'n_estimators':[10], # Numero de estimators = 10 porque si no tarda demasiado
    "estimator__kernel": ["rbf"],
    "estimator__C": [1, 10, 100, 1000],
    'estimator__gamma': [0.01, 0.1, 1],
    #'max_samples': [0.75, 1]
    # Saltan warnings si aleatoriamente solo seleccionamos instancias de una clase. Podríamos ignorar dichos fits o capturar los warning. Eliminamos el problema directamente
    'max_features': [0.5, 0.75, 1],
    'bootstrap': [True, False]
    },
    {
    'n_estimators':[10], # Numero de estimators = 10 porque si no tarda demasiado
    "estimator__kernel": ["linear"],
    "estimator__C": [1, 10, 100, 1000],
    #'max_samples': [0.75, 1]
    # Saltan warnings si aleatoriamente solo seleccionamos instancias de una clase. Podríamos ignorar dichos fits o capturar los warning. Eliminamos el problema directamente
    'max_features': [0.5, 0.75, 1],
    'bootstrap': [True, False]
    }
]
parametersTreeBagging = {
    'estimator__criterion':['gini', 'entropy', 'log_loss'],
    'estimator__splitter' : ['best', 'random'],
    # {'max_samples': [0.75, 1],
    'max_features': [0.5, 0.75, 1],
    'bootstrap': [True, False]
}
parametersSVMBoosting = [
    {
    "estimator__kernel": ["rbf"],
    "estimator__C": [1, 10, 100, 1000],
    'estimator__gamma': [0.01, 0.1, 1],
    'n_estimators':[25, 50],
    'learning_rate':[0.1, 1, 10]
    },{
    "estimator__kernel": ["linear"],
    "estimator__C": [1, 10, 100, 1000],
    'n_estimators':[25, 50, 75],
    'learning_rate':[0.1, 1, 10]
    }
]
parametersTreeBoosting = {
    'estimator__criterion':['gini', 'entropy', 'log_loss'],
    'estimator__splitter' : ['best', 'random'],
    'n_estimators':[25, 50],
    'learning_rate':[0.1,1,10]
}
parametersGradBoosting = {
    "loss" : ['log_loss', 'deviance', 'exponential'],
    'learning_rate' : [0.01, 0.1, 1],
    'n_estimators': [100],
    'criterion': ['friedman_mse', 'squared_error'],
    'warm_start': [True, False]
}

optimalSVM = GridSearchCV(estimator=clf_svm, cv=10, param_grid=parametersSVM, n_jobs=-1)
optimalTree = GridSearchCV(estimator=clf_tree, cv=10, param_grid=parametersTree, n_jobs=-1)
optimalSVMBag = GridSearchCV(estimator=clf_BaggingSVM, cv=10, param_grid=parametersSVMBagging, n_jobs=-1)
optimalTreeBag = GridSearchCV(estimator=clf_BaggingTree, cv=10, param_grid=parametersTreeBagging, n_jobs=-1)
optimalSVMBoost1 = GridSearchCV(estimator=clf_AdaBoostSAMMESVM, cv=10, param_grid=parametersSVMBoosting, n_jobs=-1)
optimalTreeBoost1 = GridSearchCV(estimator=clf_AdaBoostSAMMETree, cv=10, param_grid=parametersTreeBoosting, n_jobs=-1)
optimalSVMBoost2 = GridSearchCV(estimator=clf_AdaBoostSAMMERSVM, cv=10, param_grid=parametersTreeBoosting, n_jobs=-1)
optimalTreeBoost2 = GridSearchCV(estimator=clf_AdaBoostSAMMERTree, cv=10, param_grid=parametersTreeBoosting, n_jobs=-1)
optimalGradBoost = GridSearchCV(estimator=clf_GradBoost, cv=10, param_grid=parametersGradBoosting, n_jobs=-1)

In [28]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreSVM = []
for i in data:
    init = time.time()
    optimalSVM.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente SVM:")
    print(f'    Best params -> {optimalSVM.best_params_}')
    print(f'    Best score -> {optimalSVM.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalSVM.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalSVM.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalSVM.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalSVM.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreSVM.append(optimalSVM.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreSVM)
SVMDF = pd.DataFrame (my_dict)
print(SVMDF.to_latex())


--------- Ionosphere ---------
La mejor accuracy se obtuvo con el siguiente SVM:
    Best params -> {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
    Best score -> 0.931054131054131
Si usamos el dataset de test, obtenemos el siguiente resultado:
    score = 0.9886363636363636
Tiempo medio en ejecutarse el método: 0.01968134343624115 +- 0.007482357550970337s
Tiempo en ejecutarse la búsqueda 2.148488759994507s, (0.03580814599990845 min)

--------- Diabetes ---------
La mejor accuracy se obtuvo con el siguiente SVM:
    Best params -> {'C': 10, 'kernel': 'linear'}
    Best score -> 0.7724137931034483
Si usamos el dataset de test, obtenemos el siguiente resultado:
    score = 0.7708333333333334
Tiempo medio en ejecutarse el método: 10.642622084915638 +- 3.6427488372419266s
Tiempo en ejecutarse la búsqueda 177.8575484752655s, (2.9642924745877584 min)

--------- Vehicle ---------
La mejor accuracy se obtuvo con el siguiente SVM:
    Best params -> {'C': 1, 'kernel': 'linear'}
    Best score -> 0.

In [29]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreTree = []
for i in data:
    init = time.time()
    optimalTree.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente Tree:")
    print(f'    Best params -> {optimalTree.best_params_}')
    print(f'    Best score -> {optimalTree.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalTree.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalTree.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalTree.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalTree.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreTree.append(optimalTree.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreTree)
TreeDF = pd.DataFrame (my_dict)
print(TreeDF.to_latex())


--------- Ionosphere ---------
La mejor accuracy se obtuvo con el siguiente Tree:
    Best params -> {'criterion': 'log_loss', 'splitter': 'random'}
    Best score -> 0.8971509971509972
Si usamos el dataset de test, obtenemos el siguiente resultado:
    score = 0.8863636363636364
Tiempo medio en ejecutarse el método: 0.004364693164825439 +- 0.0005929095087477455s
Tiempo en ejecutarse la búsqueda 0.19702720642089844s, (0.003283786773681641 min)

--------- Diabetes ---------
La mejor accuracy se obtuvo con el siguiente Tree:
    Best params -> {'criterion': 'entropy', 'splitter': 'best'}
    Best score -> 0.7065033272837267
Si usamos el dataset de test, obtenemos el siguiente resultado:
    score = 0.7447916666666666
Tiempo medio en ejecutarse el método: 0.003492077191670736 +- 0.0002670651082585429s
Tiempo en ejecutarse la búsqueda 0.08609962463378906s, (0.0014349937438964844 min)

--------- Vehicle ---------
La mejor accuracy se obtuvo con el siguiente Tree:
    Best params -> {'crite

In [31]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreBaggingSVM = []
for i in data:
    init = time.time()
    optimalSVMBag.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente SVMBag:")
    print(f'    Best params -> {optimalSVMBag.best_params_}')
    print(f'    Best score -> {optimalSVMBag.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalSVMBag.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalSVMBag.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalSVMBag.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalSVMBag.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreBaggingSVM.append(optimalSVMBag.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreBaggingSVM)
SVMBagDF = pd.DataFrame (my_dict)
print(SVMBagDF.to_latex())


--------- Ionosphere ---------
La mejor accuracy se obtuvo con el siguiente SVMBag:
    Best params -> {'bootstrap': False, 'estimator__C': 1000, 'estimator__gamma': 1, 'estimator__kernel': 'rbf', 'max_features': 0.75, 'n_estimators': 10}
    Best score -> 0.9464387464387466
Si usamos el dataset de test, obtenemos el siguiente resultado:
    score = 0.9204545454545454
Tiempo medio en ejecutarse el método: 0.5599615052342415 +- 0.15056221420694077s
Tiempo en ejecutarse la búsqueda 37.03996706008911s, (0.6173327843348185 min)


KeyboardInterrupt: 

In [None]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreBaggingTree = []
for i in data:
    init = time.time()
    optimalTreeBag.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente TreeBag:")
    print(f'    Best params -> {optimalTreeBag.best_params_}')
    print(f'    Best score -> {optimalTreeBag.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalTreeBag.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalTreeBag.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalTreeBag.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalTreeBag.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreBaggingTree.append(optimalTreeBag.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreBaggingTree)
TreeBagDF = pd.DataFrame (my_dict)
print(TreeBagDF.to_latex())

In [None]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreBoosting1_SVM = []
for i in data:
    init = time.time()
    optimalSVMBoost1.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente SVMBoost1:")
    print(f'    Best params -> {optimalSVMBoost1.best_params_}')
    print(f'    Best score -> {optimalSVMBoost1.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalSVMBoost1.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalSVMBoost1.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalSVMBoost1.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalSVMBoost1.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreBoosting1_SVM.append(optimalSVMBoost1.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreBoosting1_SVM)
SVMBoost1DF = pd.DataFrame (my_dict)
print(SVMBoost1DF.to_latex())

In [None]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreBoosting1_Tree = []
for i in data:
    init = time.time()
    optimalTreeBoost1.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente TreeBoost1:")
    print(f'    Best params -> {optimalTreeBoost1.best_params_}')
    print(f'    Best score -> {optimalTreeBoost1.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalTreeBoost1.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalTreeBoost1.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalTreeBoost1.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalTreeBoost1.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreBoosting1_Tree.append(optimalTreeBoost1.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreBoosting1_Tree)
TreeBoost1DF = pd.DataFrame (my_dict)
print(TreeBoost1DF.to_latex())

In [None]:
"""
Datasets = []
TimeSearch = []
TimeMethod = []
Score = []
for i in data:
    init = time.time()
    optimalSVMBoost2.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente SVMBoost2:")
    print(f'    Best params -> {optimalSVMBoost2.best_params_}')
    print(f'    Best score -> {optimalSVMBoost2.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalSVMBoost2.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalSVMBoost2.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalSVMBoost2.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalSVMBoost2.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    Score.append(optimalSVMBoost2.score(i[3], i[4]))

d7 = dict(SVM_Boost2=Score)
my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=Score)
SVMBoost2DF = pd.DataFrame (my_dict)
print(SVMBoost2DF.to_latex())
"""

In [None]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreBoosting2_Tree = []
for i in data:
    init = time.time()
    optimalTreeBoost2.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente TreeBoost2:")
    print(f'    Best params -> {optimalTreeBoost2.best_params_}')
    print(f'    Best score -> {optimalTreeBoost2.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalTreeBoost2.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalTreeBoost2.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalTreeBoost2.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalTreeBoost2.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreBoosting2_Tree.append(optimalTreeBoost2.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreBoosting2_Tree)
TreeBoost2DF = pd.DataFrame (my_dict)
print(TreeBoost2DF.to_latex())

In [None]:
Datasets = []
TimeSearch = []
TimeMethod = []
ScoreGradBoost = []
for i in data:
    init = time.time()
    optimalGradBoost.fit(i[1], i[2])
    end = time.time()
    timeSearch = end - init
    print(f"\n--------- {i[0]} ---------")
    print(f"La mejor accuracy se obtuvo con el siguiente GradBoost:")
    print(f'    Best params -> {optimalGradBoost.best_params_}')
    print(f'    Best score -> {optimalGradBoost.best_score_}')

    print(f"Si usamos el dataset de test, obtenemos el siguiente resultado:")
    print(f"    score = {optimalGradBoost.score(i[3], i[4])}")
    print(f"Tiempo medio en ejecutarse el método: {np.mean(optimalGradBoost.cv_results_.get('mean_fit_time'))} +- {np.mean(optimalGradBoost.cv_results_.get('std_fit_time'))}s")
    print(f"Tiempo en ejecutarse la búsqueda {timeSearch}s, ({timeSearch/60} min)")
    Datasets.append(i[0])
    TimeMethod.append(np.mean(optimalGradBoost.cv_results_.get('mean_fit_time')))
    TimeSearch.append(timeSearch)
    ScoreGradBoost.append(optimalGradBoost.score(i[3], i[4]))

my_dict = dict(Dataset=Datasets,TimeMethod=TimeMethod,TimeSearch=TimeSearch, Score=ScoreGradBoost)
GradBoostDF = pd.DataFrame (my_dict)
print(GradBoostDF.to_latex())

In [None]:
compareDF = pd.DataFrame()
compareDF['SVM'] = ScoreSVM
compareDF['Tree'] = ScoreTree
compareDF['BaggingSVM'] = ScoreBaggingSVM
compareDF['BaggingTree'] = ScoreBaggingTree
compareDF['Boosting1_SVM'] = ScoreBoosting1_SVM
compareDF['Boosting1_Tree'] = ScoreBoosting1_Tree
compareDF['Boosting2_Tree'] = ScoreBoosting2_Tree
compareDF['GradBoost'] = ScoreGradBoost
compareDF.to_csv("performanceGridSearch.csv", index=False)
print(compareDF)
print(compareDF.to_latex())

# Iman Davenport test

In [None]:
import numpy as np
from scipy.stats import chi2

def compute_imon_davenport_statistic(performance_matrix):
  # Compute the ranks of the model performance on each dataset
  ranks = np.apply_along_axis(lambda x: len(x) - np.argsort(np.argsort(x)), 1, performance_matrix)

  # Sum the ranks for each model across all datasets
  rank_sums = np.sum(ranks, axis=1)

  # Compute the Imon Davenport statistic
  imon_davenport_statistic = (np.max(rank_sums) - np.min(rank_sums)) / performance_matrix.shape[1]

  return imon_davenport_statistic

def compute_p_value(imon_davenport_statistic, num_models, num_datasets):
  # Compute the degrees of freedom for the Imon Davenport test
  df = num_models - 1

  # Compute the p-value using the chi-squared distribution
  p_value = 1 - chi2.cdf(imon_davenport_statistic, df)

  return p_value

def imon_davenport_test(performance_matrix, significance_level):
  # Compute the Imon Davenport statistic and p-value
  imon_davenport_statistic = compute_imon_davenport_statistic(performance_matrix)
  p_value = compute_p_value(imon_davenport_statistic, performance_matrix.shape[0], performance_matrix.shape[1])

  # Determine whether the difference in performance between the models is statistically significant
  if p_value < significance_level:
    print(f"The difference in performance between the models is statistically significant (p = {p_value:.3f})")
  else:
    print(f"The difference in performance between the models is not statistically significant (p = {p_value:.3f})")
  return p_value

In [None]:
performance_matrix = pd.read_csv("performance.csv")
# Run the Imon Davenport test
p_value = imon_davenport_test(performance_matrix, 0.05)

# Wilcoxon Test

In [None]:
# If there are significant differences between the models, then apply the wilcoxon test to determine which models are significantly different
if p_value < 0.05:
  # Compute the pairwise differences between the models
  pairwise_differences = np.apply_along_axis(lambda x: x - x[:, None], 1, performance_matrix)

  # Compute the p-values for the pairwise differences using the Wilcoxon signed-rank test
  p_values = np.apply_along_axis(lambda x: wilcoxon(x, zero_method="wilcox")[1], 1, pairwise_differences)

  # Compute the Bonferroni correction
  bonferroni_correction = 0.05 / (p_values.shape[0] * (p_values.shape[0] - 1) / 2)

  # Determine which models are significantly different
  significant_differences = np.argwhere(p_values < bonferroni_correction)

  # Print the significant differences
  for difference in significant_differences:
    print(f"Model {difference[0]} is significantly different from model {difference[1]}")