# Comparación de técnicas de preprocesamiento de los datos

En este notebook se van a probar distintos métodos de preprocesado de los datos para entrenar todas las alternativas con un mismo modelo (_XGBoost_) y comparar los resultados mediante el accuracy.

In [28]:
import pandas as pd
import numpy as np
# Data partition
from sklearn.model_selection import train_test_split
# Model evaluation
from sklearn.metrics import accuracy_score
# Data preprocessing
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_classif, RFE, SelectFromModel
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

### Datos

In [2]:
# Datos de entrenamiento
trainFNC = pd.read_csv("data/train_FNC.csv")
trainSBM = pd.read_csv("data/train_SBM.csv")
train_labels = pd.read_csv("data/train_labels.csv")

# DataFrame con ambas fuentes de datos (FNC y SBM)
train = pd.merge(left=trainFNC, right=trainSBM, left_on='Id', right_on='Id')
data = pd.merge(left=train_labels, right=train, left_on='Id', right_on='Id')
data.drop("Id", inplace=True, axis=1)

# Shuffle de los datos de train
data = data.sample(frac=1, random_state=0)
data.head(5)

Unnamed: 0,Class,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
2,0,0.24585,0.21662,-0.12468,-0.3538,0.1615,-0.002032,-0.13302,-0.035222,0.25904,...,-0.257114,0.597229,1.220756,-0.059213,-0.435494,-0.092971,1.09091,-0.448562,-0.508497,0.350434
13,1,0.41073,-0.031925,0.2107,0.24226,0.3201,-0.41929,-0.18714,0.16845,0.59979,...,-0.050862,0.870602,0.609465,1.181878,-2.279469,-0.013484,-0.012693,-1.244346,-1.080442,-0.788502
53,1,0.070919,0.034179,-0.011755,0.019158,0.024645,-0.032022,0.00462,0.31817,0.21255,...,-1.539922,-1.495822,1.643866,1.68778,1.521086,-1.988432,-0.267471,0.510576,1.104566,-1.067206
41,0,0.087377,-0.052462,-0.007835,-0.11283,0.38938,0.21608,0.063572,-0.25123,-0.080568,...,-0.077353,-0.459463,-0.204328,-0.619508,-1.410523,-0.304622,-1.521928,0.593691,0.073638,-0.26092
74,0,0.20275,0.19142,-0.056662,-0.15778,0.24404,0.03978,-0.001503,0.001056,-0.048222,...,0.044457,0.593326,1.063052,0.434726,1.604964,-0.359736,0.210107,0.355922,0.730287,-0.323557


Vamos a usar la siguiente partición de los datos:

* 60% train $\sim$ 50 datos
* 20% validation $\sim$ 18 datos (se define al aplicar cross-validación en el ajuste)
* 20% test $\sim$ 18 datos

In [3]:
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Tamaño del dataset de train:", X_train.shape)
print("Tamaño del dataset de test:", X_test.shape)

Tamaño del dataset de train: (68, 410)
Tamaño del dataset de test: (18, 410)


### Modelo

In [4]:
import warnings
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings('ignore')

model_XGB = XGBClassifier(eval_metric="logloss")
param_grid_XGB = {
    "booster": ["gbtree", "gblinear", "dart"],
    "learning_rate": [0.001, 0.05, 0.1, 0.5]
}
grid_search_XGB = GridSearchCV(estimator=model_XGB, param_grid=param_grid_XGB, cv=4)
# cv = 4 porque así: el conjunto de validation tiene un 0.25 del tamaño de train y: 0.25 * 0.8 = 0.2
#                    el conjunto de train tiene un 0.75 del tamaño de train y: 0.75 * 0.8 = 0.6

  from pandas import MultiIndex, Int64Index


In [5]:
# Función para realizar el entrenamiento y el ajuste de parámetros
def train_model(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    grid_search_XGB.fit(X_train, y_train)
    model_XGB_opt = grid_search_XGB.best_estimator_
    
    # Predicción en partición de test
    y_pred_XGB = model_XGB_opt.predict(X_test)
    
    # Precisión en partición de test
    accuracy = accuracy_score(y_test, y_pred_XGB)
    
    return accuracy, grid_search_XGB

In [9]:
def print_results(accuracy, param_grid):
    print("Parámetros óptimos:", grid.best_params_)
    print("Modelo óptimo:", grid.best_estimator_)
    print("Accuracy: {:0.2f}%".format(accuracy * 100))

# Sin preprocesado

In [10]:
accuracy, grid = train_model()
print_results(accuracy, grid)

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%


# Escalado de los datos

In [11]:
preprocess = StandardScaler()

X_train_processed = preprocess.fit_transform(X_train)
X_test_processed = preprocess.fit_transform(X_test)

accuracy, grid = train_model(X_train_processed, X_test_processed)
print_results(accuracy, grid)

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%


# Normalize

Datos con norma unitaria.

In [12]:
norm_types = ["l1", "l2", "max"]
acc = 0

for norm in norm_types:
    X_train_processed = normalize(X_train, norm)
    X_test_processed = normalize(X_test, norm)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_norm = norm
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido usando la norma {}.".format(best_norm))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%
Resultado obtenido usando la norma l1.


# PCA

In [13]:
# NOTA: la dimensionalidad del espacio reducido del método PCA (paquete sklearn) en python tiene como tamaño máximo el mínimo
# entre el número de variables y el número de muestras. Como en este caso nuestro conjunto de datos es pequeño, el número 
# máximo de componentes será precisamente el número de muestras en el conjunto de test.
n_components_range = list(range(2, X_test.shape[0], 2))
acc = 0

for n_components in n_components_range:
    preprocess = PCA(n_components=n_components)

    X_train_processed = preprocess.fit_transform(X_train)
    X_test_processed = preprocess.fit_transform(X_test)

    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_n_components = n_components
        grid = grid
        acc = accuracy
        
print_results(acc, grid)
print("Resultado obtenido usando {:3d} componentes principales.".format(best_n_components))

Parámetros óptimos: {'booster': 'gbtree', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.001, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Accuracy: 66.67%
Resultado obtenido usando  16 componentes principales.


# SelectKBest

Documentación scikit-learn (https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html)

Este método selecciona las k mejores variables en base a la puntuación que estas reciben de acuerdo a una función que toma como parámetro (test estadísticos). 

La función que utiliza por defecto y con la que probaremos inicialmente en el siguiente código el método es ``f_classif``. Esta función calcula el ANOVA F-valor entre etiquetas y características para problemas de clasificación.

_NOTA: ANalysis Of VAriance: método estadístico que permite descubrir si los resultados de una prueba son significativos._

_NOTA: distribución F o de Fisher-Snedecor es una distribución de probabilidad continua, especialmente aplicada en el análisis de la varianza-_

Como hay que indicar un parámetro ``k`` que será el número de variables seleccionadas, probaremos varios valores para ver con cual se obtiene mejores resultados.

In [14]:
k_range = list(range(10, 450, 40))
acc = 0

for k in k_range:
    preprocess = SelectKBest(k=k)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)

    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_k = k
        grid = grid
        acc = accuracy
        
print_results(acc, grid)
print("Resultado obtenido usando las {:3d} mejores componentes.".format(best_k))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%
Resultado obtenido usando las 410 mejores componentes.


A continuación, probamos los resultados del código anterior, utilizando una función de score distinta y apropiada para clasificación.

### mutual_info_classif: 

Mutual Information (MI) es un criterio de estimación de mide la dependencia entre dos variables (en este caso para una variable objetivo discreta). Su valor es no negativo y vale cero las dos variables son totalmente independientes, por tanto, cuanto más alto es su valor, mayor es la dependencia.

Su implementación en sklearn se apoya en métodos no-paramétricos, basados en la estimación de la entropía de las distancias de los k-vecinos más cercanos.

In [15]:
acc = 0

for k in k_range:
    preprocess = SelectKBest(score_func=mutual_info_classif, k=k)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)

    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_k = k
        grid = grid
        acc = accuracy
        
print_results(acc, grid)
print("Resultado obtenido usando las {:3d} mejores componentes.".format(best_k))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%
Resultado obtenido usando las  10 mejores componentes.


El resto de funciones implementadas en sklearn son para regresión o no válidas en nuestro conjunto de datos (chi2 no admite valores negativos).

# SelectPercentile

Documentación scikit-learn (https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html?highlight=select%20percentile#sklearn.feature_selection.SelectPercentile)

Método muy similar al anterior ``SelectKBest`` pero que hace la selección de las variables esta vez de acuerdo a un percentil de las puntuaciones más altas. También utiliza las mismas funciones de score y seguiremos el mismo procedimiento.

### f_classif

In [16]:
percentile_range = list(range(10, 100, 10))
acc = 0

for percent in percentile_range:
    preprocess = SelectPercentile(percentile=percent)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_percentile = percent
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido de acuerdo al percentil {:2d}.".format(best_percentile))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 88.89%
Resultado obtenido de acuerdo al percentil 10.


### mutual_info_classif

In [17]:
acc = 0

for percent in percentile_range:
    preprocess = SelectPercentile(score_func=mutual_info_classif, percentile=percent)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_percentile = percent
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido de acuerdo al percentil {:2d}.".format(best_percentile))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.5}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 66.67%
Resultado obtenido de acuerdo al percentil 60.


# RFE

Documentación scikit-learn (https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html?highlight=rfe#sklearn.feature_selection.RFE)

RFE = Recursive Feature Elimination.

Dado un estimador que asigna pesos a las variables (por ejemplos los coeficientes de un modelo lineal), el método elimina de manera recursiva  variables, considerando cada vez conjuntos más pequeños de acuerdo a estos pesos. Inicialmente se entrena el estimador sobre el conjunto original de variables y se obtiene la importancia de cada variable. Acto seguido se podan las variables menos importantes del conjunto. Este proceso se repite de manera recursiva sobre los conjuntos podados que se van generando hasta que se alcanza el número de variables deseado.

### Estimador SVC (Support Vector Classifier)

In [18]:
num_features_range = list(range(10, 450, 40))
# kernel_types = ["linear", "rbf"]
kernel_types = ["linear"]
# NO FUNCIONA SI USO RBF O POLY
acc = 0

for num_features in num_features_range:
    for kernel in kernel_types:
        estimator = SVC(kernel=kernel)
        preprocess = RFE(estimator, n_features_to_select=num_features)

        X_train_processed = preprocess.fit_transform(X_train, y_train)
        X_test_processed = preprocess.fit_transform(X_test, y_test)
        
        accuracy, grid = train_model(X_train_processed, X_test_processed)
        if accuracy > acc:
            best_num_features = num_features
            best_kernel = kernel
            grid = grid
            acc = accuracy
            
print_results(acc, grid)
print("Resultado obtenido usando las {:3d} componentes más significativas y el kernel ''{}''."
      .format(best_num_features, best_kernel))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.5}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 72.22%
Resultado obtenido usando las  10 componentes más significativas y el kernel ''linear''.


### Estimador LinearSVC

In [19]:
acc = 0

for num_features in num_features_range:
    estimator = LinearSVC()
    preprocess = RFE(estimator, n_features_to_select=num_features)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_num_features = num_features
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido usando las {:3d} componentes más significativas.".format(best_num_features))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.5}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%
Resultado obtenido usando las 370 componentes más significativas.


### Estimador Random Forest

In [22]:
acc = 0

for num_features in num_features_range:
    estimator = RandomForestClassifier(random_state=0)
    preprocess = RFE(estimator, n_features_to_select=num_features)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_num_features = num_features
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido usando las {:3d} componentes más significativas.".format(best_num_features))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%
Resultado obtenido usando las 410 componentes más significativas.


# SelectFromModel

Método muy similar al anterior, se apoya en el uso de otros estimadores para asignar pesos a las variables del conjunto para eliminar aquellas que considera menos significativas.

### Estimador SVC

In [30]:
acc = 0

for num_features in num_features_range:
    estimator = SVC(kernel="linear")
    preprocess = SelectFromModel(estimator, threshold=-np.inf, max_features=num_features)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_num_features = num_features
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido usando las {:3d} componentes más significativas.".format(best_num_features))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.5}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 72.22%
Resultado obtenido usando las 410 componentes más significativas.


### Estimador LinearSVC

In [29]:
acc = 0

for num_features in num_features_range:
    estimator = LinearSVC()
    preprocess = SelectFromModel(estimator, threshold=-np.inf, max_features=num_features)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_num_features = num_features
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido usando las {:3d} componentes más significativas.".format(best_num_features))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.5}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.5, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%
Resultado obtenido usando las  90 componentes más significativas.


### Estimador RandomForest

In [31]:
acc = 0

for num_features in num_features_range:
    estimator = RandomForestClassifier(random_state=0)
    preprocess = SelectFromModel(estimator, threshold=-np.inf, max_features=num_features)

    X_train_processed = preprocess.fit_transform(X_train, y_train)
    X_test_processed = preprocess.fit_transform(X_test, y_test)
    
    accuracy, grid = train_model(X_train_processed, X_test_processed)
    if accuracy > acc:
        best_num_features = num_features
        grid = grid
        acc = accuracy

print_results(acc, grid)
print("Resultado obtenido usando las {:3d} componentes más significativas.".format(best_num_features))

Parámetros óptimos: {'booster': 'gblinear', 'learning_rate': 0.001}
Modelo óptimo: XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='logloss', gamma=None,
              gpu_id=-1, importance_type=None, interaction_constraints=None,
              learning_rate=0.001, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=4, num_parallel_tree=None,
              predictor=None, random_state=0, reg_alpha=0, reg_lambda=0,
              scale_pos_weight=1, subsample=None, tree_method=None,
              validate_parameters=1, verbosity=None)
Accuracy: 83.33%
Resultado obtenido usando las 410 componentes más significativas.


# Resultados (accuracy):

* SelectPercentile (con función de clasificación por defecto "f_classif") y percentil 10: 88.89%
<br><br />
* Normalizar (escalado norma unitaria), usando la norma "L1": 83.33%
* SelectKBest (con función de clasificación por defecto "f_classif") y 410 componentes: 83.33%
* SelectKBest (con función de clasificación "mutual_info_classif") y 10 componentes: 83.33%
* Sin preprocesado: 83.33%
* StandardScaler (escalado media=0, std=1): 83.33%
* RFE con el estimador LinearSVC y 370 componentes más significativas: 83.33%
* RFE con el estimador RandomForestClassifier y 410 componentes más significativas: 83.33%
* SelectFromModel con el estimador LinearSVC y 90 componentes más significativas: 83.33%
* SelectFromModel con el estimador RandomForest y 410 componentes más significativas: 83.33%
<br><br />
* SelectFromModel con el estimador SVC y 410 componentes más significativas: 72.22%
* RFE con el estimador SVC (kernel lineal) y 10 componentes más significativas: 72.22%.
<br><br />
* SelectPercentile (con función de clasificación "mutual_info_classif") y percentil 60: 66.67%
* PCA (con 16 componentes principales): 66.67%