### Experimento 5: Otimização de Hiperparâmetros com Bayes Search

Repetição do experimento 4 na tentativa de obter resultados diferentes (ou melhores), usando a Otimização Bayesiana.

In [None]:
import pandas as pd
import numpy as np

#constructors
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#metrics
from sklearn.metrics import mean_absolute_error

#models
from sklearn.linear_model import LinearRegression, Ridge

#feature scalers
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#Bayesian Optimization algorithms
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [None]:
# Read the pre-split data from disk: feature matrices are stored as parquet,
# targets as CSV. Reads happen in the order train/val features, then train/val targets.
X_train, X_val = (
    pd.read_parquet("../Datasets/DF_split/X_train.parquet"),
    pd.read_parquet("../Datasets/DF_split/X_val.parquet"),
)
y_train, y_val = (
    pd.read_csv("../Datasets/DF_split/y_train.csv"),
    pd.read_csv("../Datasets/DF_split/y_val.csv"),
)

In [None]:
# Sanity check: print the dimensions of the training features and training targets.
print(*(frame.shape for frame in (X_train, y_train)))

(1988340, 75) (1988340, 1)


In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,cid,atomic_radii_lvl0,atomic_radii_lvl1,atomic_radii_lvl2,atomic_radii_lvl3,van_der_waals_radii_lvl0,van_der_waals_radii_lvl1,van_der_waals_radii_lvl2,van_der_waals_radii_lvl3,covalent_radii_lvl0,...,kappa2,kappa3,Phi,charge,total_dipole_moment,multiplicity,homo,lumo,gap,total_energy
351250,7140779,92375.0,121050.0,180675.0,210750.0,777929.0,931155.0,1599565.0,2069740.0,130441.0,...,6.659049,3.744617,4.875602,1,3.460254,1,-9.706301,-5.317105,4.389196,-21254.899969
232660,2777299,52125.0,61000.0,83350.0,77775.0,385335.0,425730.0,621119.0,629614.0,74504.0,...,2.92364,2.335996,2.120675,0,3.941092,1,-6.353858,-0.35919,5.994668,-19584.413083
2073107,61392638,53525.0,64975.0,89850.0,95350.0,463952.0,515170.0,824197.0,972010.0,79579.0,...,4.38487,5.911496,4.181571,0,2.60365,1,-6.274945,0.609535,6.88448,-17094.781141
2334865,62960945,73550.0,92850.0,130375.0,149800.0,661633.0,759870.0,1253204.0,1554690.0,105897.0,...,6.893866,4.881316,6.591596,0,2.014337,1,-6.555223,0.3347,6.889923,-16297.709359
1736863,57291457,63650.0,74500.0,107075.0,94525.0,411404.0,474365.0,710805.0,671480.0,82471.0,...,2.632554,1.001034,1.613435,0,2.550175,1,-5.390575,-1.357848,4.032727,-25159.230741


In [None]:
# Remove the `cid` column from both feature sets before modelling.
# errors="ignore" keeps this cell idempotent: re-running it after `cid` has
# already been dropped is a no-op instead of raising KeyError.
X_train = X_train.drop(columns="cid", errors="ignore")
X_val = X_val.drop(columns="cid", errors="ignore")

### Bayesian Optimization

In [None]:
!pip install --upgrade scikit-optimize



In [1]:
!pip install numpy==1.20.2

Collecting numpy==1.20.2

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas 2.0.0 requires numpy>=1.20.3; python_version < "3.10", but you have numpy 1.20.2 which is incompatible.



  Downloading numpy-1.20.2-cp38-cp38-win_amd64.whl (13.7 MB)
     --------------------------------------- 13.7/13.7 MB 10.1 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.4
    Uninstalling numpy-1.24.4:
      Successfully uninstalled numpy-1.24.4
Successfully installed numpy-1.20.2


In [None]:
# Search space written as a single dict literal (same keys, same insertion order).
# Tuples give numeric bounds, optionally followed by a prior name; the list
# enumerates the kernel choices.
params = {
    'C': (1e-6, 100.0, 'log-uniform'),
    'gamma': (1e-6, 100.0, 'log-uniform'),
    'degree': (1, 5),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

In [None]:
# Display the search-space dictionary as the cell output.
params

{'C': (1e-06, 100.0, 'log-uniform'),
 'gamma': (1e-06, 100.0, 'log-uniform'),
 'degree': (1, 5),
 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

In [None]:
# Models to tune and their search spaces for Bayesian optimization.
# Keys carry the `model__` prefix because every estimator is tuned as the
# `model` step of a Pipeline (see the loop below).
# NOTE: a LinearRegression entry with an empty search space used to be listed
# here but was disabled; re-add it explicitly if it should be compared.
models = {
    'RidgeRegression': {
        'model': Ridge(),
        'param_space': {
            # (low, high, prior): alpha is sampled log-uniformly in [0.1, 10]
            'model__alpha': (0.1, 10.0, 'log-uniform'),
            'model__solver': ['svd', 'cholesky', 'lsqr'],
            'model__fit_intercept': [True, False]
        }
    }
}

# Scalers to combine with every model
scalers = [StandardScaler(), MinMaxScaler()]

# One summary record (dict) per (model, scaler) combination
results_list = []

for model_name, model_info in models.items():
    model = model_info['model']
    param_space = model_info['param_space']

    for scaler in scalers:
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # No `scoring` is passed, so the pipeline's default scorer is used.
        bayes_search = BayesSearchCV(
            estimator=pipeline,
            search_spaces=param_space,
            n_iter=30,  # Adjust the number of iterations as needed
            cv=3,
            random_state=42  # fixed seed so the sampled candidates are reproducible
        )
        bayes_search.fit(X_train, y_train)

        # Summary of the best candidate found for this combination
        model_results = {
            'Model': model_name,
            'Scaler': type(scaler).__name__,
            'params': bayes_search.best_params_,
            'mean_test_score': bayes_search.best_score_,
            'std_test_score': bayes_search.cv_results_['std_test_score'][bayes_search.best_index_]
        }

        results_list.append(model_results)

# Build the table directly from the list of records: one row per (model, scaler),
# with the best-parameter dict kept intact in the `params` column.
# (Calling pd.DataFrame on each record separately would expand it on the keys of
# the nested `params` dict: one row per hyperparameter, or no rows at all when the
# search space is empty.)
results_df = pd.DataFrame(results_list)
print(results_df)


AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations