### Experimento 4: otimização de hiperparâmetros

Grid Search para otimização de hiperparâmetros, com o dataset escalado (ou não, dependendo do que der melhor resultado).

In [1]:
import pandas as pd

#constructors
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#metrics
from sklearn.metrics import mean_absolute_error

#models
from sklearn.linear_model import LinearRegression, Ridge

#scalers
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the pre-split data: features are stored as parquet, targets as CSV.
X_train, X_val = (
    pd.read_parquet(path)
    for path in ("../Datasets/DF_split/X_train.parquet", "../Datasets/DF_split/X_val.parquet")
)
y_train, y_val = (
    pd.read_csv(path)
    for path in ("../Datasets/DF_split/y_train.csv", "../Datasets/DF_split/y_val.csv")
)

In [3]:
# Print the (rows, columns) shape of the training features and of the training
# targets, so the two row counts can be compared at a glance.
print(X_train.shape, y_train.shape)

(1988340, 75) (1988340, 1)


In [4]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,cid,atomic_radii_lvl0,atomic_radii_lvl1,atomic_radii_lvl2,atomic_radii_lvl3,van_der_waals_radii_lvl0,van_der_waals_radii_lvl1,van_der_waals_radii_lvl2,van_der_waals_radii_lvl3,covalent_radii_lvl0,...,kappa2,kappa3,Phi,charge,total_dipole_moment,multiplicity,homo,lumo,gap,total_energy
351250,7140779,92375.0,121050.0,180675.0,210750.0,777929.0,931155.0,1599565.0,2069740.0,130441.0,...,6.659049,3.744617,4.875602,1,3.460254,1,-9.706301,-5.317105,4.389196,-21254.899969
232660,2777299,52125.0,61000.0,83350.0,77775.0,385335.0,425730.0,621119.0,629614.0,74504.0,...,2.92364,2.335996,2.120675,0,3.941092,1,-6.353858,-0.35919,5.994668,-19584.413083
2073107,61392638,53525.0,64975.0,89850.0,95350.0,463952.0,515170.0,824197.0,972010.0,79579.0,...,4.38487,5.911496,4.181571,0,2.60365,1,-6.274945,0.609535,6.88448,-17094.781141
2334865,62960945,73550.0,92850.0,130375.0,149800.0,661633.0,759870.0,1253204.0,1554690.0,105897.0,...,6.893866,4.881316,6.591596,0,2.014337,1,-6.555223,0.3347,6.889923,-16297.709359
1736863,57291457,63650.0,74500.0,107075.0,94525.0,411404.0,474365.0,710805.0,671480.0,82471.0,...,2.632554,1.001034,1.613435,0,2.550175,1,-5.390575,-1.357848,4.032727,-25159.230741


In [5]:
# Remove the 'cid' column from both feature frames before modelling
# (presumably a compound identifier rather than a predictive feature — confirm).
# errors='ignore' keeps this cell idempotent: because the frames are reassigned in
# place, re-running the cell would otherwise raise KeyError once 'cid' is gone.
X_train = X_train.drop(columns='cid', errors='ignore')
X_val = X_val.drop(columns='cid', errors='ignore')

### Grid Search

Parte 1: 2 melhores scalers + modelos

In [50]:
# Regressors under test, each paired with the hyper-parameter grid to explore.
# Grid keys carry the 'model__' prefix because they address the 'model' step of the pipeline.
models = {
    'LinearRegression': {'model': LinearRegression(), 'param_grid': {}},
    'RidgeRegression': {
        'model': Ridge(),
        'param_grid': {
            'model__solver': ['svd', 'cholesky', 'lsqr'],
            'model__alpha': [1e-2, 1e-1, 1, 10, 100],
            'model__fit_intercept': [True, False],
        },
    },
}

# Feature scalers compared in this part
scalers = [StandardScaler(), MinMaxScaler()]

# One entry per (model, scaler) pair, holding the per-candidate CV scores
results_list = []

for model_name, model_info in models.items():
    for scaler in scalers:
        pipeline = Pipeline(steps=[('scaler', scaler), ('model', model_info['model'])])
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=model_info['param_grid'],
            cv=3,
            scoring='neg_mean_absolute_error',
        )
        grid_search.fit(X_train, y_train)

        # The scalar labels are broadcast against the per-candidate arrays when
        # this dictionary is turned into a DataFrame below.
        cv_results = grid_search.cv_results_
        results_list.append({
            'Model': model_name,
            'Scaler': type(scaler).__name__,
            'params': cv_results['params'],
            'mean_test_score': cv_results['mean_test_score'],
            'std_test_score': cv_results['std_test_score'],
        })

# Stack the per-pair tables into one frame with a fresh 0..n-1 index
results_df = pd.concat([pd.DataFrame(entry) for entry in results_list], ignore_index=True)
print(results_df)


               Model          Scaler   
0   LinearRegression  StandardScaler  \
1   LinearRegression    MinMaxScaler   
2    RidgeRegression  StandardScaler   
3    RidgeRegression  StandardScaler   
4    RidgeRegression  StandardScaler   
..               ...             ...   
57   RidgeRegression    MinMaxScaler   
58   RidgeRegression    MinMaxScaler   
59   RidgeRegression    MinMaxScaler   
60   RidgeRegression    MinMaxScaler   
61   RidgeRegression    MinMaxScaler   

                                               params  mean_test_score   
0                                                  {}        -0.228844  \
1                                                  {}        -0.228851   
2   {'model__alpha': 0.01, 'model__fit_intercept':...        -0.228872   
3   {'model__alpha': 0.01, 'model__fit_intercept':...        -0.228872   
4   {'model__alpha': 0.01, 'model__fit_intercept':...        -0.232684   
..                                                ...              ...   
5

In [51]:
# Display the Part 1 results table (one row per model / scaler / parameter candidate).
results_df

Unnamed: 0,Model,Scaler,params,mean_test_score,std_test_score
0,LinearRegression,StandardScaler,{},-0.228844,0.000395
1,LinearRegression,MinMaxScaler,{},-0.228851,0.000405
2,RidgeRegression,StandardScaler,"{'model__alpha': 0.01, 'model__fit_intercept':...",-0.228872,0.000407
3,RidgeRegression,StandardScaler,"{'model__alpha': 0.01, 'model__fit_intercept':...",-0.228872,0.000407
4,RidgeRegression,StandardScaler,"{'model__alpha': 0.01, 'model__fit_intercept':...",-0.232684,0.000359
...,...,...,...,...,...
57,RidgeRegression,MinMaxScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.249330,0.000036
58,RidgeRegression,MinMaxScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.249336,0.000044
59,RidgeRegression,MinMaxScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.250830,0.000072
60,RidgeRegression,MinMaxScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.250830,0.000072


In [52]:
# Best cross-validated score: the scorer is the negated MAE, so the maximum
# corresponds to the lowest mean absolute error.
results_df['mean_test_score'].max()

-0.22884398704170195

In [53]:
# Row(s) whose mean CV score equals the highest score in the table
results_df.loc[results_df['mean_test_score'].eq(results_df['mean_test_score'].max())]

Unnamed: 0,Model,Scaler,params,mean_test_score,std_test_score
0,LinearRegression,StandardScaler,{},-0.228844,0.000395


In [54]:
# Persist the Part 1 results table; index=False leaves the row index out of the CSV.
results_df.to_csv('../Results/Exp4_results_part1.csv', index=False)

Parte 2: todos os scalers + modelos

In [59]:
# Additional preprocessing transformers compared in this part
from sklearn.preprocessing import Normalizer, RobustScaler

# Regressors under test, each paired with the hyper-parameter grid to explore.
# Grid keys carry the 'model__' prefix because they address the 'model' step of the pipeline.
models = {
    'LinearRegression': {'model': LinearRegression(), 'param_grid': {}},
    'RidgeRegression': {
        'model': Ridge(),
        'param_grid': {
            'model__solver': ['svd', 'cholesky', 'lsqr'],
            'model__alpha': [1e-2, 1e-1, 1, 10, 100],
            'model__fit_intercept': [True, False],
        },
    },
}

# Every preprocessing option is tried with every model
scalers = [StandardScaler(), MinMaxScaler(), Normalizer(), RobustScaler()]

# One entry per (model, scaler) pair, holding the per-candidate CV scores
results_list = []

for model_name, model_info in models.items():
    for scaler in scalers:
        pipeline = Pipeline(steps=[('scaler', scaler), ('model', model_info['model'])])
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=model_info['param_grid'],
            cv=3,
            scoring='neg_mean_absolute_error',
        )
        grid_search.fit(X_train, y_train)

        # The scalar labels are broadcast against the per-candidate arrays when
        # this dictionary is turned into a DataFrame below.
        cv_results = grid_search.cv_results_
        results_list.append({
            'Model': model_name,
            'Scaler': type(scaler).__name__,
            'params': cv_results['params'],
            'mean_test_score': cv_results['mean_test_score'],
            'std_test_score': cv_results['std_test_score'],
        })

# Stack the per-pair tables into one frame with a fresh 0..n-1 index
results_df = pd.concat([pd.DataFrame(entry) for entry in results_list], ignore_index=True)
print(results_df)


                Model          Scaler   
0    LinearRegression  StandardScaler  \
1    LinearRegression    MinMaxScaler   
2    LinearRegression      Normalizer   
3    LinearRegression    RobustScaler   
4     RidgeRegression  StandardScaler   
..                ...             ...   
119   RidgeRegression    RobustScaler   
120   RidgeRegression    RobustScaler   
121   RidgeRegression    RobustScaler   
122   RidgeRegression    RobustScaler   
123   RidgeRegression    RobustScaler   

                                                params  mean_test_score   
0                                                   {}        -0.228844  \
1                                                   {}        -0.228851   
2                                                   {}      -370.772086   
3                                                   {}        -0.228871   
4    {'model__alpha': 0.01, 'model__fit_intercept':...        -0.228872   
..                                                 ...   

In [60]:
# Display the Part 2 results table (one row per model / scaler / parameter candidate).
results_df

Unnamed: 0,Model,Scaler,params,mean_test_score,std_test_score
0,LinearRegression,StandardScaler,{},-0.228844,0.000395
1,LinearRegression,MinMaxScaler,{},-0.228851,0.000405
2,LinearRegression,Normalizer,{},-370.772086,290.904529
3,LinearRegression,RobustScaler,{},-0.228871,0.000407
4,RidgeRegression,StandardScaler,"{'model__alpha': 0.01, 'model__fit_intercept':...",-0.228872,0.000407
...,...,...,...,...,...
119,RidgeRegression,RobustScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.233724,0.000359
120,RidgeRegression,RobustScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.255998,0.002870
121,RidgeRegression,RobustScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.233934,0.000356
122,RidgeRegression,RobustScaler,"{'model__alpha': 100, 'model__fit_intercept': ...",-0.233934,0.000356


In [61]:
# Best cross-validated score: the scorer is the negated MAE, so the maximum
# corresponds to the lowest mean absolute error.
results_df['mean_test_score'].max()

-0.22884398704170195

In [62]:
# Row(s) whose mean CV score equals the highest score in the table
results_df.loc[results_df['mean_test_score'].eq(results_df['mean_test_score'].max())]

Unnamed: 0,Model,Scaler,params,mean_test_score,std_test_score
0,LinearRegression,StandardScaler,{},-0.228844,0.000395


In [63]:
# Persist the Part 2 results table; index=False leaves the row index out of the CSV.
results_df.to_csv('../Results/Exp4_results_part2.csv', index=False)

Parte 3: 2 melhores scalers + remoção de outliers + modelos

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.utils import check_X_y


class OutlierFilteredRegressor(RegressorMixin, BaseEstimator):
    """Regressor that is trained only on the samples an outlier detector keeps.

    An outlier detector cannot be used as an intermediate ``Pipeline`` step:
    intermediate steps must implement ``transform`` (the detectors do not), and a
    pipeline step cannot change the number of rows. This wrapper runs the detector
    inside ``fit`` instead, so rows are removed from the training data only;
    ``predict`` scores every row it receives.

    Parameters
    ----------
    detector : estimator
        Outlier detector whose ``fit_predict`` returns 1 for inliers and -1 for
        outliers (e.g. ``IsolationForest``, ``EllipticEnvelope``).
    regressor : estimator
        Regressor fitted on the inlier rows.

    Attributes
    ----------
    regressor_ : estimator
        Fitted clone of ``regressor``.
    """

    def __init__(self, detector, regressor):
        self.detector = detector
        self.regressor = regressor

    def fit(self, X, y):
        """Flag outliers in ``X``, then fit the regressor on the remaining rows.

        Raises
        ------
        ValueError
            If the detector rejects every training sample.
        """
        # Convert to arrays so the boolean mask indexes X and y the same way,
        # whether y arrives as a DataFrame, a Series or an ndarray.
        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
        # Clones keep the constructor arguments untouched, as scikit-learn expects.
        inlier_mask = clone(self.detector).fit_predict(X) == 1
        if not inlier_mask.any():
            raise ValueError("The outlier detector rejected every training sample.")
        self.regressor_ = clone(self.regressor).fit(X[inlier_mask], y[inlier_mask])
        return self

    def predict(self, X):
        """Predict with the regressor fitted on the inlier rows."""
        return self.regressor_.predict(X)


# Fixed seed so the stochastic outlier detectors give repeatable results
RANDOM_STATE = 42

# Regressors are evaluated with their default hyper-parameters: cross_val_score
# performs no parameter search, and LinearRegression has no 'alpha' to tune.
# To tune Ridge here, wrap the pipeline in GridSearchCV and address its parameters
# as 'model__regressor__<name>'.
models = {
    'LinearRegression': LinearRegression(),
    'RidgeRegression': Ridge()
}

# Define outlier detection models
outlier_detection_models = {
    'IsolationForest': IsolationForest(random_state=RANDOM_STATE),
    'EllipticEnvelope': EllipticEnvelope(random_state=RANDOM_STATE)
}

# Define scalers
scalers = [StandardScaler(), MinMaxScaler()]

# One record per (model, scaler, outlier detector) combination
results_list = []

for model_name, model in models.items():
    for scaler in scalers:
        for outlier_model_name, outlier_model in outlier_detection_models.items():
            # Scale first, then detect outliers on the scaled features and fit
            # the regressor on the inliers.
            pipeline = Pipeline([
                ('scaler', scaler),
                ('model', OutlierFilteredRegressor(detector=outlier_model, regressor=model))
            ])

            # Perform cross-validation with 'neg_mean_absolute_error'
            scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='neg_mean_absolute_error')

            # Keep the scorer's sign (negated MAE, higher is better) so that
            # selecting the maximum picks the best configuration, exactly as
            # in Parts 1 and 2.
            results_list.append({
                'Model': f'{outlier_model_name}_{model_name}',
                'Scaler': type(scaler).__name__,
                'mean_test_score': scores.mean(),
                'std_test_score': scores.std()
            })

# Each record holds only scalars, so build the frame from the list of records;
# pd.DataFrame() on a single all-scalar dict would raise ValueError.
results_df = pd.DataFrame(results_list)
print(results_df)


In [None]:
# Display the Part 3 results table.
results_df

In [None]:
# Largest mean_test_score across all Part 3 configurations.
# NOTE(review): whether the maximum is the best or the worst configuration depends
# on the sign convention used when results_df was built — confirm before reading it.
results_df['mean_test_score'].max()

In [None]:
# Row(s) holding the maximum mean_test_score
results_df.loc[results_df['mean_test_score'].eq(results_df['mean_test_score'].max())]

In [None]:
# Persist the Part 3 results table; index=False leaves the row index out of the CSV.
results_df.to_csv('../Results/Exp4_results_part3.csv', index=False)