### Experimento 3: métodos de scaling

Rodar uma regressão no dataset após ter passado pelos seguintes métodos de `scaling`:

- Normalization
- Standard Scaling
- MinMax Scaling
- Robust Scaling

Iremos testar tanto com a `Regressão Ridge` quanto com a `Regressão Linear`, que tiveram resultados próximos recentemente.

In [2]:
import pandas as pd

#constructors
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#metrics
from sklearn.metrics import mean_absolute_error

#models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

#scalers
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the pre-split train/validation features (parquet) and targets (CSV)
X_train, X_val = (
    pd.read_parquet(path)
    for path in (
        "../Datasets/DF_split/X_train.parquet",
        "../Datasets/DF_split/X_val.parquet",
    )
)
y_train, y_val = (
    pd.read_csv(path)
    for path in (
        "../Datasets/DF_split/y_train.csv",
        "../Datasets/DF_split/y_val.csv",
    )
)

In [4]:
# Sanity check: dimensions of the training features and of the training target
print(*(frame.shape for frame in (X_train, y_train)))

(1988340, 75) (1988340, 1)


In [5]:
# Preview the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,cid,atomic_radii_lvl0,atomic_radii_lvl1,atomic_radii_lvl2,atomic_radii_lvl3,van_der_waals_radii_lvl0,van_der_waals_radii_lvl1,van_der_waals_radii_lvl2,van_der_waals_radii_lvl3,covalent_radii_lvl0,...,kappa2,kappa3,Phi,charge,total_dipole_moment,multiplicity,homo,lumo,gap,total_energy
351250,7140779,92375.0,121050.0,180675.0,210750.0,777929.0,931155.0,1599565.0,2069740.0,130441.0,...,6.659049,3.744617,4.875602,1,3.460254,1,-9.706301,-5.317105,4.389196,-21254.899969
232660,2777299,52125.0,61000.0,83350.0,77775.0,385335.0,425730.0,621119.0,629614.0,74504.0,...,2.92364,2.335996,2.120675,0,3.941092,1,-6.353858,-0.35919,5.994668,-19584.413083
2073107,61392638,53525.0,64975.0,89850.0,95350.0,463952.0,515170.0,824197.0,972010.0,79579.0,...,4.38487,5.911496,4.181571,0,2.60365,1,-6.274945,0.609535,6.88448,-17094.781141
2334865,62960945,73550.0,92850.0,130375.0,149800.0,661633.0,759870.0,1253204.0,1554690.0,105897.0,...,6.893866,4.881316,6.591596,0,2.014337,1,-6.555223,0.3347,6.889923,-16297.709359
1736863,57291457,63650.0,74500.0,107075.0,94525.0,411404.0,474365.0,710805.0,671480.0,82471.0,...,2.632554,1.001034,1.613435,0,2.550175,1,-5.390575,-1.357848,4.032727,-25159.230741


In [6]:
# Exclude `cid` from the model inputs.
# NOTE(review): `cid` looks like a record identifier rather than a feature — confirm.
# errors='ignore' keeps this cell idempotent: re-running it after the column has
# already been removed no longer raises KeyError.
X_train = X_train.drop(columns='cid', errors='ignore')
X_val = X_val.drop(columns='cid', errors='ignore')

In [7]:
# scalers
# One instance per scaling strategy. The list below reuses these same objects
# instead of constructing a second, duplicate set of estimators.
# Note: unlike the other three, Normalizer rescales each sample (row) to unit
# norm rather than rescaling each feature (column).
norm = Normalizer()
standard = StandardScaler()
minmax = MinMaxScaler()
robust = RobustScaler()

# (label, scaler) pairs; the label is what gets printed next to each MAE
scalers_list = [('norm', norm), ('standard', standard), ('minmax', minmax), ('robust', robust)]

Linear Regression

In [11]:
for scaler_name, scaler_obj in scalers_list:
    # Fit the scaler on the training split only, then apply it to validation
    X_train_scaled = scaler_obj.fit_transform(X_train)
    X_val_scaled = scaler_obj.transform(X_val)

    # Train an ordinary least-squares model on the scaled training features
    model = LinearRegression().fit(X_train_scaled, y_train)

    # Predict on the scaled validation features and report the mean absolute error
    yhat = model.predict(X_val_scaled)
    mae = mean_absolute_error(y_val, yhat)
    print(f'MAE {scaler_name}:', mae)

MAE norm: 330.66413656215434
MAE standard: 0.2289862663690474
MAE minmax: 0.22898327019604856
MAE robust: 0.22894749530026526


Ridge Regression

In [12]:
for scaler_name, scaler_obj in scalers_list:
    # Fit the scaler on the training split only, then apply it to validation
    X_train_scaled = scaler_obj.fit_transform(X_train)
    X_val_scaled = scaler_obj.transform(X_val)

    # Train a Ridge model (default hyperparameters) on the scaled training features
    model = Ridge().fit(X_train_scaled, y_train)

    # Predict on the scaled validation features and report the mean absolute error
    yhat = model.predict(X_val_scaled)
    mae = mean_absolute_error(y_val, yhat)
    print(f'MAE {scaler_name}:', mae)

MAE norm: 0.6156536908646693
MAE standard: 0.22905710415269023
MAE minmax: 0.23316121346743945
MAE robust: 0.22917136226984433
