# Example 5: Preprocessing
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [35]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [36]:
def train_val_test_split(x, t, *, test_size=0.2, val_size=0.2, random_state=None):
    """Split features and targets into train, validation and test subsets.

    The data is split in two stages: the test set is carved off first, then
    the remainder is divided into train and validation sets.

    Args:
        x: Feature array; split along its first axis.
        t: Target array with the same number of samples as ``x``.
        test_size: Size of the test set, forwarded to ``train_test_split``
            for the first split (a float is a fraction of all samples).
        val_size: Size of the validation set, forwarded to
            ``train_test_split`` for the second split. A float is a fraction
            of the samples that remain AFTER the test set was removed, so the
            defaults give 64% train / 16% validation / 20% test.
        random_state: Seed (or ``None``) forwarded to both splits. ``None``
            keeps the original unseeded behavior.

    Returns:
        The tuple ``(x_train, t_train, x_val, t_val, x_test, t_test)``.
    """
    # Stage 1: hold out the test set.
    x_train_val, x_test, t_train_val, t_test = train_test_split(
        x, t, test_size=test_size, random_state=random_state,
    )
    # Stage 2: divide what is left into train and validation sets.
    x_train, x_val, t_train, t_val = train_test_split(
        x_train_val, t_train_val, test_size=val_size, random_state=random_state,
    )
    return (
        x_train, t_train,
        x_val, t_val,
        x_test, t_test,
    )

In [37]:
# Synthetic linear-regression data: t = sum_j(x_j * w_true_j) + b_true + noise.
num_samples = 100000
w_true = np.array([100, 200])  # ground-truth weights, one per feature (free to change)
b_true = 5
noise_scale = 1.0

num_features = len(w_true)

# Each feature is drawn independently with its own mean and spread.
feature_0 = 2 + 3 * np.random.randn(num_samples)  # normal distribution, mean=2, std=3
feature_1 = 5 + 10 * np.random.randn(num_samples)  # normal distribution, mean=5, std=10
x = np.column_stack((feature_0, feature_1))  # one row per sample, one column per feature

# Zero-mean Gaussian noise with standard deviation ``noise_scale``.
noise = noise_scale * np.random.randn(num_samples)

# Per-sample weighted sum of the features, plus the bias and the noise.
t = np.sum(x * w_true, axis=1) + b_true + noise

# Partition the generated data; the unpacking mirrors the helper's return order.
(
    x_train, t_train,
    x_val, t_val,
    x_test, t_test,
) = train_val_test_split(x, t)

In [38]:
# Keyword arguments passed to every SGDRegressor constructed in this notebook.
model_kwargs = {
    "random_state": 1337,
    "eta0": 0.1,
    "max_iter": 10000,
}

In [39]:
# 2. Train without preprocessing: fit on the raw features and raw targets.
model_no_scaling = SGDRegressor(**model_kwargs).fit(x_train, t_train)

# Score the baseline on the validation split.
y_val_no_scaling = model_no_scaling.predict(x_val)
mse_val_no_scaling = mean_squared_error(y_true=t_val, y_pred=y_val_no_scaling)

In [40]:

def test_scaler(model, x_scaler, t_scaler, x_train, t_train, x_val, t_val,):
    """Fit ``model`` on scaled data and report its validation MSE.

    Both scalers are fitted on the training split only; the validation split
    is transformed with the already-fitted scalers. Predictions are mapped
    back through ``t_scaler.inverse_transform`` before scoring, so the
    returned error is measured against the unscaled ``t_val``.

    ``model``, ``x_scaler`` and ``t_scaler`` are fitted in place.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        x_scaler: Scaler applied to the feature arrays.
        t_scaler: Scaler applied to the target vectors.
        x_train: Training features.
        t_train: Training targets, indexed as a 1D array.
        x_val: Validation features.
        t_val: Validation targets, indexed as a 1D array.

    Returns:
        The tuple ``(model, mse)``: the fitted model and its validation MSE.
    """

    def on_column(transform, values):
        # The scaler methods are fed a 2D array: lift the vector to a single
        # column, apply the transform, then take that column back out.
        column = values[:, np.newaxis]
        return transform(column)[:, 0]

    # Features: fit on train, reuse the fitted scaler for validation.
    scaled_x_train = x_scaler.fit_transform(x_train)
    scaled_x_val = x_scaler.transform(x_val)

    # Targets: same fit-on-train / transform-on-validation scheme.
    scaled_t_train = on_column(t_scaler.fit_transform, t_train)
    # Computed as in the original flow, although not used for scoring below.
    scaled_t_val = on_column(t_scaler.transform, t_val)

    model.fit(scaled_x_train, scaled_t_train)
    scaled_prediction = model.predict(scaled_x_val)

    # Undo the target scaling so the error is comparable across scalers.
    prediction = on_column(t_scaler.inverse_transform, scaled_prediction)

    return model, mean_squared_error(y_true=t_val, y_pred=prediction)

In [41]:
# The same train/validation data is used for every scaler under comparison.
split_kwargs = dict(
    x_train=x_train,
    t_train=t_train,
    x_val=x_val,
    t_val=t_val,
)

# Standardization applied to both features and targets.
model_standard, mse_val_standard = test_scaler(
    model=SGDRegressor(**model_kwargs),
    x_scaler=StandardScaler(),
    t_scaler=StandardScaler(),
    **split_kwargs,
)

# Min-max scaling applied to both features and targets.
model_minmax, mse_val_minmax = test_scaler(
    model=SGDRegressor(**model_kwargs),
    x_scaler=MinMaxScaler(),
    t_scaler=MinMaxScaler(),
    **split_kwargs,
)

In [42]:
# Report the validation MSE of each variant, one per line.
print(
    f'{mse_val_no_scaling=:.3f}',
    f'{mse_val_standard=:.3f}',
    f'{mse_val_minmax=:.3f}',
    sep='\n',
)

mse_val_no_scaling=1.106
mse_val_standard=1.079
mse_val_minmax=180.934
