### Import dependencies

In [None]:
# Reloads all imported modules every time a cell is run.
%load_ext autoreload
%autoreload 2

import pandas as pd
pd.options.display.float_format = '{:,.5f}'.format

import numpy as np
from time import time
from IPython.display import display

from sklearn.model_selection import cross_validate, learning_curve, train_test_split, RepeatedKFold
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import datasets
from lightgbm import LGBMRegressor

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load dataset

In [None]:
# Load the Boston housing data into a feature frame `X` and a target series `y`.
try:
    data = datasets.load_boston()
except (ImportError, AttributeError):
    # `load_boston` was removed in scikit-learn 1.2. Rebuild the same arrays from
    # the original source file, following the recipe given in the scikit-learn
    # deprecation notice: each record is spread over two physical lines.
    raw = pd.read_csv(
        'http://lib.stat.cmu.edu/datasets/boston',
        sep=r'\s+', skiprows=22, header=None,
    )
    data = {
        'data': np.hstack([raw.values[::2, :], raw.values[1::2, :2]]),
        'target': raw.values[1::2, 2],
        'feature_names': [
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ],
    }

X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = pd.Series(data['target'], name='y')
display(X)

### Exploratory data analysis

Display the dataframe and its description

In [None]:
# Summary statistics for every feature column of X
X.describe()

In [None]:
# Distribution of the target. The explicit fig/ax interface is used so the
# figure carries its own labels and does not rely on pyplot's implicit state.
fig, ax = plt.subplots()
ax.hist(y, bins=50)
ax.set_xlabel('House price (in $1000s)')
ax.set_ylabel('Count')
# Trailing semicolon suppresses the repr of the returned Text object
ax.set_title('Distribution of the target');

In [None]:
# Histogram of every feature, laid out on a grid with `n_cols` panels per row.
n_features = X.shape[1]
n_cols = 4
n_rows = int(np.ceil(n_features / n_cols))

# squeeze=False keeps `axes` two-dimensional even when there is only one row,
# so the layout code works for any number of features.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, n_rows*3), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, X.columns):
    ax.hist(X[col], bins=25)
    ax.set_xlabel(col)
    ax.set_ylabel('Count')

# Hide the panels left over when the features do not fill the last row
for ax in flat_axes[n_features:]:
    ax.set_visible(False)

fig.tight_layout()

**Visualize scatterplots (xi, y) pairs**

In [None]:
# Scatter plot of each feature against the target, `n_cols` panels per row.
n_features = X.shape[1]
n_cols = 4
n_rows = int(np.ceil(n_features / n_cols))

# squeeze=False keeps `axes` two-dimensional even when there is only one row,
# so the layout code works for any number of features.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, n_rows*3), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, X.columns):
    ax.scatter(X[col], y, marker='x')
    ax.set_xlabel(col)
    ax.set_ylabel('House price (in $1000s)')

# Hide the panels left over when the features do not fill the last row
for ax in flat_axes[n_features:]:
    ax.set_visible(False)

fig.tight_layout()

The first impression from the plots above is that the LSTAT and RM features have a close-to-linear relationship with the target and can explain a significant part of the price.

### Prepare sets

The dataset is split initially into train and test sets using train_test_split.

The train set is further split into train and validation subsets using repeated K-fold CV.

In [None]:
# Hold out 20% of the shuffled rows as the final test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=1,
)

# Splitter reused by the cross-validation cells: 5 folds, repeated 10 times
cv = RepeatedKFold(n_repeats=10, n_splits=5, random_state=1)

### Select metric

In [None]:
def _root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of the predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False: the scorer reports the negated RMSE, which is why
# the reporting cells flip the sign back before printing.
rmse = make_scorer(_root_mean_squared_error, greater_is_better=False)

### Establish no skill performance

In [None]:
# No-skill baseline model
dummy = DummyRegressor()

# Cross-validate the dummy regressor to establish the no-skill performance
dummy_res = cross_validate(
    dummy, X_train, y_train,
    cv=cv, scoring=rmse, return_train_score=True, n_jobs=-1,
)

# Scores are negated RMSE values, so the sign is flipped when reporting
dummy_train_score = dummy_res['train_score'].mean()
dummy_test_score = dummy_res['test_score'].mean()

display(
    f'Train score, dummy: {-dummy_train_score:.2f}',
    f'CV score, dummy: {-dummy_test_score:.2f}',
)

### Linear regression

In [None]:
# Prediction pipeline: standardise the features, then fit a linear regression
scaler, clf = StandardScaler(), LinearRegression()
pipeline = make_pipeline(scaler, clf)

# Cross-validate on the train set, keeping both train-fold and validation-fold scores
res = cross_validate(
    pipeline, X_train, y_train,
    cv=cv, scoring=rmse, return_train_score=True, n_jobs=-1,
)

# Scores are negated RMSE values, so the sign is flipped when reporting
train_score = res['train_score'].mean()
test_score = res['test_score'].mean()

display(
    f'Mean train RMSE: {-train_score:.2f}',
    f'Mean CV RMSE: {-test_score:.2f}',
)


### Learning curves

In [None]:
# Learning curves for the current `pipeline`: train and CV scores for 20
# training-set fractions between 10% and 80%.
train_sizes, train_scores, test_scores = learning_curve(
    pipeline, X, y, train_sizes=np.linspace(0.1, 0.8, 20),
    random_state=1, shuffle=True, scoring=rmse
)

# Average across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots(figsize=(10, 5))

sns.lineplot(x=train_sizes, y=train_scores_mean, label='-RMSE: train score', ax=ax)
sns.lineplot(x=train_sizes, y=test_scores_mean, label='-RMSE: CV score', ax=ax)

# Label the figure so it can be read on its own; the scorer is negated,
# so values closer to zero are better.
ax.set_xlabel('Number of training samples')
ax.set_ylabel('Negated RMSE (closer to 0 is better)')
ax.set_title('Learning curves: linear regression');

Let's suppose that ideal performance is unreachable out of sample and estimate that the Bayes error is somewhere around ~1 RMSE. Combining that estimate with the results:

- No skill ~= 9 RMSE
- 80/20 split linreg ~= 6 RMSE
- Cross-validated test set linreg ~= 5 RMSE
- Train set linreg ~= 4.5 RMSE
- Estimated Bayes error ~= 1 RMSE
- Ideal performance == 0 RMSE

That gives the following estimates for bias and variance:
- ~3.5 RMSE worth of avoidable bias
- ~0.5 RMSE worth of variance on CV
- ~1.5 RMSE worth of variance on 80/20 split

That suggests that both bias and variance can be improved.

Accessible ways to reduce bias:
- Use more complex model 
- Engineer better features

Accessible ways to reduce variance:
- Use regularization
- Drop irrelevant features

It's usually a good idea to deal with bias first and work on variance later.
Let's try the easy way first and build a more complex model.

### Non-linear model

Boosted trees have been one of the most successful ensembles as of late. Let's try fitting a LightGBM regressor.

In [None]:
# Prediction pipeline: standardise the features, then fit LightGBM with default settings
scaler, clf = StandardScaler(), LGBMRegressor()
pipeline = make_pipeline(scaler, clf)

# Cross-validate on the train set, keeping both train-fold and validation-fold scores
res = cross_validate(
    pipeline, X_train, y_train,
    cv=cv, scoring=rmse, return_train_score=True, n_jobs=-1,
)

# Scores are negated RMSE values, so the sign is flipped when reporting
train_score = res['train_score'].mean()
test_score = res['test_score'].mean()

display(
    f'Mean train RMSE: {-train_score:.2f}',
    f'Mean CV RMSE: {-test_score:.2f}',
)


In [None]:
# Learning curves for the current `pipeline`: train and CV scores for 20
# training-set fractions between 10% and 80%.
train_sizes, train_scores, test_scores = learning_curve(
    pipeline, X, y, train_sizes=np.linspace(0.1, 0.8, 20),
    random_state=1, shuffle=True, scoring=rmse
)

# Average across the CV folds (axis 1) for each training-set size
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

fig, ax = plt.subplots(figsize=(10, 5))

sns.lineplot(x=train_sizes, y=train_scores_mean, label='-RMSE: train score', ax=ax)
sns.lineplot(x=train_sizes, y=test_scores_mean, label='-RMSE: CV score', ax=ax)

# Label the figure so it can be read on its own; the scorer is negated,
# so values closer to zero are better.
ax.set_xlabel('Number of training samples')
ax.set_ylabel('Negated RMSE (closer to 0 is better)')
ax.set_title('Learning curves: LightGBM');

LGBM indeed turned out to be quite good. Using a more complex model, we have the following situation:

- No skill ~= 9 RMSE
- 80/20 split LGBM ~= 4.5 RMSE
- Cross-validated test set LGBM ~= 3.5 RMSE
- Train set LGBM ~= 1.5 RMSE
- Estimated Bayes error ~= 1 RMSE
- Ideal performance == 0 RMSE

That gives the following estimates for bias and variance:
- ~0.5 RMSE worth of avoidable bias
- ~2.0 RMSE worth of variance on CV
- ~3.0 RMSE worth of variance on 80/20 split

We have managed to reduce the bias even below our initial estimate of unavoidable bias just by using the more complex model. Simultaneously, we have managed to reduce out of sample error significantly.

Now the model is complex enough to express the train set, gives a good score on the test set, but still exhibits substantial variance.

Time to try variance mitigation techniques.

In [None]:
# Prediction pipeline: standardise the features, then fit LightGBM with
# the reg_lambda and reg_alpha regularization terms switched on
scaler = StandardScaler()
clf = LGBMRegressor(reg_lambda=5, reg_alpha=2)
pipeline = make_pipeline(scaler, clf)

# Cross-validate on the train set, keeping both train-fold and validation-fold scores
res = cross_validate(
    pipeline, X_train, y_train,
    cv=cv, scoring=rmse, return_train_score=True, n_jobs=-1,
)

# Scores are negated RMSE values, so the sign is flipped when reporting
train_score = res['train_score'].mean()
test_score = res['test_score'].mean()

display(
    f'Mean train RMSE: {-train_score:.2f}',
    f'Mean CV RMSE: {-test_score:.2f}',
)


By manually trying several values of regularization parameters, we can see how they affect the result:

No regularization
- Mean train RMSE: 1.54
- Mean CV RMSE: 3.56

reg_lambda=5, reg_alpha=2
- Mean train RMSE: 1.77
- Mean CV RMSE: 3.57

So regularization indeed reduces variance in this case, but it does so by increasing train loss, not decreasing validation loss.

Let's try tuning hyperparameters.

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from hyperopt.pyll.base import scope

# Search space for the LightGBM hyperparameters.
space = dict(
    learning_rate = hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
    # qloguniform takes its bounds in log space but the rounding step `q` in the
    # original space, so q=10 yields tree counts in steps of 10 between 50 and 500.
    n_estimators = scope.int(hp.qloguniform('n_estimators', np.log(50), np.log(500), 10)),
    max_depth = scope.int(hp.quniform('max_depth', 2, 15, 1)),
)

def objective(params):
    """Cross-validate a LightGBM pipeline built from `params` and return its loss.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``LGBMRegressor``, sampled from ``space``.

    Returns
    -------
    dict
        ``loss`` and ``status`` (the keys hyperopt requires) plus the sampled
        ``params`` and the mean train RMSE as ``train_loss``.
    """
    clf = make_pipeline(StandardScaler(), LGBMRegressor(**params))
    # Fewer repeats than the main CV splitter to keep each evaluation cheap
    cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=1)

    res = cross_validate(clf, X_train, y_train, scoring=rmse,
                         return_train_score=True, cv=cv, n_jobs=-1)

    train_score = np.mean(res['train_score'])
    # Scores are negated RMSE values; subtracting the standard deviation makes
    # the loss "mean RMSE + std", penalising parameter sets with unstable folds.
    test_score = np.mean(res['test_score']) - np.std(res['test_score'])

    return dict(
        params=params,
        train_loss=-train_score,
        # Hyperopt-required keys
        loss=-test_score,
        status=STATUS_OK,
    )

trials = Trials()
# NOTE(review): the search is not seeded, so `best` can differ between runs;
# consider passing `rstate` to fmin if reproducibility matters.
# The values in `best` are cast with int() where they are used later.
best = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)
best

Hyperparameter optimization appears to reduce our loss on CV from 3.57 to 3.35. 

However, the hyperparameter search was done on the same data and the same CV configuration that we are calculating the scores on.

That means that we actually don't know whether the best param set offers any improvement out of sample. I suspect it doesn't. Let's use our test set for the first time and see.

In [None]:
# Final comparison on the held-out test set.
# Each pipeline gets its own StandardScaler: make_pipeline stores the object it
# is given without copying it, so a shared instance would be refitted by every
# pipeline and silently shared between them.
linreg = make_pipeline(StandardScaler(), LinearRegression())
lgbm = make_pipeline(StandardScaler(), LGBMRegressor())
lgbm_tuned = make_pipeline(StandardScaler(), LGBMRegressor(
    learning_rate=best['learning_rate'],
    # cast to int because LightGBM expects integers for these two parameters
    max_depth=int(best['max_depth']),
    n_estimators=int(best['n_estimators']),
))

# Fit every candidate on the full train set
for model in (linreg, lgbm, lgbm_tuned):
    model.fit(X_train, y_train)

# The scorer returns negated RMSE, so the sign is flipped for reporting
print(
    "Linreg:", -rmse(linreg, X_test, y_test),
    "\nDefault LGBM:", -rmse(lgbm, X_test, y_test),
    "\nOptimized LGBM:", -rmse(lgbm_tuned, X_test, y_test)
)

As we can see, optimized hyperparameters didn't remain the best out of sample and untuned LGBM performed slightly better.

Therefore, RMSE = 3.066 is the best out-of-sample score achieved in this notebook on the Boston house prices dataset.