# Exercise 2.1.1

Build a `Pipeline` version of the `PolynomialRegressor` and search for the optimal `degree` of the polynomial.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Solution: put a polynomial feature expansion in front of the linear model.
est = Pipeline([
    ('poly', PolynomialFeatures()),
    ('lr', LinearRegression()),
])

# Parameters of a pipeline step are addressed as '<step name>__<parameter>',
# so 'poly__degree' tunes the degree of the 'poly' step (candidates 1..19).
gs = GridSearchCV(
    est,
    {'poly__degree': range(1, 20)},
    scoring='neg_mean_squared_error',
    cv=3,
)

# A trailing axis is added to X before fitting
# (presumably X is a 1-D array of samples -- confirm against the cell that builds it).
gs.fit(X[:, None], y)

# Draw the data with the true function, then overlay the refitted best model.
fig = plot_data(X, y, fn=true_fn)
plot_estimator(gs.best_estimator_, fig)

# Exercise 2.1.2

Implement the following pipeline:

<img src="img/exercise_2_1_2.png">

And tune the following hyper-parameter:
  * `criterion` of `RandomForestRegressor`: search over `['mse', 'mae']`

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import ShuffleSplit


# Numeric columns: replace missing values with a constant sentinel.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-99999)),
])

# Categorical columns: impute a 'missing' label, one-hot encode, then keep
# at most 100 encoded columns chosen by an ElasticNet-based SelectFromModel.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('fx_selection',
     SelectFromModel(ElasticNet(alpha=0.1), max_features=100, threshold=None)),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Full prediction pipeline: preprocessing followed by the regressor.
est = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestRegressor(n_estimators=10, n_jobs=-1)),
])

# Search over the split criterion of the 'rf' step.
# NOTE(review): scikit-learn 1.0 renamed these criteria to 'squared_error' /
# 'absolute_error' (the old names were removed in 1.2) -- confirm the
# scikit-learn version this notebook targets.
gs = GridSearchCV(
    est,
    {'rf__criterion': ['mse', 'mae']},
    scoring='neg_mean_absolute_error',
    cv=3,
    refit=True,
    verbose=10,
)
gs.fit(X_train, y_train)

print("best params: {}".format(gs.best_params_))
# With refit=True, gs.predict delegates to gs.best_estimator_.predict.
print("model score: %.3f" % mean_absolute_error(y_test, gs.predict(X_test)))

# Exercise 2.1.3

Add a sub-pipeline to process text:

<img src="img/exercise_2_1_3.png">

Notes:
  * `RandomForestRegressor` does not handle wide, sparse Bag-of-Words representations well; instead, you can use a sub-model to predict `SalePrice` from the BoW features alone and feed these predictions into the `RandomForestRegressor` as an additional feature.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.model_selection import KFold
    

class StackingTransformer(BaseEstimator, TransformerMixin):
    """Expose the predictions of a regressor as a single-column feature.

    ``fit_transform`` is overloaded to return *out-of-fold* predictions, so
    the feature handed to downstream steps during training is never produced
    by a model that saw the corresponding target (avoids leakage).
    ``transform`` (used at prediction time) relies on a model fitted on the
    complete training data.

    Parameters
    ----------
    est : regressor
        Estimator implementing ``fit(X, y)`` and ``predict(X)``. It is only
        used as a template: fitting always happens on clones, so the
        constructor parameter is never modified.

    Attributes
    ----------
    est_ : regressor
        Clone of ``est`` fitted on all data passed to ``fit`` or
        ``fit_transform``; used by ``transform``.
    models_ : list of regressor
        Per-fold clones fitted inside ``fit_transform`` (only set there).
    """

    def __init__(self, est):
        self.est = est

    def fit(self, X, y=None):
        """Fit a clone of ``est`` on the full data and return ``self``.

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError('{}.fit requires y to be not None'.format(self))
        # Fit a clone so that the constructor parameter stays untouched and
        # the transformer itself remains clonable / grid-searchable.
        self.est_ = clone(self.est).fit(X, y)
        return self

    def transform(self, X):
        """Return the model's predictions for ``X`` as an ``(n_samples, 1)`` array."""
        # Fall back to `est` for callers that wrapped an already fitted model
        # and call `transform` without fitting this transformer first.
        model = getattr(self, 'est_', self.est)
        return model.predict(X)[:, np.newaxis]

    def fit_transform(self, X, y=None):
        """Fit on ``(X, y)`` and return out-of-fold predictions for ``X``.

        ``X`` must support positional row indexing with an integer array
        (e.g. a NumPy array or a SciPy sparse matrix).

        Returns
        -------
        out : ndarray of shape (n_samples, 1), dtype float64
            For every sample, the prediction of the fold model that did not
            see that sample during fitting.

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError('{}.fit requires y to be not None'.format(self))
        # Positional indexing below must not depend on a pandas index:
        # `series[int_array]` would be label-based.
        y = np.asarray(y)
        self.models_ = []
        # Allocate floats explicitly; inheriting an integer target dtype
        # would silently truncate the predictions.
        out = np.empty((y.shape[0], 1), dtype=np.float64)
        for train, test in KFold(5, shuffle=True, random_state=0).split(X):
            fold_model = clone(self.est).fit(X[train], y[train])
            self.models_.append(fold_model)
            out[test, 0] = fold_model.predict(X[test])
        # The model used by `transform` is trained on all rows rather than
        # reusing a single fold model that only saw 4/5 of the data.
        self.est_ = clone(self.est).fit(X, y)
        return out


# Numeric columns: replace missing values with a constant sentinel.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-99999)),
])

# Categorical columns: impute a 'missing' label, one-hot encode, then keep
# at most 100 encoded columns chosen by an ElasticNet-based SelectFromModel.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('fx_selection',
     SelectFromModel(ElasticNet(alpha=0.1), max_features=100, threshold=None)),
])

# Text column: bag-of-words counts, reduced to one feature holding the
# ElasticNet prediction made from those counts.
text_pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('bow_reg', StackingTransformer(ElasticNet(alpha=0.1))),
])

# The text column is selected with a single name (text_features[0]), not a
# list of names, before being handed to the CountVectorizer pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
    ('text', text_pipeline, text_features[0]),
])

# Full prediction pipeline: preprocessing followed by the regressor.
# NOTE(review): scikit-learn 1.0 renamed criterion 'mae' to 'absolute_error'
# (the old name was removed in 1.2) -- confirm the scikit-learn version this
# notebook targets.
est = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestRegressor(n_estimators=10, criterion='mae', n_jobs=-1)),
])
est.fit(X_train, y_train)

# No grid search in this exercise, so there are no best params to report.
print("model score: %.3f" % mean_absolute_error(y_test, est.predict(X_test)))