# Baseline Models

We will compare the performance of our baseline models against that of a linear regression model (`lin_reg`).

In [None]:
#Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_validate
from sklearn.model_selection import permutation_test_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

We will use `ShuffleSplit` as the cross-validation strategy.

In [None]:
# Ten independent random train/validation splits with a fixed seed.
# test_size is given as a fraction (20% of the rows held out per split);
# an integer here would be read as an absolute number of validation samples,
# which makes the per-split error estimates extremely noisy.
shuffle_split_cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

Let's load the data and split it into training and test sets.

In [None]:
# Load the California housing data as a (features, labels) pair of pandas objects.
features, labels = fetch_california_housing(return_X_y=True, as_frame=True)

# Hold out a test set using the default split proportions; the seed keeps the
# partition identical across runs.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(features, labels, random_state=42)

# LinearRegression model

* Build linear regression model with feature scaling as part of a pipeline.
* Train and evaluate the model with cross-validation on 10 random train/test splits generated by `ShuffleSplit`.
* Capture the error on each split.

In [None]:
# Standardise the features, then fit a linear regression. Both steps live in
# one pipeline so cross-validation treats them as a single estimator.
lin_reg_pipeline = Pipeline([('feature_scaling', StandardScaler()),
                             ('lin_reg', LinearRegression())])

# Evaluate on the shared ShuffleSplit splits with the same metric used for the
# dummy baselines and the permutation test (mean absolute error), so that the
# resulting error series are directly comparable.
lin_reg_cv_results = cross_validate(lin_reg_pipeline,
                                    train_features,
                                    train_labels,
                                    cv=shuffle_split_cv,
                                    scoring='neg_mean_absolute_error',
                                    n_jobs=2)

# Kept under its original name: the raw (negated-error) score of each split.
lin_reg_score = lin_reg_cv_results['test_score']

# scikit-learn reports errors as negative scores; flip the sign to get errors.
lin_reg_errors = pd.Series(-lin_reg_cv_results['test_score'],
                           name='Linear regression error')

# DummyRegressor

In [None]:
def dummy_regressor_baseline(strategy, constant_val=None, quantile_val=None):
  """Cross-validate a DummyRegressor baseline and return its per-split errors.

  The regressor is evaluated on the notebook-level ``train_features`` /
  ``train_labels`` using ``shuffle_split_cv`` and the
  ``neg_mean_absolute_error`` scorer.

  Args:
    strategy: Passed to ``DummyRegressor(strategy=...)``.
    constant_val: Passed to ``DummyRegressor(constant=...)``.
    quantile_val: Passed to ``DummyRegressor(quantile=...)``.

  Returns:
    A pandas Series named 'Dummy regressor error' holding the negated test
    score of every split.
  """
  baseline_model = DummyRegressor(strategy=strategy,
                                  constant=constant_val,
                                  quantile=quantile_val)
  cv_results = cross_validate(baseline_model,
                              train_features,
                              train_labels,
                              cv=shuffle_split_cv,
                              scoring='neg_mean_absolute_error',
                              n_jobs=2)
  # Scores are negated errors, so flip the sign before returning.
  split_errors = -cv_results['test_score']
  return pd.Series(split_errors, name='Dummy regressor error')

# permutation_test_score

`permutation_test_score` permutes the target to generate randomized data and computes the empirical p-value against the null hypothesis that the features and the target are independent.

Here we are interested in the `permutation_score` returned by this API, which holds the score of the model on each permutation of the target.

In [None]:
# Score the linear-regression pipeline on the true labels and on 30 permuted
# copies of the labels, using the shared splits and the MAE scorer.
score, permutation_score, pvalue = permutation_test_score(
    lin_reg_pipeline,
    train_features,
    train_labels,
    cv=shuffle_split_cv,
    scoring='neg_mean_absolute_error',
    n_permutations=30,
    n_jobs=2,
)

# The scorer returns negated errors; negate again to obtain error values.
permutation_errors = pd.Series(-permutation_score, name='Permuted error')