# Introduction to Pipelines in Sklearn

**References**:
1. Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow - Aurélien Géron
2. [Sklearn User Guide on Pipelines](https://scikit-learn.org/stable/modules/compose.html)


## Load all the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Load the data

In [None]:
# TODO: upload the file to the GitHub repo so the notebook does not depend on this path.
# NOTE(review): absolute, environment-specific path (looks like a mounted Google Drive) — adjust when running elsewhere.
housing_csv_path = '/content/drive/MyDrive/JTL312_Intro_to_ML/week_3/housing.csv'
housing_df = pd.read_csv(housing_csv_path)
# Rows with missing values are intentionally NOT dropped here (no dropna call).

In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


## Split the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the regression target from the feature columns.
target_column = 'median_house_value'
y_serie = housing_df[target_column]
X_df = housing_df.drop(columns=[target_column])
# NOTE(review): rebinding `housing_df` to the feature-only frame makes this cell
# non-idempotent — running it a second time raises KeyError because the target
# column is no longer present in `housing_df`.
housing_df = X_df

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_df, X_test_df, y_train_serie, y_test_serie = train_test_split(
    X_df,
    y_serie,
    train_size=0.90,
    random_state=42,
)

# Sklearn Pipelines

## Building pipeline

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline

In [None]:
# Partition the feature columns by dtype so each group can get its own preprocessing.
numeric_dtypes = [np.number]
columns_numerical = X_df.select_dtypes(include=numeric_dtypes).columns.tolist()
columns_nonnumerical = X_df.select_dtypes(exclude=numeric_dtypes).columns.tolist()
columns_numerical, columns_nonnumerical

(['longitude',
  'latitude',
  'housing_median_age',
  'total_rooms',
  'total_bedrooms',
  'population',
  'households',
  'median_income'],
 ['ocean_proximity'])

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Numeric columns: fill missing values with the column median, then rescale to [-1, 1].
# The step names ('impute', 'scaling') are the keys used later to address these
# steps through set_params.
numerical_steps = [
    ('impute', SimpleImputer(strategy="median")),
    ('scaling', MinMaxScaler(feature_range=(-1, 1))),
]
numerical_preprocessor = Pipeline(steps=numerical_steps)

In [None]:
# Show the object's type next to its repr: an explicitly constructed Pipeline.
type(numerical_preprocessor), numerical_preprocessor

(sklearn.pipeline.Pipeline,
 Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                 ('scaling', MinMaxScaler(feature_range=(-1, 1)))]))

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Non-numeric columns: fill missing values with the most frequent category, then
# one-hot encode into a dense array, ignoring categories unseen during fit.
# make_pipeline names the steps automatically ('simpleimputer', 'onehotencoder').
categorical_imputer = SimpleImputer(strategy="most_frequent")
categorical_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
nonnumerical_preprocessor = make_pipeline(categorical_imputer, categorical_encoder)

In [None]:
# make_pipeline returns a regular Pipeline instance; display its type, the isinstance check, and its repr.
type(nonnumerical_preprocessor), isinstance(nonnumerical_preprocessor, Pipeline), nonnumerical_preprocessor

(sklearn.pipeline.Pipeline,
 True,
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]))

In [None]:
from sklearn.compose import ColumnTransformer
# Route each column group to its own sub-pipeline. The transformer names
# ('numerical', 'nonnumerical') become prefixes of the output feature names and
# are the keys used to address the sub-pipelines later.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numerical_preprocessor, columns_numerical),
        ("nonnumerical", nonnumerical_preprocessor, columns_nonnumerical),
    ]
)

In [None]:
# Display the ColumnTransformer's type and its nested structure.
type(preprocessor), preprocessor

(sklearn.compose._column_transformer.ColumnTransformer,
 ColumnTransformer(transformers=[('numerical',
                                  Pipeline(steps=[('impute',
                                                   SimpleImputer(strategy='median')),
                                                  ('scaling',
                                                   MinMaxScaler(feature_range=(-1,
                                                                               1)))]),
                                  ['longitude', 'latitude', 'housing_median_age',
                                   'total_rooms', 'total_bedrooms', 'population',
                                   'households', 'median_income']),
                                 ('nonnumerical',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
       

In [None]:
# Run the preprocessing on its own: fit on the training features and return the transformed matrix.
# NOTE(review): `X_preprocessed` is not referenced again in the visible part of the notebook.
X_preprocessed = preprocessor.fit_transform(X_train_df)

In [None]:
from sklearn.linear_model import LinearRegression
# End-to-end model: column-wise preprocessing followed by a linear regressor.
# The step names double as keys for indexing and for set_params.
full_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('linear_regression', LinearRegression()),
    ]
)

In [None]:
# Fit preprocessing and regression in a single call on the raw training frame.
full_pipeline.fit(X_train_df, y_train_serie)

In [None]:
# Predict on the raw test features using the pipeline as fitted at this point.
# These predictions are a snapshot: refitting `full_pipeline` later does not update them.
yhat_test_serie = full_pipeline.predict(X_test_df)

In [None]:
from sklearn.metrics import root_mean_squared_error as rmse
# Root-mean-squared error of the test predictions against the true house values.
lr_test_rmse = rmse(y_true=y_test_serie, y_pred=yhat_test_serie)
lr_test_rmse

68047.65927344347

## Accessing pipeline components

In [None]:
# Integer indexing returns the step at that position — here the first step, the preprocessor.
full_pipeline[0]

In [None]:
# The same step can be retrieved by the name it was given when the pipeline was built.
full_pipeline['preprocessor']

In [None]:
# Drill down: preprocessor step -> 'nonnumerical' sub-pipeline -> its first step (the SimpleImputer).
# `statistics_` holds the fitted fill value for each imputed column.
full_pipeline['preprocessor']['nonnumerical'][0].statistics_

array(['<1H OCEAN'], dtype=object)

In [None]:
# Fitted coefficients of the final LinearRegression step, one per preprocessed feature.
full_pipeline['linear_regression'].coef_

array([-129312.0005646 , -115936.92091248,   26804.26399166,
        -77189.13208927,  222964.63424468, -691257.98521442,
        239694.52032475,  280614.45461095,  -24142.18025472,
        -64073.00929665,  133002.00745042,  -27050.02450655,
        -17736.7933925 ])

### Retrieving pipeline attributes

In [None]:
# Slicing yields a sub-pipeline; dropping the last step (the regressor) leaves only the
# transformers, whose output feature names can then be queried.
full_pipeline[:-1].get_feature_names_out()

array(['numerical__longitude', 'numerical__latitude',
       'numerical__housing_median_age', 'numerical__total_rooms',
       'numerical__total_bedrooms', 'numerical__population',
       'numerical__households', 'numerical__median_income',
       'nonnumerical__ocean_proximity_<1H OCEAN',
       'nonnumerical__ocean_proximity_INLAND',
       'nonnumerical__ocean_proximity_ISLAND',
       'nonnumerical__ocean_proximity_NEAR BAY',
       'nonnumerical__ocean_proximity_NEAR OCEAN'], dtype=object)

In [None]:
# full_pipeline[:-1].get_params()

## Modifying pipeline components

In [None]:
# Nested parameters are addressed as <step name>__<parameter>.
# These settings stay in effect for every later fit of `full_pipeline`.
full_pipeline.set_params(
    linear_regression__fit_intercept=False,
    linear_regression__positive=True,
)

In [None]:
# Deeper nesting chains the names with double underscores:
# preprocessor step -> 'numerical' sub-pipeline -> 'scaling' step -> feature_range.
full_pipeline.set_params(
    preprocessor__numerical__scaling__feature_range=(-10, 10),
)

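Example: effect of the MinMax scaling range on the linear regression fit (note that `fit_intercept=False` and `positive=True` were set on the regressor above, so this is no longer a vanilla linear regression).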

In [None]:
# Refit and re-score the pipeline for several MinMaxScaler output ranges.
for feature_range in [(-1, 2), (-4, 6), (100, 100.5), (0, 1)]:
    full_pipeline.set_params(preprocessor__numerical__scaling__feature_range=feature_range)
    full_pipeline.fit(X_train_df, y_train_serie)
    # Predict with the freshly fitted pipeline. Scoring the earlier, stale
    # `yhat_test_serie` would report the same RMSE for every range regardless
    # of the refit.
    lr_test_rmse = rmse(y_test_serie, full_pipeline.predict(X_test_df))
    print(f"Feature Range={feature_range}; RMSE={lr_test_rmse:.2f}")



Feature Range=(-1, 2); RMSE=68047.66
Feature Range=(-4, 6); RMSE=68047.66
Feature Range=(100, 100.5); RMSE=68047.66
Feature Range=(0, 1); RMSE=68047.66
