# Introduction to Pipelines in Sklearn

**Reference**:
1. Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow - Aurélien Géron
2. [Sklearn User Guide on Pipelines](https://scikit-learn.org/stable/modules/compose.html)


## Load all the libraries

In [144]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Load the data

In [145]:
# upload the file to GitHub repo
housing_df = pd.read_csv('/content/drive/MyDrive/JTL312_Intro_to_ML/week_3/housing.csv')
housing_df.dropna(inplace=True)

In [146]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


## Split the data into train and test

In [147]:
from sklearn.model_selection import train_test_split

In [148]:
y_serie = housing_df['median_house_value']
X_df = housing_df.drop(['median_house_value'], axis=1)
housing_df = X_df

In [149]:
X_train_df, X_test_df, y_train_serie, y_test_serie = train_test_split(X_df, y_serie, train_size=0.90, random_state=42)

## Building pipeline

In [150]:
from sklearn.pipeline import Pipeline, make_pipeline

In [151]:
columns_numerical = X_df.select_dtypes(include=[np.number]).columns.to_list()
columns_nonnumerical = X_df.select_dtypes(exclude=[np.number]).columns.to_list()
columns_numerical, columns_nonnumerical

(['longitude',
  'latitude',
  'housing_median_age',
  'total_rooms',
  'total_bedrooms',
  'population',
  'households',
  'median_income'],
 ['ocean_proximity'])

In [152]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

numerical_preprocessor = Pipeline([
    ('impute', SimpleImputer(strategy="median")),
    ('scaling', MinMaxScaler(feature_range=(-1,1)))
    ])

In [153]:
type(numerical_preprocessor), numerical_preprocessor

(sklearn.pipeline.Pipeline,
 Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                 ('scaling', MinMaxScaler(feature_range=(-1, 1)))]))

In [154]:
from sklearn.preprocessing import OneHotEncoder

nonnumerical_preprocessor = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)

In [155]:
type(nonnumerical_preprocessor), isinstance(nonnumerical_preprocessor, Pipeline), nonnumerical_preprocessor

(sklearn.pipeline.Pipeline,
 True,
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]))

In [156]:
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer([
    ("numerical", numerical_preprocessor, columns_numerical),
    ("nonnumerical", nonnumerical_preprocessor, columns_nonnumerical)
    ])

In [157]:
type(preprocessor), preprocessor

(sklearn.compose._column_transformer.ColumnTransformer,
 ColumnTransformer(transformers=[('numerical',
                                  Pipeline(steps=[('impute',
                                                   SimpleImputer(strategy='median')),
                                                  ('scaling',
                                                   MinMaxScaler(feature_range=(-1,
                                                                               1)))]),
                                  ['longitude', 'latitude', 'housing_median_age',
                                   'total_rooms', 'total_bedrooms', 'population',
                                   'households', 'median_income']),
                                 ('nonnumerical',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('onehotencoder',
       

In [158]:
X_preprocessed = preprocessor.fit_transform(X_train_df)

In [159]:
from sklearn.linear_model import LinearRegression
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_regression', LinearRegression())
])

In [160]:
full_pipeline.fit(X_train_df, y_train_serie)

In [161]:
yhat_test_serie = full_pipeline.predict(X_test_df)

In [162]:
from sklearn.metrics import mean_squared_error as mse
lr_test_rmse = mse(y_test_serie, yhat_test_serie, squared=False)
lr_test_rmse.round(2)

69164.03

## Running Independent training and tests for Cross Validation

In [163]:
from sklearn.model_selection import cross_val_score

In [164]:
-cross_val_score(full_pipeline, X_df, y_serie, scoring="neg_root_mean_squared_error", cv=10)

array([70648.05190974, 71360.93432602, 68876.66270305, 69445.8222738 ,
       68462.14068028, 68031.11249695, 68957.4499904 , 65740.34917076,
       68155.33830153, 67734.13617067])

Advantages of using Pipelines:
1. Convenience & Encapsulation
2. Joint Parameter Selection
3. Safety from statistics/data leakage

## More about pipeline objects

### Accessing pipeline components

In [166]:
full_pipeline[0]

In [168]:
full_pipeline['preprocessor']

### Retriving pipeline attributes

In [177]:
full_pipeline[:-1].get_feature_names_out()

array(['numerical__longitude', 'numerical__latitude',
       'numerical__housing_median_age', 'numerical__total_rooms',
       'numerical__total_bedrooms', 'numerical__population',
       'numerical__households', 'numerical__median_income',
       'nonnumerical__ocean_proximity_<1H OCEAN',
       'nonnumerical__ocean_proximity_INLAND',
       'nonnumerical__ocean_proximity_ISLAND',
       'nonnumerical__ocean_proximity_NEAR BAY',
       'nonnumerical__ocean_proximity_NEAR OCEAN'], dtype=object)

In [179]:
# full_pipeline[:-1].get_params()

### Modifying pipeline components

In [184]:
full_pipeline.set_params(linear_regression__fit_intercept=False, linear_regression__positive=True)

In [190]:
full_pipeline.set_params(preprocessor__numerical__scaling__feature_range=(-10, 10))

eg. Effect of MinMax scaling range on vanilla linear regression fitting

In [203]:
for feature_range in [(-1,2), (-4, 6), (100, 100.5), (0,1)]:
  full_pipeline.set_params(preprocessor__numerical__scaling__feature_range=feature_range)
  # print(full_pipeline.get_params(False))
  full_pipeline.fit(X_train_df, y_train_serie)
  lr_test_rmse = mse(y_test_serie, yhat_test_serie, squared=False)
  print(f"Feature Range={feature_range}; RMSE={lr_test_rmse:.2f}")



Feature Range=(-1, 2); RMSE=69164.03
Feature Range=(-4, 6); RMSE=69164.03
Feature Range=(100, 100.5); RMSE=69164.03
Feature Range=(0, 1); RMSE=69164.03
