<a href="https://colab.research.google.com/github/shounakk05/Hands-On-ML-Journey/blob/main/Chapter-02/Exercise_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this exercises we have to add a transformer to the data preparation pipeline to only select the most important attributes

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

housing = pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv")

train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
housing = train_set.drop(['median_house_value'], axis = 1)
housing_labels = train_set['median_house_value'].copy()

housing['income_cat'] = pd.cut(housing['median_income'], bins = [0, 1.5, 3.0, 4.5, 6.0, np.inf], labels = [1, 2, 3, 4, 5])

# I decided to use Stratified Split instead of simple train test split to ensure distribution of the data
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state =42)
for train_index, test_index in split.split(housing, housing['income_cat']):
  strat_train_set = housing.iloc[train_index]
  strat_test_index = housing.iloc[test_index]

# Pipeline creation for data cleaning and transformation
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attr = list(housing.drop('ocean_proximity', axis = 1))
cat_attr = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', Pipeline([('imputer', SimpleImputer(strategy = 'median')), ('std_scaler', StandardScaler())]), num_attr),
    ('cat', OneHotEncoder(), cat_attr)
])

housing_prep = full_pipeline.fit_transform(housing)

In [3]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

params_distribution = {
    'n_estimators': randint(low = 1, high = 200),
    'max_features': randint(low = 1, high = 15)
}

RFR = RandomForestRegressor()

rnd_search = RandomizedSearchCV(RFR, params_distribution, n_iter = 10, cv = 5, scoring = 'neg_mean_squared_error', random_state = 42)

rnd_search.fit(housing_prep, housing_labels)

In [4]:
neg_mse = rnd_search.best_score_
rmse = np.sqrt(-neg_mse)
rmse

np.float64(49409.54574547045)

In [5]:
# Finding the feature importances from the prediction of the RandomForestRegressor model
feature_importances = rnd_search.best_estimator_.feature_importances_

In [19]:
# Creating the transformer for selection of the most important attributes
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.asarray(arr), -k)[-k:])

class FeatureSelector(BaseEstimator, TransformerMixin):
  def __init__(self, feature_importances, k):
    self.feature_importances = feature_importances
    self.k = k

  def fit(self, X, y = None):
    self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
    return self

  def transform(self, X):
    return X[:, self.feature_indices_]

In [20]:
k = 5

preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', FeatureSelector(feature_importances, k))
])

# Now you can transform your data in one go!
housing_prepared_top_k = preparation_and_feature_selection_pipeline.fit_transform(housing)

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

params_distribution = {
    'n_estimators': randint(low = 1, high = 200),
    'max_features': randint(low = 1, high = 15)
}

RFR = RandomForestRegressor()

rnd_search = RandomizedSearchCV(RFR, params_distribution, n_iter = 10, cv = 5, scoring = 'neg_mean_squared_error', random_state = 42)

rnd_search.fit(housing_prepared_top_k, housing_labels)

In [23]:
neg_mse = rnd_search.best_score_
rmse = np.sqrt(-neg_mse)
rmse

np.float64(48848.87162845016)