<a href="https://colab.research.google.com/github/shounakk05/Hands-On-ML-Journey/blob/main/Chapter-02/Exercise_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this exercise, I write a full pipeline with a custom transformer that preprocesses the data and predicts the median house value.

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Step 01: Fetch the data
housing = pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv")

# Step 02: Split the data into training and test sets using train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Step 03: Separate the attributes from the label
# NOTE: from this point on, `housing` holds the training features only.
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

# Step 04: Find the positions of total_rooms, total_bedrooms, population, households
# Reason: after steps like SimpleImputer the DataFrame becomes a NumPy array,
# so columns can no longer be accessed by name.
# The positions are looked up among the numeric columns only, because that is the
# column subset handed to the numerical pipeline. Positions taken from the full
# frame would be shifted if a non-numeric column preceded any of these four.
numeric_columns = housing.select_dtypes(include=np.number).columns
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    numeric_columns.get_loc(c)
    for c in ["total_rooms", "total_bedrooms", "population", "households"]
]

# Step 05: Custom transformer that appends ratio-based attributes as extra columns
class CombinedAttributeAdder(BaseEstimator, TransformerMixin):
    """Append per-household (and optionally per-room) ratio columns to X.

    The source columns are located by position through the module-level
    indices ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms-per-room column is appended as well.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return X with the ratio columns appended on the right.

        Appended columns, in order: ``bedrooms_per_room`` (only when
        ``add_bedrooms_per_room`` is true), ``rooms_per_household``,
        ``population_per_household``.
        """
        # Positional slicing (X[:, i]) needs an array. np.asarray leaves an
        # ndarray untouched and converts array-likes such as a DataFrame.
        X = np.asarray(X)

        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households

        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / rooms
            return np.c_[X, bedrooms_per_room, rooms_per_household, population_per_household]
        return np.c_[X, rooms_per_household, population_per_household]

# Step 06: Separate the numerical and categorical columns for processing
# ColumnTransformer is given lists of column names here: passing a DataFrame
# selection instead of `.columns.tolist()` caused an error earlier.
num_attr = housing.select_dtypes(include=np.number).columns.tolist()
cat_attr = housing.select_dtypes(include='object').columns.tolist()

# Step 07: Numerical pipeline: impute missing values with the median,
# add the combined ratio attributes, then standardize.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("attr_adder", CombinedAttributeAdder()),
    ("std_scaler", StandardScaler()),
])

# Step 08: Preprocessing for both numerical and categorical attributes
# handle_unknown="ignore" keeps predict() from raising when a category that was
# absent from the training split shows up later; such rows get all-zero one-hot
# columns. Output is unchanged when every category was seen during fit.
processing_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attr),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attr),
])

# Step 09: Chain the preprocessing and a RandomForestRegressor into one estimator
full_pipeline = Pipeline(steps=[
    ("processing", processing_pipeline),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Step 10: Fit the whole pipeline on the training data
full_pipeline.fit(housing, housing_labels)

# Step 11: Split the test set into features and label
X_test = test_set.drop(columns="median_house_value")
y_test = test_set.loc[:, "median_house_value"].copy()

# Step 12: Predict on the held-out test set
y_pred = full_pipeline.predict(X_test)

# Step 13: Report the root mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(rmse)

50339.472725301675
