## 1. Importing Libraries

In [143]:
from pathlib import Path

import numpy as np
import pandas as pd

import sklearn

from sklearn.ensemble import RandomForestRegressor


from sklearn.base import (
    BaseEstimator,
    TransformerMixin
)

from sklearn.impute import (
    SimpleImputer
)

from sklearn.preprocessing import (
	OneHotEncoder,
    OrdinalEncoder,
    TargetEncoder,
    StandardScaler,
    MinMaxScaler,
    FunctionTransformer
)

from sklearn.compose import (
    ColumnTransformer
)

from sklearn.pipeline import (
    Pipeline,
    FeatureUnion
)

from feature_engine.encoding import(
    RareLabelEncoder
)

from feature_engine.selection import(
    SelectBySingleFeaturePerformance
)


import warnings

## 2. Display Settings

In [2]:
pd.set_option("display.max_columns", None)
sklearn.set_config(transform_output = "pandas")
warnings.filterwarnings('ignore')

## 3. Reading the Data

In [3]:
PROJECT_DIR = Path(r"F:\Rishabh\House-Price-Prediction-MLOps-Project")
DATA_DIR = "data/train"

In [4]:
def read_data(name):
    file_name = f"{name}.csv"
    file_path = PROJECT_DIR/DATA_DIR/file_name

    return pd.read_csv(file_path)

In [5]:
houses = read_data("train")

In [6]:
houses

Unnamed: 0,amount,location,carpet_area,transaction,furnishing,facing,bathroom,balcony,ownership,super_area,num_bhk,is_studio,floor_num,num_floors,overlooking_garden,overlooking_mainroad,overlooking_pool,parking_spots,parking_cover,price
0,1.900,greater-noida,5500.0,Resale,Unfurnished,North - East,5.0,6.0,Leasehold,,6.0,0,8.0,8.0,1.0,1.0,1.0,,,2923.0
1,0.400,mangalore,,Resale,Semi-Furnished,,2.0,1.0,,1250.0,2.0,0,2.0,4.0,,,,1.0,Covered,3200.0
2,1.060,surat,1175.0,New Property,Unfurnished,East,3.0,4.0,Freehold,,3.0,0,5.0,10.0,1.0,1.0,0.0,1.0,Covered,5236.0
3,1.600,gurgaon,1350.0,Resale,Semi-Furnished,East,3.0,3.0,Freehold,,3.0,0,2.0,4.0,,,,1.0,Covered,9877.0
4,0.270,gwalior,,New Property,Unfurnished,,2.0,1.0,,750.0,2.0,0,3.0,3.0,,,,,,3600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46903,0.695,hyderabad,1155.0,New Property,Unfurnished,East,2.0,1.0,Freehold,,2.0,0,4.0,9.0,1.0,0.0,0.0,,,6017.0
46904,0.160,surat,,Resale,Unfurnished,,1.0,,,660.0,1.0,0,3.0,5.0,,,,,,2424.0
46905,0.650,mohali,,Resale,Unfurnished,,4.0,,,2500.0,4.0,0,8.0,9.0,,,,,,2600.0
46906,0.550,nagpur,860.0,Resale,Unfurnished,East,2.0,2.0,Freehold,,2.0,0,2.0,3.0,1.0,0.0,0.0,,,5789.0


In [7]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46908 entries, 0 to 46907
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   amount                46908 non-null  float64
 1   location              46908 non-null  object 
 2   carpet_area           24693 non-null  float64
 3   transaction           46908 non-null  object 
 4   furnishing            46293 non-null  object 
 5   facing                28225 non-null  object 
 6   bathroom              46908 non-null  float64
 7   balcony               34723 non-null  float64
 8   ownership             29363 non-null  object 
 9   super_area            22215 non-null  float64
 10  num_bhk               46908 non-null  float64
 11  is_studio             46908 non-null  int64  
 12  floor_num             45293 non-null  float64
 13  num_floors            45261 non-null  float64
 14  overlooking_garden    26290 non-null  float64
 15  overlooking_mainroa

In [8]:
X_train = houses.drop(columns = "amount")
y_train = houses.amount.copy()

In [9]:
print(X_train.shape)
print(y_train.shape)

(46908, 19)
(46908,)


## 4. Transformation Operations

### 4.1 Imputation Pipeline

In [10]:
# Function to rename the columns before putting back into imputation pipeline
def prefix_remover(X, prefixes):

    prefix_list = [f"{prefix}__" for prefix in prefixes]
    new_cols = X.columns
    
    for prefix in prefix_list:
        new_cols = [col.replace(prefix,"") if col.startswith(prefix) else col for col in new_cols]

    return X.rename(
        columns = dict(zip(X.columns,new_cols))
    )

In [11]:
# imputer to impute bathroom values based on num_bhk
class GroupAggregateImputer(BaseEstimator, TransformerMixin):

    def __init__(self,variable,group_col,estimator,add_indicator = False):
        self.variable = variable
        self.group_col = group_col
        self.estimator = estimator
        self.add_indicator = add_indicator

    def fit(self,X,y = None):

        self.group_medians_ = {}
        self.group_modes_ = {}

        if self.estimator == "median":
                self.group_medians_[self.variable] =  X.groupby(self.group_col)[self.variable].median()
                
        elif self.estimator == "mode":
                self.group_modes_[self.variable] =  X.groupby(self.group_col)[self.variable].agg(lambda x: x.mode().iloc[0])
            
        return self

    def transform(self,X):
       X = X.copy()

       if self.add_indicator:
           X = X.assign(**{
               f"{self.variable}_missingindicator" : lambda df:(
                     np.where(
                         df[self.variable].isnull(),
                         1,0
                     )
                 )
           })

       if self.estimator == "median":
               mask = X[self.variable].isnull()
               X.loc[mask,self.variable] = X.loc[mask,self.group_col].map(self.group_medians_[self.variable])
               
       elif self.estimator == "mode":
               mask = X[self.variable].isnull()
               X.loc[mask,self.variable] = X.loc[mask,self.group_col].map(self.group_modes_[self.variable]) 

       
       return X

In [12]:
furnishing_imputer = ColumnTransformer(transformers = [
    ("furnishing_imputer",GroupAggregateImputer(variable = "furnishing",group_col = "transaction",estimator = "mode"),["furnishing","transaction"])
],remainder = "passthrough")

furnishing_imputation_pipeline = Pipeline(steps = [
    ("furnishing_imputer",furnishing_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["furnishing_imputer","remainder"]}))
])

In [13]:
floor_num_imputer = ColumnTransformer(transformers = [
    ("floor_num_imputer",SimpleImputer(strategy = "median"),["floor_num"])
],remainder = "passthrough")

floor_num_imputation_pipeline = Pipeline(steps = [
    ("floor_num_imputer",floor_num_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["floor_num_imputer","remainder"]}))
])

In [14]:
num_floors_imputer = ColumnTransformer(transformers = [
    ("num_floors_imputer",GroupAggregateImputer(variable = "num_floors",group_col = "floor_num",estimator = "median"),["num_floors","floor_num"])
],remainder = "passthrough")

num_floors_imputation_pipeline = Pipeline(steps = [
    ("num_floors_imputer",num_floors_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["num_floors_imputer","remainder"]}))
])

In [15]:
balcony_imputer = ColumnTransformer(transformers = [
    ("balcony_imputer", GroupAggregateImputer(variable = "balcony",group_col = "num_bhk",estimator = "median", add_indicator = True),["balcony","num_bhk"])
],remainder = "passthrough")

balcony_imputation_pipeline = Pipeline(steps = [
    ("balcony_imputer",balcony_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["balcony_imputer","remainder"]}))
])

In [16]:
ownership_imputer = ColumnTransformer(transformers = [
    ("ownership_imputer",SimpleImputer(strategy = 'most_frequent',add_indicator = True),["ownership"])
], remainder = "passthrough")


ownership_imputation_pipeline = Pipeline(steps = [
    ("ownership_imputer",ownership_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["ownership_imputer","remainder"]}))
])

In [17]:
facing_imputer = ColumnTransformer(transformers = [
    ("facing_imputer",SimpleImputer(strategy = 'constant',fill_value = 'Missing',add_indicator = True),["facing"])
], remainder = "passthrough")

facing_imputation_pipeline = Pipeline(steps = [
    ("facing_imputer",facing_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["facing_imputer","remainder"]}))
])

In [18]:
overlooking_garden_imputer = ColumnTransformer(transformers = [
    ("overlooking_garden_imputer",SimpleImputer(strategy = 'constant',fill_value = -1),["overlooking_garden"])
], remainder = "passthrough")

overlooking_garden_imputation_pipeline = Pipeline(steps = [
    ("overlooking_garden_imputer",overlooking_garden_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["overlooking_garden_imputer","remainder"]}))
])

In [19]:
overlooking_mainroad_imputer = ColumnTransformer(transformers = [
    ("overlooking_mainroad_imputer",SimpleImputer(strategy = 'constant',fill_value = -1),["overlooking_mainroad"])
], remainder = "passthrough")

overlooking_mainroad_imputation_pipeline = Pipeline(steps = [
    ("overlooking_mainroad_imputer",overlooking_mainroad_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["overlooking_mainroad_imputer","remainder"]}))
])

In [20]:
overlooking_pool_imputer = ColumnTransformer(transformers = [
    ("overlooking_pool_imputer",SimpleImputer(strategy = 'constant',fill_value = -1),["overlooking_pool"])
], remainder = "passthrough")

overlooking_pool_imputation_pipeline = Pipeline(steps = [
    ("overlooking_pool_imputer",overlooking_pool_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["overlooking_pool_imputer","remainder"]}))
])

In [21]:
parking_cover_imputer =  ColumnTransformer(transformers = [
    ("parking_cover_imputer",SimpleImputer(strategy = 'constant',fill_value = "No parking"),["parking_cover"])
],remainder = "passthrough")

parking_cover_imputation_pipeline = Pipeline(steps = [
    ("parking_cover_imputer",parking_cover_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["parking_cover_imputer","remainder"]}))
])

In [22]:
parking_spots_imputer =  ColumnTransformer(transformers = [
    ("parking_spots_imputer",SimpleImputer(strategy = 'constant',fill_value = 0),["parking_spots"])
],remainder = "passthrough")

parking_spots_imputation_pipeline = Pipeline(steps = [
    ("parking_spots_imputer",parking_spots_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["parking_spots_imputer","remainder"]}))
])

In [75]:
carpet_area_imputer =  ColumnTransformer(transformers = [
    ("carpet_area_imputer",SimpleImputer(strategy = 'constant',fill_value = -1),["carpet_area"])
],remainder = "passthrough")

carpet_area_imputation_pipeline = Pipeline(steps = [
    ("carpet_area_imputer",carpet_area_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["carpet_area_imputer","remainder"]}))
])

In [76]:
super_area_imputer =  ColumnTransformer(transformers = [
    ("super_area_imputer",SimpleImputer(strategy = 'constant',fill_value = -1),["super_area"])
],remainder = "passthrough")

super_area_imputation_pipeline = Pipeline(steps = [
    ("super_area_imputer",super_area_imputer),
    ("prefix_remover",FunctionTransformer(func = prefix_remover, kw_args = {"prefixes" : ["super_area_imputer","remainder"]}))
])

In [77]:
imputation_pipeline = Pipeline(steps = [
    ("furnishing_imputation_pipeline",furnishing_imputation_pipeline),
    ("floor_num_imputation_pipeline",floor_num_imputation_pipeline),
    ("num_floors_imputation_pipeline",num_floors_imputation_pipeline),
    ("balcony_imputation_pipeline",balcony_imputation_pipeline),
    ("ownership_imputation_pipeline",ownership_imputation_pipeline),
    ("facing_imputation_pipeline",facing_imputation_pipeline),
    ("overlooking_garden_imputation_pipeline",overlooking_garden_imputation_pipeline),
    ("overlooking_mainroad_imputation_pipeline",overlooking_mainroad_imputation_pipeline),
    ("overlooking_pool_imputation_pipeline",overlooking_pool_imputation_pipeline),
    ("parking_cover_imputation_pipeline",parking_cover_imputation_pipeline),
    ("parking_spots_imputation_pipeline",parking_spots_imputation_pipeline),
    ("carpet_area_imputation_pipeline",carpet_area_imputation_pipeline),
    ("super_area_imputation_pipeline",super_area_imputation_pipeline)
])

imputed_df = imputation_pipeline.fit_transform(X_train)

In [78]:
imputed_df

Unnamed: 0,super_area,carpet_area,parking_spots,parking_cover,overlooking_pool,overlooking_mainroad,overlooking_garden,facing,missingindicator_facing,ownership,missingindicator_ownership,balcony,num_bhk,balcony_missingindicator,num_floors,floor_num,furnishing,transaction,location,bathroom,is_studio,price
0,-1.0,5500.0,0.0,No parking,1.0,1.0,1.0,North - East,False,Leasehold,False,6.0,6.0,0,8.0,8.0,Unfurnished,Resale,greater-noida,5.0,0,2923.0
1,1250.0,-1.0,1.0,Covered,-1.0,-1.0,-1.0,Missing,True,Freehold,True,1.0,2.0,0,4.0,2.0,Semi-Furnished,Resale,mangalore,2.0,0,3200.0
2,-1.0,1175.0,1.0,Covered,0.0,1.0,1.0,East,False,Freehold,False,4.0,3.0,0,10.0,5.0,Unfurnished,New Property,surat,3.0,0,5236.0
3,-1.0,1350.0,1.0,Covered,-1.0,-1.0,-1.0,East,False,Freehold,False,3.0,3.0,0,4.0,2.0,Semi-Furnished,Resale,gurgaon,3.0,0,9877.0
4,750.0,-1.0,0.0,No parking,-1.0,-1.0,-1.0,Missing,True,Freehold,True,1.0,2.0,0,3.0,3.0,Unfurnished,New Property,gwalior,2.0,0,3600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46903,-1.0,1155.0,0.0,No parking,0.0,0.0,1.0,East,False,Freehold,False,1.0,2.0,0,9.0,4.0,Unfurnished,New Property,hyderabad,2.0,0,6017.0
46904,660.0,-1.0,0.0,No parking,-1.0,-1.0,-1.0,Missing,True,Freehold,True,1.0,1.0,1,5.0,3.0,Unfurnished,Resale,surat,1.0,0,2424.0
46905,2500.0,-1.0,0.0,No parking,-1.0,-1.0,-1.0,Missing,True,Freehold,True,2.0,4.0,1,9.0,8.0,Unfurnished,Resale,mohali,4.0,0,2600.0
46906,-1.0,860.0,0.0,No parking,0.0,0.0,1.0,East,False,Freehold,False,2.0,2.0,0,3.0,2.0,Unfurnished,Resale,nagpur,2.0,0,5789.0


### 4.2 Column Transformers

In [25]:
transaction_transformer = Pipeline(steps = [
    ("grouper",RareLabelEncoder(tol = 0.1, n_categories = 2, replace_with = "Resale")),
    ("encoder",OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'))
])

In [26]:
def house_size_binner(X):
    columns = X.columns.to_list()

    return (
        X.assign(
               house_size = lambda df: (
                np.select(
                [
                    df.num_bhk.between(1,3,inclusive = "left"),
                    df.num_bhk.between(3,4,inclusive = "left")
                ],
                ["small","normal"],
                default = "big"
               )
            ) 
        )
        .drop(columns = columns)
    )

In [27]:
num_bhk_pipe1 = Pipeline(steps = [
    ("scaler",MinMaxScaler())
])

num_bhk_pipe2 = Pipeline(steps = [
    ("house_size_binner",FunctionTransformer(func = house_size_binner)),
    ("encoder",OrdinalEncoder(categories = [["small","normal","big"]]))
])

num_bhk_transformer = FeatureUnion(transformer_list = [
    ("num_bhk_pipe1",num_bhk_pipe1),
    ("num_bhk_pipe2",num_bhk_pipe2)
])

In [28]:
def bathroom_num_binner(X):
    columns = X.columns.to_list()

    return (
        X.assign(
               bathroom_num = lambda df: (
                np.select(
                [
                    df.bathroom.between(1,3,inclusive = "left"),
                    df.bathroom.between(3,4,inclusive = "left")
                ],
                ["low","medium"],
                default = "high"
              )
            ) 
        )
        .drop(columns = columns)
    )

In [29]:
bathroom_pipe1 = Pipeline(steps = [
    ("scaler",MinMaxScaler())
])

bathroom_pipe2 = Pipeline(steps = [
    ("bathroom_num_binner",FunctionTransformer(func = bathroom_num_binner)),
    ("encoder",OrdinalEncoder(categories = [["low","medium","high"]]))
])

bathroom_transformer = FeatureUnion(transformer_list = [
    ("bathroom_pipe1",bathroom_pipe1),
    ("bathroom_pipe2",bathroom_pipe2)
])

In [30]:
def bathroom_num_binner(X):
    columns = X.columns.to_list()

    return (
        X.assign(
               bathroom_num = lambda df: (
                np.select(
                [
                    df.bathroom.between(1,3,inclusive = "left"),
                    df.bathroom.between(3,4,inclusive = "left")
                ],
                ["low","medium"],
                default = "high"
              )
            ) 
        )
        .drop(columns = columns)
    )

In [31]:
furnishing_pipe1 = Pipeline(steps = [
    ("encoder",OrdinalEncoder(categories = [["Unfurnished","Semi-Furnished","Furnished"]])),
])

furnishing_pipe2 = Pipeline(steps = [
    ("is_unfurnished",FunctionTransformer(func = lambda x: np.where(x == 'Unfurnished',1,0)))
])

furnishing_transformer = FeatureUnion(transformer_list = [
    ("furnishing_pipe1",furnishing_pipe1),
    ("furnishing_pipe2",furnishing_pipe2)
])

In [32]:
def floor_height_binner(X):

    columns = X.columns.to_list()

    return (
        X.assign(
            floor_height = lambda df:(
                    np.select(
                        [
                            (df.floor_num.between(0,3, inclusive = "left")),
                            (df.floor_num.between(3,6, inclusive = "left"))
                        ],
                        ["low","medium"],
                        default = "high"
                    )
            )
        )
        .drop(columns = columns)
    )

In [33]:
def building_height_binner(X):

    columns = X.columns.to_list()

    return (
        X.assign(
            building_height = lambda df:(
                np.select(
                [
                    (df.num_floors.between(0,5, inclusive = "left")),
                    (df.num_floors.between(5,13, inclusive = "left"))
                ],
                ["short","medium"],
                default = "tall"
            )
            )
        )
        .drop(columns = columns)
    )

In [34]:
floor_num_pipe1 = Pipeline(steps = [
    ("scaler",StandardScaler())
])

floor_num_pipe2 = Pipeline(steps = [
    ("floor_height_binner",FunctionTransformer(func = floor_height_binner)),
    ("encoder",OrdinalEncoder(categories = [["low","medium","high"]])),
])

floor_num_transformer = FeatureUnion(transformer_list = [
    ("floor_num_pipe1",floor_num_pipe1),
    ("floor_num_pipe2",floor_num_pipe2)
])

In [35]:
num_floors_pipe1 = Pipeline(steps = [
    ("scaler",StandardScaler())
])

num_floors_pipe2 = Pipeline(steps = [
    ("building_height_binner",FunctionTransformer(func = building_height_binner)),
    ("encoder",OrdinalEncoder(categories = [["short","medium","tall"]]))
])

num_floors_transformer = FeatureUnion(transformer_list = [
    ("num_floors_pipe1",num_floors_pipe1),
    ("num_floors_pipe2",num_floors_pipe2)
])

In [36]:
def city_binner(X):
    
     columns = X.columns.to_list()

     return (
        X.assign(
            city_tier = lambda df:(
                np.where(
                    df.location.isin(["mumbai","gurgaon","new-delhi"]),
                    1,
                    0
                )
            )
        )
        .drop(columns = columns)
    )

In [37]:
location_pipe1 = Pipeline(steps = [
    ("target_encoder", TargetEncoder())
])

location_pipe2 = Pipeline(steps = [
    ("city_binner",FunctionTransformer(func = city_binner))
])

location_transformer = FeatureUnion(transformer_list = [
    ("location_pipe1",location_pipe1),
    ("location_pipe2",location_pipe2)
])


In [38]:
def price_binner(X):

    columns = X.columns.to_list()

    return (
        X.assign(
               price_range = lambda df:(
                np.select(
                    [
                        df.price.between(0,4000,inclusive = "left"),
                        df.price.between(4000,6000,inclusive = "left")
                    ],
                    ["low","medium"],
                    default = "high"
                )
            )
        )
        .drop(columns = columns)
    )

In [39]:
price_pipe1 = Pipeline(steps = [
    ("log_transformer",FunctionTransformer(func =  lambda x: np.log(x))),
    ("scaler",StandardScaler())
])

price_pipe2 = Pipeline(steps = [
    ("price_binner",FunctionTransformer(func = price_binner)),
    ("encoder",OrdinalEncoder(categories = [["low","medium","high"]]))
])

price_transformer = FeatureUnion(transformer_list = [
    ("price_pipe1",price_pipe1),
    ("price_pipe2",price_pipe2)
])

In [40]:
balcony_transformer = Pipeline(steps = [
    ("nearest_integer",FunctionTransformer(func = lambda x : np.round(x))),
    ("scaler",MinMaxScaler())
])

In [41]:
ownership_transformer = Pipeline(steps = [
    ("encoder",OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'))
])

# ownership_transformer.fit_transform(X_train[["ownership"]].dropna())

In [42]:
missingindicator_ownership_transformer = Pipeline(steps = [
    ("encoder",OneHotEncoder(drop = 'first',sparse_output = False,handle_unknown = 'ignore'))
])

In [43]:
def direction_binner(X):

     columns = X.columns.to_list()

     return (
        X.assign(
            direction_tier = lambda df:(
                np.where(
                    df.facing.isin(["North - East","North - West"]),
                    1,
                    0
                )
            )
        )
        .drop(columns = columns)
     )

In [44]:
facing_pipe1 = Pipeline(steps = [
    ("encoder",OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'))
])

facing_pipe2 = Pipeline(steps = [
    ("direction_binner",FunctionTransformer(func = direction_binner))
])

facing_transformer = FeatureUnion(transformer_list = [
    ("facing_pipe1",facing_pipe1),
    ("facing_pipe2",facing_pipe2)
])

In [45]:
missingindicator_facing_transformer = Pipeline(steps = [
    ("encoder",OneHotEncoder(drop = 'first',sparse_output = False,handle_unknown = 'ignore'))
])

In [63]:
overlooking_garden_transformer = Pipeline(steps = [
    ("encoder",OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'))
])

overlooking_mainroad_transformer = Pipeline(steps = [
    ("encoder",OneHotEncoder(categories=[[ -1, 0, 1 ]], drop=[-1], sparse_output=False, handle_unknown='ignore'))
])

overlooking_pool_transformer = Pipeline(steps = [
    ("encoder",OneHotEncoder(categories=[[ -1, 0, 1 ]], drop=[-1], sparse_output=False, handle_unknown='ignore'))
])

In [47]:
parking_cover_transformer = Pipeline(steps = [
    ("encoder",OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'))
])

In [48]:
def has_parking(X):

    columns = X.columns.to_list()

    return (
        X.assign(
            has_parking = lambda df:(
                np.select(
                            [
                                df.parking_spots.eq(0),
                                df.parking_spots.eq(1)
                            ],
                            ["no parking","single"],
                            default = "multiple"
                        )
            )
        )
        .drop(columns = columns)
     )

In [70]:
parking_spots_transformer = Pipeline(steps = [
    ("has_parking",FunctionTransformer(func = has_parking)),
    ("encoder",OneHotEncoder(categories = [["multiple","single","no parking"]],drop = [["no parking"]],sparse_output = False,handle_unknown = 'ignore'))
])

In [145]:
def effective_cost(X):

    columns = X.columns.to_list()

    return(
        X
        .assign(
            effective_cost = lambda df:(
                np.where(
                    df.carpet_area.eq(-1),
                    df.super_area*df.price/1e7,
                    df.carpet_area*df.price/1e7
                )
            ),
            carpet_areamissing = lambda df:(
                np.where(
                    df.carpet_area.eq(-1),
                    1,
                    0
                )
            ),
            super_areamissing = lambda df:(
                np.where(
                    df.super_area.eq(-1),
                    1,
                    0
                )
            )
        )
        .drop(columns = columns)
    )

In [146]:
scaler_pipeline = Pipeline(steps = [
    ("log_transformer",FunctionTransformer(func = lambda x: np.log(x))),
    ("scaler",StandardScaler())
])

area_transformer = Pipeline(steps = [
    ("effective_cost",FunctionTransformer(func = effective_cost)),
    ("scaler_pipeline",ColumnTransformer(transformers = [
        ("scaler_pipeline",scaler_pipeline,["effective_cost"])
    ],remainder = "passthrough")),
])

In [149]:
column_transformer = ColumnTransformer(transformers = [
    ("transaction_transformer",transaction_transformer,["transaction"]),
    ("num_bhk_transformer",num_bhk_transformer,["num_bhk"]),
    ("bathroom_transformer",bathroom_transformer,["bathroom"]),
    ("furnishing_transformer",furnishing_transformer,["furnishing"]),
    ("floor_num_transformer",floor_num_transformer,["floor_num"]),
    ("num_floors_transformer",num_floors_transformer,["num_floors"]),
    ("location_transformer",location_transformer,["location"]),
    ("price_transformer",price_transformer,["price"]),
    ("balcony_transformer",balcony_transformer,["balcony"]),
    ("ownership_transformer",ownership_transformer,["ownership"]),
    ("missingindicator_ownership_transformer",missingindicator_ownership_transformer,["missingindicator_ownership"]),
    ("facing_transformer",facing_transformer,["facing"]),
    ("missingindicator_facing_transformer",missingindicator_ownership_transformer,["missingindicator_facing"]),
    ("overlooking_garden_transformer",overlooking_garden_transformer,["overlooking_garden"]),
    ("overlooking_mainroad_transformer",overlooking_mainroad_transformer,["overlooking_mainroad"]),
    ("overlooking_pool_transformer",overlooking_pool_transformer,["overlooking_pool"]),
    ("parking_cover_transformer",parking_cover_transformer,["parking_cover"]),
    ("parking_spots_transformer",parking_spots_transformer,["parking_spots"]),
    ("area_transformer",area_transformer,["carpet_area","super_area","price"])
],remainder = 'passthrough')

### 4.3 Final Preprocessor

In [150]:
feature_preprocessor = Pipeline(steps = [
    ("imputation_pipeline",imputation_pipeline),
    ("column_transformer",column_transformer)
])

preprocessed = feature_preprocessor.fit_transform(X_train,y_train)

In [151]:
preprocessed

Unnamed: 0,transaction_transformer__transaction_New Property,transaction_transformer__transaction_Resale,num_bhk_transformer__num_bhk,num_bhk_transformer__house_size,bathroom_transformer__bathroom,bathroom_transformer__bathroom_num,furnishing_transformer__0,furnishing_transformer__1,floor_num_transformer__floor_num,floor_num_transformer__floor_height,num_floors_transformer__num_floors,num_floors_transformer__building_height,location_transformer__location,location_transformer__city_tier,price_transformer__price,price_transformer__price_range,balcony_transformer__balcony,ownership_transformer__ownership_Co-operative Society,ownership_transformer__ownership_Freehold,ownership_transformer__ownership_Leasehold,ownership_transformer__ownership_Power Of Attorney,missingindicator_ownership_transformer__missingindicator_ownership_True,facing_transformer__facing_East,facing_transformer__facing_Missing,facing_transformer__facing_North,facing_transformer__facing_North - East,facing_transformer__facing_North - West,facing_transformer__facing_South,facing_transformer__facing_South - East,facing_transformer__facing_South -West,facing_transformer__facing_West,facing_transformer__direction_tier,missingindicator_facing_transformer__missingindicator_facing_True,overlooking_garden_transformer__overlooking_garden_-1.0,overlooking_garden_transformer__overlooking_garden_0.0,overlooking_garden_transformer__overlooking_garden_1.0,overlooking_mainroad_transformer__overlooking_mainroad_0.0,overlooking_mainroad_transformer__overlooking_mainroad_1.0,overlooking_pool_transformer__overlooking_pool_0.0,overlooking_pool_transformer__overlooking_pool_1.0,parking_cover_transformer__parking_cover_Covered,parking_cover_transformer__parking_cover_No parking,parking_cover_transformer__parking_cover_Open,parking_spots_transformer__has_parking_multiple,parking_spots_transformer__has_parking_single,area_transformer__scaler_pipeline__effective_cost,area_transformer__remainder__carpet_areamissing,area_transformer__remainder__super_areamissing,remainder__balcony_missingindicator,remainder__is_studio
0,0.0,1.0,0.555556,2.0,0.444444,2.0,0.0,1.0,1.053182,2.0,0.009999,1.0,0.723307,0,-1.321841,0.0,0.6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.725599,0,1,0,0
1,0.0,1.0,0.111111,0.0,0.111111,0.0,1.0,0.0,-0.544119,0.0,-0.681524,0.0,0.604335,0,-1.075715,0.0,0.1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.453649,1,0,0,0
2,1.0,0.0,0.222222,1.0,0.222222,1.0,0.0,1.0,0.254531,1.0,0.355760,1.0,0.838034,0,0.262850,1.0,0.4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.220824,0,1,0,0
3,0.0,1.0,0.222222,1.0,0.222222,1.0,1.0,0.0,-0.544119,0.0,-0.681524,0.0,1.525279,1,1.988094,2.0,0.3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.432573,0,1,0,0
4,1.0,0.0,0.111111,0.0,0.111111,0.0,0.0,1.0,-0.277902,1.0,-0.854405,0.0,0.414034,0,-0.755532,0.0,0.1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.069392,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46903,1.0,0.0,0.111111,0.0,0.111111,0.0,0.0,1.0,-0.011685,1.0,0.182880,1.0,0.951988,0,0.640794,2.0,0.1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.411735,0,1,0,0
46904,0.0,1.0,0.000000,0.0,0.000000,0.0,0.0,1.0,-0.277902,1.0,-0.508643,1.0,0.838034,0,-1.830705,0.0,0.1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.889272,1,0,1,0
46905,0.0,1.0,0.333333,2.0,0.333333,2.0,0.0,1.0,1.053182,2.0,0.182880,1.0,0.747556,0,-1.640165,0.0,0.2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.306950,1,0,1,0
46906,0.0,1.0,0.111111,0.0,0.111111,0.0,0.0,1.0,-0.544119,0.0,-0.854405,0.0,0.602052,0,0.535784,1.0,0.2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.110810,0,1,0,0


## 5. Feature Selection

In [157]:
feature_selector = SelectBySingleFeaturePerformance(estimator = RandomForestRegressor(),scoring = 'r2')

In [158]:
feature_engineering_pipeline = Pipeline(steps = [
    ("feature_preprocessor",feature_preprocessor),
    ("feature_selector",feature_selector)
])

In [160]:
feature_engineering_pipeline.fit_transform(X_train,y_train)

Unnamed: 0,num_bhk_transformer__num_bhk,num_bhk_transformer__house_size,bathroom_transformer__bathroom,bathroom_transformer__bathroom_num,location_transformer__location,price_transformer__price,price_transformer__price_range,balcony_transformer__balcony,parking_spots_transformer__has_parking_multiple,area_transformer__scaler_pipeline__effective_cost
0,0.555556,2.0,0.444444,2.0,0.722517,-1.321841,0.0,0.6,0.0,1.725599
1,0.111111,0.0,0.111111,0.0,0.603057,-1.075715,0.0,0.1,0.0,-0.453649
2,0.222222,1.0,0.222222,1.0,0.818076,0.262850,1.0,0.4,0.0,0.220824
3,0.222222,1.0,0.222222,1.0,1.528566,1.988094,2.0,0.3,0.0,1.432573
4,0.111111,0.0,0.111111,0.0,0.433907,-0.755532,0.0,0.1,0.0,-1.069392
...,...,...,...,...,...,...,...,...,...,...
46903,0.111111,0.0,0.111111,0.0,0.961026,0.640794,2.0,0.1,0.0,0.411735
46904,0.000000,0.0,0.000000,0.0,0.819129,-1.830705,0.0,0.1,0.0,-1.889272
46905,0.333333,2.0,0.333333,2.0,0.745751,-1.640165,0.0,0.2,0.0,0.306950
46906,0.111111,0.0,0.111111,0.0,0.571197,0.535784,1.0,0.2,0.0,-0.110810
