## 1. Importing Libraries

In [2]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

import sklearn

from sklearn.ensemble import RandomForestRegressor

from sklearn.base import (
    BaseEstimator,
    TransformerMixin
)

from sklearn.impute import (
    SimpleImputer
)

from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    TargetEncoder,
    StandardScaler,
    MinMaxScaler,
    FunctionTransformer
)

from sklearn.compose import (
    ColumnTransformer
)

from sklearn.pipeline import (
    Pipeline,
    FeatureUnion
)

from feature_engine.encoding import (
    RareLabelEncoder
)

from feature_engine.selection import (
    SelectBySingleFeaturePerformance,
    SmartCorrelatedSelection
)

## 2. Display Settings

In [4]:
# Render every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Ask scikit-learn transformers to return pandas DataFrames.
sklearn.set_config(transform_output="pandas")

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

## 3. Reading the Data

In [6]:
# Project root. Set the HOUSE_PRICE_PROJECT_DIR environment variable to point at
# your own checkout; when it is unset the original author's local path is used.
PROJECT_DIR = Path(
    os.environ.get(
        "HOUSE_PRICE_PROJECT_DIR",
        r"F:\Rishabh\House-Price-Prediction-MLOps-Project",
    )
)

# Location of the training CSV files, relative to PROJECT_DIR.
DATA_DIR = "data/train"

In [7]:
def read_data(name):
    """Load ``<name>.csv`` from ``PROJECT_DIR / DATA_DIR`` into a DataFrame.

    Parameters
    ----------
    name : str
        File name without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on that file.
    """
    csv_path = PROJECT_DIR / DATA_DIR / f"{name}.csv"
    return pd.read_csv(csv_path)

In [8]:
# Load train.csv from the project's data directory.
houses = read_data("train")

In [9]:
# Display the loaded frame (pandas elides the middle rows in the rendering).
houses

Unnamed: 0,location,carpet_area,transaction,furnishing,facing,bathroom,balcony,ownership,super_area,num_bhk,is_studio,floor_num,num_floors,overlooking_garden,overlooking_mainroad,overlooking_pool,parking_spots,parking_cover,amount
0,bhiwadi,700.0,Resale,Semi-Furnished,North,2.0,2.0,Leasehold,,2.0,0,5.0,15.0,1.0,1.0,0.0,,,0.110
1,sonipat,1000.0,Resale,Unfurnished,North,2.0,2.0,Freehold,,3.0,0,2.0,3.0,1.0,1.0,1.0,1.0,Open,0.255
2,greater-noida,675.0,Resale,Unfurnished,East,2.0,3.0,Leasehold,,2.0,0,8.0,25.0,1.0,1.0,1.0,,,0.650
3,chennai,,Resale,Semi-Furnished,East,3.0,2.0,Freehold,1585.0,3.0,0,1.0,2.0,,,,,,0.870
4,chandigarh,1200.0,New Property,Semi-Furnished,East,3.0,3.0,Freehold,,3.0,0,6.0,12.0,1.0,1.0,0.0,1.0,Covered,1.370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42124,ahmedabad,765.0,Resale,Unfurnished,East,1.0,2.0,Freehold,,2.0,0,4.0,4.0,0.0,1.0,0.0,,,0.420
42125,mysore,,Resale,Semi-Furnished,East,2.0,1.0,Freehold,995.0,2.0,0,2.0,4.0,0.0,1.0,0.0,1.0,Covered,0.600
42126,vadodara,,Resale,Semi-Furnished,,3.0,,,1200.0,3.0,0,3.0,5.0,,,,,,0.420
42127,visakhapatnam,,Resale,Unfurnished,North,2.0,,Freehold,1050.0,2.0,0,5.0,5.0,0.0,1.0,0.0,,,0.750


In [10]:
# Column dtypes and non-null counts; shows which features need imputation.
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42129 entries, 0 to 42128
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   location              42129 non-null  object 
 1   carpet_area           22323 non-null  float64
 2   transaction           42129 non-null  object 
 3   furnishing            41594 non-null  object 
 4   facing                25409 non-null  object 
 5   bathroom              42129 non-null  float64
 6   balcony               31289 non-null  float64
 7   ownership             26442 non-null  object 
 8   super_area            19806 non-null  float64
 9   num_bhk               42129 non-null  float64
 10  is_studio             42129 non-null  int64  
 11  floor_num             40709 non-null  float64
 12  num_floors            40683 non-null  float64
 13  overlooking_garden    23679 non-null  float64
 14  overlooking_mainroad  23679 non-null  float64
 15  overlooking_pool   

In [11]:
# Separate the target (`amount`) from the feature columns.
y_train = houses["amount"].copy()
X_train = houses.drop(columns=["amount"])

In [12]:
# Sanity check: features and target must have the same number of rows.
print(X_train.shape)
print(y_train.shape)

(42129, 18)
(42129,)


## 4. Transformation Operations

### 4.1 Imputation Pipeline

In [15]:
# Strips the "<transformer>__" prefixes that ColumnTransformer adds to column names,
# so the next imputation step can address columns by their original names.
def prefix_remover(X, prefixes):
    """Return a copy of ``X`` with leading ``"<prefix>__"`` markers removed from column names.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column labels may start with one of the prefixes.
    prefixes : iterable of str
        Transformer names, without the trailing double underscore.

    Returns
    -------
    pandas.DataFrame
        ``X`` renamed; the input frame itself is not modified.

    Notes
    -----
    Prefixes are tried in the given order, so a label carrying several of them
    in sequence (``"a__b__col"`` with prefixes ``["a", "b"]``) is reduced to
    ``"col"``. Only a *leading* occurrence is removed; the same text appearing
    later inside a label is kept. Non-string labels are left unchanged.
    """
    markers = [f"{prefix}__" for prefix in prefixes]

    def _strip(label):
        # Non-string labels have no prefix to remove.
        if not isinstance(label, str):
            return label
        for marker in markers:
            if label.startswith(marker):
                # Slice instead of str.replace so inner occurrences survive.
                label = label[len(marker):]
        return label

    return X.rename(columns={label: _strip(label) for label in X.columns})

In [16]:
# Imputer that fills missing values of one column with a per-group statistic,
# where the groups are defined by a second column.
class GroupAggregateImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of ``variable`` with the median or mode of its ``group_col`` group.

    Parameters
    ----------
    variable : str
        Column whose missing values are filled.
    group_col : str
        Column whose values define the groups.
    estimator : {"median", "mode"}
        Statistic computed per group during ``fit``.
    add_indicator : bool, default False
        When True, ``transform`` appends a ``"<variable>_missingindicator"``
        column holding 1 where ``variable`` was missing and 0 elsewhere.

    Attributes
    ----------
    group_medians_ : dict
        ``{variable: Series}`` of per-group medians (filled when estimator is "median").
    group_modes_ : dict
        ``{variable: Series}`` of per-group modes (filled when estimator is "mode").

    Notes
    -----
    Rows whose group was not seen during ``fit``, or whose group had no
    non-missing value to aggregate, remain missing after ``transform``.
    """

    _SUPPORTED_ESTIMATORS = ("median", "mode")

    def __init__(self,variable,group_col,estimator,add_indicator = False):
        self.variable = variable
        self.group_col = group_col
        self.estimator = estimator
        self.add_indicator = add_indicator

    @staticmethod
    def _first_mode(values):
        """Return the first mode of ``values``, or NaN when there is none."""
        modes = values.mode()
        # mode() is empty for an all-missing group; indexing it would raise.
        return modes.iloc[0] if not modes.empty else np.nan

    def fit(self,X,y = None):
        """Learn the per-group statistic from ``X``; ``y`` is ignored.

        Raises
        ------
        ValueError
            If ``estimator`` is neither "median" nor "mode".
        """
        if self.estimator not in self._SUPPORTED_ESTIMATORS:
            raise ValueError(
                f"estimator must be one of {self._SUPPORTED_ESTIMATORS}, got {self.estimator!r}"
            )

        self.group_medians_ = {}
        self.group_modes_ = {}

        grouped = X.groupby(self.group_col)[self.variable]

        if self.estimator == "median":
            self.group_medians_[self.variable] = grouped.median()
        else:
            self.group_modes_[self.variable] = grouped.agg(self._first_mode)

        return self

    def transform(self,X):
        """Return a copy of ``X`` with missing ``variable`` values filled per group."""
        X = X.copy()

        # Capture missingness before filling so the indicator reflects the input.
        missing = X[self.variable].isnull()

        if self.add_indicator:
            X[f"{self.variable}_missingindicator"] = np.where(missing, 1, 0)

        if self.estimator == "median":
            lookup = self.group_medians_[self.variable]
        elif self.estimator == "mode":
            lookup = self.group_modes_[self.variable]
        else:
            # Unreachable after a successful fit; keep the frame untouched.
            return X

        X.loc[missing, self.variable] = X.loc[missing, self.group_col].map(lookup)

        return X

In [17]:
# Fill missing `furnishing` with the most frequent value of its `transaction` group.
furnishing_imputer = ColumnTransformer(
    transformers=[
        (
            "furnishing_imputer",
            GroupAggregateImputer(variable="furnishing", group_col="transaction", estimator="mode"),
            ["furnishing", "transaction"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
furnishing_imputation_pipeline = Pipeline(
    steps=[
        ("furnishing_imputer", furnishing_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["furnishing_imputer", "remainder"]},
            ),
        ),
    ]
)

In [18]:
# Fill missing `floor_num` with the column median.
floor_num_imputer = ColumnTransformer(
    transformers=[
        ("floor_num_imputer", SimpleImputer(strategy="median"), ["floor_num"]),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
floor_num_imputation_pipeline = Pipeline(
    steps=[
        ("floor_num_imputer", floor_num_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["floor_num_imputer", "remainder"]},
            ),
        ),
    ]
)

In [19]:
# Fill missing `num_floors` with the median of rows sharing the same `floor_num`.
num_floors_imputer = ColumnTransformer(
    transformers=[
        (
            "num_floors_imputer",
            GroupAggregateImputer(variable="num_floors", group_col="floor_num", estimator="median"),
            ["num_floors", "floor_num"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
num_floors_imputation_pipeline = Pipeline(
    steps=[
        ("num_floors_imputer", num_floors_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["num_floors_imputer", "remainder"]},
            ),
        ),
    ]
)

In [20]:
# Fill missing `balcony` with the median of rows sharing the same `num_bhk`,
# and add a `balcony_missingindicator` column.
balcony_imputer = ColumnTransformer(
    transformers=[
        (
            "balcony_imputer",
            GroupAggregateImputer(
                variable="balcony",
                group_col="num_bhk",
                estimator="median",
                add_indicator=True,
            ),
            ["balcony", "num_bhk"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
balcony_imputation_pipeline = Pipeline(
    steps=[
        ("balcony_imputer", balcony_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["balcony_imputer", "remainder"]},
            ),
        ),
    ]
)

In [21]:
# Fill missing `ownership` with its most frequent value and add a missing-indicator column.
ownership_imputer = ColumnTransformer(
    transformers=[
        (
            "ownership_imputer",
            SimpleImputer(strategy="most_frequent", add_indicator=True),
            ["ownership"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
ownership_imputation_pipeline = Pipeline(
    steps=[
        ("ownership_imputer", ownership_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["ownership_imputer", "remainder"]},
            ),
        ),
    ]
)

In [22]:
# Replace missing `facing` with the literal category 'Missing' and add a missing-indicator column.
facing_imputer = ColumnTransformer(
    transformers=[
        (
            "facing_imputer",
            SimpleImputer(strategy="constant", fill_value="Missing", add_indicator=True),
            ["facing"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
facing_imputation_pipeline = Pipeline(
    steps=[
        ("facing_imputer", facing_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["facing_imputer", "remainder"]},
            ),
        ),
    ]
)

In [23]:
# Mark missing `overlooking_garden` with the sentinel value -1.
overlooking_garden_imputer = ColumnTransformer(
    transformers=[
        (
            "overlooking_garden_imputer",
            SimpleImputer(strategy="constant", fill_value=-1),
            ["overlooking_garden"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
overlooking_garden_imputation_pipeline = Pipeline(
    steps=[
        ("overlooking_garden_imputer", overlooking_garden_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["overlooking_garden_imputer", "remainder"]},
            ),
        ),
    ]
)

In [24]:
# Mark missing `overlooking_mainroad` with the sentinel value -1.
overlooking_mainroad_imputer = ColumnTransformer(
    transformers=[
        (
            "overlooking_mainroad_imputer",
            SimpleImputer(strategy="constant", fill_value=-1),
            ["overlooking_mainroad"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
overlooking_mainroad_imputation_pipeline = Pipeline(
    steps=[
        ("overlooking_mainroad_imputer", overlooking_mainroad_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["overlooking_mainroad_imputer", "remainder"]},
            ),
        ),
    ]
)

In [25]:
# Mark missing `overlooking_pool` with the sentinel value -1.
overlooking_pool_imputer = ColumnTransformer(
    transformers=[
        (
            "overlooking_pool_imputer",
            SimpleImputer(strategy="constant", fill_value=-1),
            ["overlooking_pool"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
overlooking_pool_imputation_pipeline = Pipeline(
    steps=[
        ("overlooking_pool_imputer", overlooking_pool_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["overlooking_pool_imputer", "remainder"]},
            ),
        ),
    ]
)

In [26]:
# Replace missing `parking_cover` with the literal category "No parking".
parking_cover_imputer = ColumnTransformer(
    transformers=[
        (
            "parking_cover_imputer",
            SimpleImputer(strategy="constant", fill_value="No parking"),
            ["parking_cover"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
parking_cover_imputation_pipeline = Pipeline(
    steps=[
        ("parking_cover_imputer", parking_cover_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["parking_cover_imputer", "remainder"]},
            ),
        ),
    ]
)

In [27]:
# Replace missing `parking_spots` with 0.
parking_spots_imputer = ColumnTransformer(
    transformers=[
        (
            "parking_spots_imputer",
            SimpleImputer(strategy="constant", fill_value=0),
            ["parking_spots"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
parking_spots_imputation_pipeline = Pipeline(
    steps=[
        ("parking_spots_imputer", parking_spots_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["parking_spots_imputer", "remainder"]},
            ),
        ),
    ]
)

In [28]:
# Mark missing `carpet_area` with the sentinel value -1 (checked later via `.eq(-1)`).
carpet_area_imputer = ColumnTransformer(
    transformers=[
        (
            "carpet_area_imputer",
            SimpleImputer(strategy="constant", fill_value=-1),
            ["carpet_area"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
carpet_area_imputation_pipeline = Pipeline(
    steps=[
        ("carpet_area_imputer", carpet_area_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["carpet_area_imputer", "remainder"]},
            ),
        ),
    ]
)

In [29]:
# Mark missing `super_area` with the sentinel value -1 (checked later via `.eq(-1)`).
super_area_imputer = ColumnTransformer(
    transformers=[
        (
            "super_area_imputer",
            SimpleImputer(strategy="constant", fill_value=-1),
            ["super_area"],
        ),
    ],
    remainder="passthrough",
)

# Strip the ColumnTransformer name prefixes so later steps see the original column names.
super_area_imputation_pipeline = Pipeline(
    steps=[
        ("super_area_imputer", super_area_imputer),
        (
            "prefix_remover",
            FunctionTransformer(
                func=prefix_remover,
                kw_args={"prefixes": ["super_area_imputer", "remainder"]},
            ),
        ),
    ]
)

In [30]:
# Chain the per-column imputers. Order matters: `floor_num` is imputed before
# `num_floors`, because the latter groups on `floor_num`.
imputation_pipeline = Pipeline(
    steps=[
        ("furnishing_imputation_pipeline", furnishing_imputation_pipeline),
        ("floor_num_imputation_pipeline", floor_num_imputation_pipeline),
        ("num_floors_imputation_pipeline", num_floors_imputation_pipeline),
        ("balcony_imputation_pipeline", balcony_imputation_pipeline),
        ("ownership_imputation_pipeline", ownership_imputation_pipeline),
        ("facing_imputation_pipeline", facing_imputation_pipeline),
        ("overlooking_garden_imputation_pipeline", overlooking_garden_imputation_pipeline),
        ("overlooking_mainroad_imputation_pipeline", overlooking_mainroad_imputation_pipeline),
        ("overlooking_pool_imputation_pipeline", overlooking_pool_imputation_pipeline),
        ("parking_cover_imputation_pipeline", parking_cover_imputation_pipeline),
        ("parking_spots_imputation_pipeline", parking_spots_imputation_pipeline),
        ("carpet_area_imputation_pipeline", carpet_area_imputation_pipeline),
        ("super_area_imputation_pipeline", super_area_imputation_pipeline),
    ]
)

# Fit on the training features and keep the imputed frame for inspection.
imputed_df = imputation_pipeline.fit_transform(X_train)

In [31]:
# Inspect the imputed frame: column names should carry no transformer prefixes.
imputed_df

Unnamed: 0,super_area,carpet_area,parking_spots,parking_cover,overlooking_pool,overlooking_mainroad,overlooking_garden,facing,missingindicator_facing,ownership,missingindicator_ownership,balcony,num_bhk,balcony_missingindicator,num_floors,floor_num,furnishing,transaction,location,bathroom,is_studio
0,-1.0,700.0,0.0,No parking,0.0,1.0,1.0,North,False,Leasehold,False,2.0,2.0,0,15.0,5.0,Semi-Furnished,Resale,bhiwadi,2.0,0
1,-1.0,1000.0,1.0,Open,1.0,1.0,1.0,North,False,Freehold,False,2.0,3.0,0,3.0,2.0,Unfurnished,Resale,sonipat,2.0,0
2,-1.0,675.0,0.0,No parking,1.0,1.0,1.0,East,False,Leasehold,False,3.0,2.0,0,25.0,8.0,Unfurnished,Resale,greater-noida,2.0,0
3,1585.0,-1.0,0.0,No parking,-1.0,-1.0,-1.0,East,False,Freehold,False,2.0,3.0,0,2.0,1.0,Semi-Furnished,Resale,chennai,3.0,0
4,-1.0,1200.0,1.0,Covered,0.0,1.0,1.0,East,False,Freehold,False,3.0,3.0,0,12.0,6.0,Semi-Furnished,New Property,chandigarh,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42124,-1.0,765.0,0.0,No parking,0.0,1.0,0.0,East,False,Freehold,False,2.0,2.0,0,4.0,4.0,Unfurnished,Resale,ahmedabad,1.0,0
42125,995.0,-1.0,1.0,Covered,0.0,1.0,0.0,East,False,Freehold,False,1.0,2.0,0,4.0,2.0,Semi-Furnished,Resale,mysore,2.0,0
42126,1200.0,-1.0,0.0,No parking,-1.0,-1.0,-1.0,Missing,True,Freehold,True,2.0,3.0,1,5.0,3.0,Semi-Furnished,Resale,vadodara,3.0,0
42127,1050.0,-1.0,0.0,No parking,0.0,1.0,0.0,North,False,Freehold,False,2.0,2.0,1,5.0,5.0,Unfurnished,Resale,visakhapatnam,2.0,0


### 4.2 Column Transformers

In [33]:
# Fold infrequent `transaction` categories (below the 0.1 tolerance) into "Resale",
# then one-hot encode the result.
transaction_transformer = Pipeline(
    steps=[
        (
            "grouper",
            RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Resale"),
        ),
        (
            "encoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

In [34]:
def house_size_binner(X):
    """Bin ``num_bhk`` into a single ``house_size`` column.

    Labels: "small" for 1 <= num_bhk < 3, "normal" for 3 <= num_bhk < 4 and
    "big" for everything else (including values below 1 and missing values).
    All input columns are dropped from the result.
    """
    input_columns = X.columns.to_list()
    bhk = X.num_bhk

    size_labels = np.select(
        [
            (bhk >= 1) & (bhk < 3),
            (bhk >= 3) & (bhk < 4),
        ],
        ["small", "normal"],
        default="big",
    )

    with_size = X.assign(house_size=size_labels)
    return with_size.drop(columns=input_columns)

In [35]:
# Branch 1: scale the raw `num_bhk` value to the [0, 1] range.
num_bhk_pipe1 = Pipeline(steps=[("scaler", MinMaxScaler())])

# Branch 2: bin `num_bhk` into small/normal/big and encode the bins as 0/1/2.
num_bhk_pipe2 = Pipeline(
    steps=[
        ("house_size_binner", FunctionTransformer(func=house_size_binner)),
        ("encoder", OrdinalEncoder(categories=[["small", "normal", "big"]])),
    ]
)

# Concatenate both branches side by side.
num_bhk_transformer = FeatureUnion(
    transformer_list=[
        ("num_bhk_pipe1", num_bhk_pipe1),
        ("num_bhk_pipe2", num_bhk_pipe2),
    ]
)

In [36]:
def bathroom_num_binner(X):
    """Bin ``bathroom`` into a single ``bathroom_num`` column.

    Labels: "low" for 1 <= bathroom < 3, "medium" for 3 <= bathroom < 4 and
    "high" for everything else (including values below 1 and missing values).
    All input columns are dropped from the result.
    """
    input_columns = X.columns.to_list()
    baths = X.bathroom

    count_labels = np.select(
        [
            (baths >= 1) & (baths < 3),
            (baths >= 3) & (baths < 4),
        ],
        ["low", "medium"],
        default="high",
    )

    with_bins = X.assign(bathroom_num=count_labels)
    return with_bins.drop(columns=input_columns)

In [37]:
# Branch 1: scale the raw `bathroom` count to the [0, 1] range.
bathroom_pipe1 = Pipeline(steps=[("scaler", MinMaxScaler())])

# Branch 2: bin `bathroom` into low/medium/high and encode the bins as 0/1/2.
bathroom_pipe2 = Pipeline(
    steps=[
        ("bathroom_num_binner", FunctionTransformer(func=bathroom_num_binner)),
        ("encoder", OrdinalEncoder(categories=[["low", "medium", "high"]])),
    ]
)

# Concatenate both branches side by side.
bathroom_transformer = FeatureUnion(
    transformer_list=[
        ("bathroom_pipe1", bathroom_pipe1),
        ("bathroom_pipe2", bathroom_pipe2),
    ]
)

In [38]:
# NOTE(review): this cell re-defines `bathroom_num_binner` with the same body as the
# earlier cell of that name; it looks like a leftover copy and can probably be
# deleted -- confirm nothing relies on re-running it.
def bathroom_num_binner(X):
    """Bin ``bathroom`` into "low" [1, 3), "medium" [3, 4) or "high" (otherwise).

    Returns a frame holding only the new ``bathroom_num`` column; every input
    column is dropped.
    """
    columns = X.columns.to_list()

    return (
        X.assign(
               bathroom_num = lambda df: (
                np.select(
                [
                    df.bathroom.between(1,3,inclusive = "left"),
                    df.bathroom.between(3,4,inclusive = "left")
                ],
                ["low","medium"],
                default = "high"
              )
            ) 
        )
        .drop(columns = columns)
    )

In [39]:
def _flag_unfurnished(x):
    """Return 1 where a value equals 'Unfurnished' and 0 elsewhere.

    Defined as a named function (not a lambda) so that pipelines containing it
    can be serialized with pickle/joblib.
    """
    return np.where(x == 'Unfurnished', 1, 0)


# Branch 1: encode furnishing level as an ordered integer
# (Unfurnished < Semi-Furnished < Furnished).
furnishing_pipe1 = Pipeline(steps = [
    ("encoder",OrdinalEncoder(categories = [["Unfurnished","Semi-Furnished","Furnished"]])),
])

# Branch 2: binary flag for unfurnished properties.
furnishing_pipe2 = Pipeline(steps = [
    ("is_unfurnished",FunctionTransformer(func = _flag_unfurnished))
])

# Concatenate both branches side by side.
furnishing_transformer = FeatureUnion(transformer_list = [
    ("furnishing_pipe1",furnishing_pipe1),
    ("furnishing_pipe2",furnishing_pipe2)
])

In [40]:
def floor_height_binner(X):
    """Bin ``floor_num`` into a single ``floor_height`` column.

    Labels: "low" for 0 <= floor_num < 3, "medium" for 3 <= floor_num < 6 and
    "high" for everything else (this includes negative and missing values).
    All input columns are dropped from the result.
    """
    input_columns = X.columns.to_list()
    floor = X.floor_num

    height_labels = np.select(
        [
            (floor >= 0) & (floor < 3),
            (floor >= 3) & (floor < 6),
        ],
        ["low", "medium"],
        default="high",
    )

    with_bins = X.assign(floor_height=height_labels)
    return with_bins.drop(columns=input_columns)

In [41]:
def building_height_binner(X):
    """Bin ``num_floors`` into a single ``building_height`` column.

    Labels: "short" for 0 <= num_floors < 5, "medium" for 5 <= num_floors < 13
    and "tall" for everything else (this includes negative and missing values).
    All input columns are dropped from the result.
    """
    input_columns = X.columns.to_list()
    floors = X.num_floors

    height_labels = np.select(
        [
            (floors >= 0) & (floors < 5),
            (floors >= 5) & (floors < 13),
        ],
        ["short", "medium"],
        default="tall",
    )

    with_bins = X.assign(building_height=height_labels)
    return with_bins.drop(columns=input_columns)

In [42]:
# Branch 1: standardize the raw `floor_num` value.
floor_num_pipe1 = Pipeline(steps=[("scaler", StandardScaler())])

# Branch 2: bin `floor_num` into low/medium/high and encode the bins as 0/1/2.
floor_num_pipe2 = Pipeline(
    steps=[
        ("floor_height_binner", FunctionTransformer(func=floor_height_binner)),
        ("encoder", OrdinalEncoder(categories=[["low", "medium", "high"]])),
    ]
)

# Concatenate both branches side by side.
floor_num_transformer = FeatureUnion(
    transformer_list=[
        ("floor_num_pipe1", floor_num_pipe1),
        ("floor_num_pipe2", floor_num_pipe2),
    ]
)

In [43]:
# Branch 1: standardize the raw `num_floors` value.
num_floors_pipe1 = Pipeline(steps=[("scaler", StandardScaler())])

# Branch 2: bin `num_floors` into short/medium/tall and encode the bins as 0/1/2.
num_floors_pipe2 = Pipeline(
    steps=[
        ("building_height_binner", FunctionTransformer(func=building_height_binner)),
        ("encoder", OrdinalEncoder(categories=[["short", "medium", "tall"]])),
    ]
)

# Concatenate both branches side by side.
num_floors_transformer = FeatureUnion(
    transformer_list=[
        ("num_floors_pipe1", num_floors_pipe1),
        ("num_floors_pipe2", num_floors_pipe2),
    ]
)

In [44]:
def city_binner(X):
    """Flag rows whose ``location`` is mumbai, gurgaon or new-delhi.

    Returns a frame holding only the new ``city_tier`` column (1 for those three
    cities, 0 otherwise); every input column is dropped.
    """
    input_columns = X.columns.to_list()

    in_listed_city = X.location.isin(["mumbai", "gurgaon", "new-delhi"])
    tier_flags = np.where(in_listed_city, 1, 0)

    with_tier = X.assign(city_tier=tier_flags)
    return with_tier.drop(columns=input_columns)

In [45]:
# Branch 1: target-encode `location`. TargetEncoder shuffles the rows for its
# internal cross-fitting during fit_transform, so a fixed random_state is needed
# for the encoded values to be reproducible between runs.
location_pipe1 = Pipeline(steps = [
    ("target_encoder", TargetEncoder(random_state = 42))
])

# Branch 2: binary `city_tier` flag produced by city_binner.
location_pipe2 = Pipeline(steps = [
    ("city_binner",FunctionTransformer(func = city_binner))
])

# Concatenate both branches side by side.
location_transformer = FeatureUnion(transformer_list = [
    ("location_pipe1",location_pipe1),
    ("location_pipe2",location_pipe2)
])


In [46]:
def price_binner(X):
    """Bin a ``price`` column into a single ``price_range`` column.

    Labels: "low" for 0 <= price < 4000, "medium" for 4000 <= price < 6000 and
    "high" for everything else (this includes negative and missing values).
    All input columns are dropped from the result.

    NOTE(review): no transformer visible in this notebook calls this function,
    and the training frame shown has no ``price`` column -- confirm it is still needed.
    """
    input_columns = X.columns.to_list()
    price = X.price

    range_labels = np.select(
        [
            (price >= 0) & (price < 4000),
            (price >= 4000) & (price < 6000),
        ],
        ["low", "medium"],
        default="high",
    )

    with_bins = X.assign(price_range=range_labels)
    return with_bins.drop(columns=input_columns)

In [47]:
# Round `balcony` to the nearest integer (group-median imputation can produce
# fractional counts), then scale to [0, 1]. np.round is passed directly rather
# than through a lambda so the pipeline can be serialized with pickle/joblib.
balcony_transformer = Pipeline(steps = [
    ("nearest_integer",FunctionTransformer(func = np.round)),
    ("scaler",MinMaxScaler())
])

In [48]:
# One-hot encode `ownership`; categories unseen during fit encode as all zeros.
ownership_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

In [49]:
# One-hot encode the boolean ownership missing-indicator, dropping the first
# category so a single column remains.
missingindicator_ownership_transformer = Pipeline(
    steps=[
        (
            "encoder",
            OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

In [50]:
def direction_binner(X):
    """Flag rows whose ``facing`` is "North - East" or "North - West".

    Returns a frame holding only the new ``direction_tier`` column (1 for those
    two directions, 0 otherwise); every input column is dropped.
    """
    input_columns = X.columns.to_list()

    is_listed_direction = X.facing.isin(["North - East", "North - West"])
    tier_flags = np.where(is_listed_direction, 1, 0)

    with_tier = X.assign(direction_tier=tier_flags)
    return with_tier.drop(columns=input_columns)

In [51]:
# Branch 1: one-hot encode `facing`.
facing_pipe1 = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Branch 2: binary `direction_tier` flag produced by direction_binner.
facing_pipe2 = Pipeline(
    steps=[
        ("direction_binner", FunctionTransformer(func=direction_binner)),
    ]
)

# Concatenate both branches side by side.
facing_transformer = FeatureUnion(
    transformer_list=[
        ("facing_pipe1", facing_pipe1),
        ("facing_pipe2", facing_pipe2),
    ]
)

In [52]:
# One-hot encode the boolean facing missing-indicator, dropping the first
# category so a single column remains.
missingindicator_facing_transformer = Pipeline(
    steps=[
        (
            "encoder",
            OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

In [53]:
# `overlooking_garden`: one-hot encode every observed value, including the -1
# sentinel written by the imputer.
overlooking_garden_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# `overlooking_mainroad`: fixed categories -1/0/1 with the -1 (missing) column dropped.
overlooking_mainroad_transformer = Pipeline(
    steps=[
        (
            "encoder",
            OneHotEncoder(
                categories=[[-1, 0, 1]],
                drop=[-1],
                sparse_output=False,
                handle_unknown="ignore",
            ),
        ),
    ]
)

# `overlooking_pool`: same encoding as `overlooking_mainroad`.
overlooking_pool_transformer = Pipeline(
    steps=[
        (
            "encoder",
            OneHotEncoder(
                categories=[[-1, 0, 1]],
                drop=[-1],
                sparse_output=False,
                handle_unknown="ignore",
            ),
        ),
    ]
)

In [54]:
# One-hot encode `parking_cover` (including the imputed "No parking" category).
parking_cover_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

In [55]:
def has_parking(X):
    """Categorize ``parking_spots`` into a single ``has_parking`` column.

    Labels: "no parking" when parking_spots equals 0, "single" when it equals 1
    and "multiple" for every other value. All input columns are dropped.
    """
    input_columns = X.columns.to_list()
    spots = X.parking_spots

    parking_labels = np.select(
        [spots == 0, spots == 1],
        ["no parking", "single"],
        default="multiple",
    )

    with_labels = X.assign(has_parking=parking_labels)
    return with_labels.drop(columns=input_columns)

In [56]:
# Categorize `parking_spots` via has_parking, then one-hot encode with the
# "no parking" level dropped as the reference category.
# `drop` takes one entry per encoded feature, so it is a flat one-element list
# (same form as the `drop=[-1]` used for the overlooking_* encoders).
parking_spots_transformer = Pipeline(steps = [
    ("has_parking",FunctionTransformer(func = has_parking)),
    ("encoder",OneHotEncoder(categories = [["multiple","single","no parking"]],drop = ["no parking"],sparse_output = False,handle_unknown = 'ignore'))
])

In [57]:
def effective_area(X):
    """Derive area features from ``carpet_area`` and ``super_area``.

    A value of -1 is treated as the "missing" sentinel. The result holds three
    columns, in this order, with every input column dropped:

    - ``effective_area``: ``super_area`` where ``carpet_area`` is -1, else ``carpet_area``
    - ``carpet_areamissing``: 1 where ``carpet_area`` is -1, else 0
    - ``super_areamissing``: 1 where ``super_area`` is -1, else 0

    If both inputs are -1 for a row, ``effective_area`` is -1 for that row.
    """
    input_columns = X.columns.to_list()

    carpet_missing = X.carpet_area == -1
    super_missing = X.super_area == -1

    derived = X.assign(
        effective_area=np.where(carpet_missing, X.super_area, X.carpet_area),
        carpet_areamissing=np.where(carpet_missing, 1, 0),
        super_areamissing=np.where(super_missing, 1, 0),
    )
    return derived.drop(columns=input_columns)

In [58]:
# Log-transform, then standardize. np.log is passed directly rather than through
# a lambda so the pipeline can be serialized with pickle/joblib.
# NOTE(review): np.log yields NaN for non-positive input (e.g. a -1 sentinel
# that survives into `effective_area`) -- confirm this cannot occur.
scaler_pipeline = Pipeline(steps = [
    ("log_transformer",FunctionTransformer(func = np.log)),
    ("scaler",StandardScaler())
])

# Build the area features, scale `effective_area`, and pass the two
# missing-flag columns through unchanged.
area_transformer = Pipeline(steps = [
    ("effective_area",FunctionTransformer(func = effective_area)),
    ("scaler_pipeline",ColumnTransformer(transformers = [
        ("scaler_pipeline",scaler_pipeline,["effective_area"])
    ],remainder = "passthrough"))
])

In [59]:
def area_per_room(X):
    """Compute area per room from ``carpet_area``, ``super_area`` and ``num_bhk``.

    Uses ``super_area / num_bhk`` where ``carpet_area`` equals the -1 sentinel,
    otherwise ``carpet_area / num_bhk``. Returns a frame holding only the new
    ``area_per_room`` column; every input column is dropped.
    """
    input_columns = X.columns.to_list()

    carpet_missing = X.carpet_area == -1
    super_per_room = X.super_area / X.num_bhk
    carpet_per_room = X.carpet_area / X.num_bhk

    derived = X.assign(
        area_per_room=np.where(carpet_missing, super_per_room, carpet_per_room)
    )
    return derived.drop(columns=input_columns)

In [60]:
# Log-transform, then standardize. np.log is passed directly rather than through
# a lambda so the pipeline can be serialized with pickle/joblib.
# (This rebinds `scaler_pipeline` to a fresh, identically configured pipeline.)
scaler_pipeline = Pipeline(steps = [
    ("log_transformer",FunctionTransformer(func = np.log)),
    ("scaler",StandardScaler())
])

# Compute area per room, then log-scale and standardize it.
area_per_room_transformer = Pipeline(steps = [
    ("area_per_room",FunctionTransformer(func = area_per_room)),
    ("scaler_pipeline",ColumnTransformer(transformers = [
        ("scaler_pipeline",scaler_pipeline,["area_per_room"])
    ],remainder = "passthrough"))
])

In [61]:
def balcony_per_room(X):
    """Compute ``balcony / num_bhk`` as a single ``balcony_per_room`` column.

    Every input column is dropped from the result.
    """
    input_columns = X.columns.to_list()

    ratio = X.balcony / X.num_bhk

    return X.assign(balcony_per_room=ratio).drop(columns=input_columns)

In [62]:
def bathroom_per_room(X):
    """Compute ``bathroom / num_bhk`` as a single ``bathroom_per_room`` column.

    Every input column is dropped from the result.
    """
    input_columns = X.columns.to_list()

    ratio = X.bathroom / X.num_bhk

    return X.assign(bathroom_per_room=ratio).drop(columns=input_columns)

In [63]:
# Wrap the ratio helpers as pipeline steps so they can be used in the ColumnTransformer.
balcony_per_room_transformer = Pipeline(
    steps=[
        ("balcony_per_room", FunctionTransformer(func=balcony_per_room)),
    ]
)

bathroom_per_room_transformer = Pipeline(
    steps=[
        ("bathroom_per_room", FunctionTransformer(func=bathroom_per_room)),
    ]
)

In [64]:
# Route each (imputed) column to its feature transformer. Columns not listed
# here (e.g. `balcony_missingindicator`, `is_studio`) are passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=[
        ("transaction_transformer", transaction_transformer, ["transaction"]),
        ("num_bhk_transformer", num_bhk_transformer, ["num_bhk"]),
        ("bathroom_transformer", bathroom_transformer, ["bathroom"]),
        ("furnishing_transformer", furnishing_transformer, ["furnishing"]),
        ("floor_num_transformer", floor_num_transformer, ["floor_num"]),
        ("num_floors_transformer", num_floors_transformer, ["num_floors"]),
        ("location_transformer", location_transformer, ["location"]),
        ("balcony_transformer", balcony_transformer, ["balcony"]),
        ("ownership_transformer", ownership_transformer, ["ownership"]),
        (
            "missingindicator_ownership_transformer",
            missingindicator_ownership_transformer,
            ["missingindicator_ownership"],
        ),
        ("facing_transformer", facing_transformer, ["facing"]),
        # Uses the facing-specific encoder; the ownership one was wired in here by mistake.
        (
            "missingindicator_facing_transformer",
            missingindicator_facing_transformer,
            ["missingindicator_facing"],
        ),
        ("overlooking_garden_transformer", overlooking_garden_transformer, ["overlooking_garden"]),
        ("overlooking_mainroad_transformer", overlooking_mainroad_transformer, ["overlooking_mainroad"]),
        ("overlooking_pool_transformer", overlooking_pool_transformer, ["overlooking_pool"]),
        ("parking_cover_transformer", parking_cover_transformer, ["parking_cover"]),
        ("parking_spots_transformer", parking_spots_transformer, ["parking_spots"]),
        ("area_transformer", area_transformer, ["carpet_area", "super_area"]),
        ("area_per_room_transformer", area_per_room_transformer, ["carpet_area", "super_area", "num_bhk"]),
        ("balcony_per_room_transformer", balcony_per_room_transformer, ["balcony", "num_bhk"]),
        ("bathroom_per_room_transformer", bathroom_per_room_transformer, ["bathroom", "num_bhk"]),
    ],
    remainder="passthrough",
)

### 4.3 Final Preprocessor

In [66]:
# Full preprocessing: impute first, then build the engineered features.
feature_preprocessor = Pipeline(
    steps=[
        ("imputation_pipeline", imputation_pipeline),
        ("column_transformer", column_transformer),
    ]
)

# The target is passed along because the `location` TargetEncoder needs it during fit.
preprocessed = feature_preprocessor.fit_transform(X_train, y_train)

In [67]:
# Inspect the engineered feature matrix; column names carry the transformer prefixes.
preprocessed

Unnamed: 0,transaction_transformer__transaction_New Property,transaction_transformer__transaction_Resale,num_bhk_transformer__num_bhk,num_bhk_transformer__house_size,bathroom_transformer__bathroom,bathroom_transformer__bathroom_num,furnishing_transformer__0,furnishing_transformer__1,floor_num_transformer__floor_num,floor_num_transformer__floor_height,num_floors_transformer__num_floors,num_floors_transformer__building_height,location_transformer__location,location_transformer__city_tier,balcony_transformer__balcony,ownership_transformer__ownership_Co-operative Society,ownership_transformer__ownership_Freehold,ownership_transformer__ownership_Leasehold,ownership_transformer__ownership_Power Of Attorney,missingindicator_ownership_transformer__missingindicator_ownership_True,facing_transformer__facing_East,facing_transformer__facing_Missing,facing_transformer__facing_North,facing_transformer__facing_North - East,facing_transformer__facing_North - West,facing_transformer__facing_South,facing_transformer__facing_South - East,facing_transformer__facing_South -West,facing_transformer__facing_West,facing_transformer__direction_tier,missingindicator_facing_transformer__missingindicator_facing_True,overlooking_garden_transformer__overlooking_garden_-1.0,overlooking_garden_transformer__overlooking_garden_0.0,overlooking_garden_transformer__overlooking_garden_1.0,overlooking_mainroad_transformer__overlooking_mainroad_0.0,overlooking_mainroad_transformer__overlooking_mainroad_1.0,overlooking_pool_transformer__overlooking_pool_0.0,overlooking_pool_transformer__overlooking_pool_1.0,parking_cover_transformer__parking_cover_Covered,parking_cover_transformer__parking_cover_No 
parking,parking_cover_transformer__parking_cover_Open,parking_spots_transformer__has_parking_multiple,parking_spots_transformer__has_parking_single,area_transformer__scaler_pipeline__effective_area,area_transformer__remainder__carpet_areamissing,area_transformer__remainder__super_areamissing,area_per_room_transformer__scaler_pipeline__area_per_room,balcony_per_room_transformer__balcony_per_room,bathroom_per_room_transformer__bathroom_per_room,remainder__balcony_missingindicator,remainder__is_studio
0,0.0,1.0,0.111111,0.0,0.111111,0.0,1.0,0.0,0.252900,1.0,1.214913,2.0,0.270073,0,0.2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.041041,0,1,-1.027736,1.000000,1.000000,0,0
1,0.0,1.0,0.222222,1.0,0.111111,0.0,0.0,1.0,-0.545902,0.0,-0.854100,0.0,0.571517,0,0.2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-0.255425,0,1,-1.194975,0.666667,0.666667,0,0
2,0.0,1.0,0.111111,0.0,0.111111,0.0,0.0,1.0,1.051702,2.0,2.939092,2.0,0.717500,0,0.3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.121145,0,1,-1.152394,1.500000,1.000000,0,0
3,0.0,1.0,0.222222,1.0,0.222222,1.0,1.0,0.0,-0.812170,0.0,-1.026518,0.0,0.669204,0,0.2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.759063,1,0,0.383786,0.666667,1.000000,0,0
4,1.0,0.0,0.222222,1.0,0.222222,1.0,1.0,0.0,0.519168,2.0,0.697660,1.0,0.770902,0,0.3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.146159,0,1,-0.570026,1.000000,1.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42124,0.0,1.0,0.111111,0.0,0.000000,0.0,0.0,1.0,-0.013367,1.0,-0.681682,0.0,1.020848,0,0.2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.845459,0,1,-0.723368,1.000000,0.500000,0,0
42125,0.0,1.0,0.111111,0.0,0.111111,0.0,1.0,0.0,-0.545902,0.0,-0.681682,0.0,0.863496,0,0.1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.266466,1,0,0.177670,0.500000,1.000000,0,0
42126,0.0,1.0,0.222222,1.0,0.222222,1.0,1.0,0.0,-0.279635,1.0,-0.509265,1.0,0.492338,0,0.2,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.146159,1,0,-0.570026,0.666667,1.000000,1,0
42127,0.0,1.0,0.111111,0.0,0.111111,0.0,0.0,1.0,0.252900,1.0,-0.509265,1.0,0.656961,0,0.2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.147959,1,0,0.362091,1.000000,1.000000,1,0


## 5. Feature Selection

In [69]:
# Seed the random forests so that feature selection is reproducible between runs.
# NOTE(review): SmartCorrelatedSelection is left on its default `selection_method`;
# the estimator/scoring arguments presumably only take effect with
# selection_method="model_performance" -- confirm which behaviour is intended.
drop_correlated = SmartCorrelatedSelection(
    estimator = RandomForestRegressor(random_state = 42),
    scoring = 'r2'
)

# Keep only features whose single-feature model reaches an r2 above the 0.01 threshold.
feature_selector = SelectBySingleFeaturePerformance(
    estimator = RandomForestRegressor(random_state = 42),
    scoring = 'r2',
    threshold = 0.01
)

In [70]:
# End-to-end feature engineering: preprocess, drop correlated features,
# then keep the features that pass the single-feature performance filter.
feature_engineering_pipeline = Pipeline(
    steps=[
        ("feature_preprocessor", feature_preprocessor),
        ("drop_correlated", drop_correlated),
        ("feature_selector", feature_selector),
    ]
)

In [71]:
# Fit the whole pipeline on the training data and display the selected features.
feature_engineering_pipeline.fit_transform(X_train,y_train)

Unnamed: 0,transaction_transformer__transaction_New Property,num_bhk_transformer__num_bhk,bathroom_transformer__bathroom_num,floor_num_transformer__floor_num,num_floors_transformer__num_floors,location_transformer__location,location_transformer__city_tier,balcony_transformer__balcony,missingindicator_facing_transformer__missingindicator_facing_True,overlooking_garden_transformer__overlooking_garden_-1.0,overlooking_garden_transformer__overlooking_garden_1.0,overlooking_pool_transformer__overlooking_pool_1.0,parking_cover_transformer__parking_cover_No parking,parking_spots_transformer__has_parking_multiple,area_transformer__scaler_pipeline__effective_area,area_per_room_transformer__scaler_pipeline__area_per_room,balcony_per_room_transformer__balcony_per_room,bathroom_per_room_transformer__bathroom_per_room
0,0.0,0.111111,0.0,0.252900,1.214913,0.271093,0,0.2,0.0,0.0,1.0,0.0,1.0,0.0,-1.041041,-1.027736,1.000000,1.000000
1,0.0,0.222222,0.0,-0.545902,-0.854100,0.589594,0,0.2,0.0,0.0,1.0,1.0,0.0,0.0,-0.255425,-1.194975,0.666667,0.666667
2,0.0,0.111111,0.0,1.051702,2.939092,0.721551,0,0.3,0.0,0.0,1.0,1.0,1.0,0.0,-1.121145,-1.152394,1.500000,1.000000
3,0.0,0.222222,1.0,-0.812170,-1.026518,0.677852,0,0.2,0.0,1.0,0.0,0.0,1.0,0.0,0.759063,0.383786,0.666667,1.000000
4,1.0,0.222222,1.0,0.519168,0.697660,0.776275,0,0.3,0.0,0.0,1.0,0.0,0.0,0.0,0.146159,-0.570026,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42124,0.0,0.111111,0.0,-0.013367,-0.681682,1.017289,0,0.2,0.0,0.0,0.0,0.0,1.0,0.0,-0.845459,-0.723368,1.000000,0.500000
42125,0.0,0.111111,0.0,-0.545902,-0.681682,0.853839,0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,-0.266466,0.177670,0.500000,1.000000
42126,0.0,0.222222,1.0,-0.279635,-0.509265,0.493579,0,0.2,1.0,1.0,0.0,0.0,1.0,0.0,0.146159,-0.570026,0.666667,1.000000
42127,0.0,0.111111,0.0,0.252900,-0.509265,0.658090,0,0.2,0.0,0.0,0.0,0.0,1.0,0.0,-0.147959,0.362091,1.000000,1.000000
