In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score

# Linear models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Support Vector Machines
from sklearn.svm import SVR, LinearSVR

# Tree-based models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    HistGradientBoostingRegressor
)

# Neighbors
from sklearn.neighbors import KNeighborsRegressor

# Neural networks
from sklearn.neural_network import MLPRegressor

# Gaussian Processes
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# Gradient boosting libraries outside sklearn
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the stage 6 property data
df = pd.read_csv(r'Output Files\stage_6_property_data.csv') 
df.head(3)

Unnamed: 0,Transaction Type,Type of Property,City,Area,Commercial,Covered Area,Furnished Type,Bedrooms,Bathrooms,Balconies,Facing,Floor Level,Property Lifespan,House Help Room,Store Room,Luxury Category,Price (Crores)
0,New Property,Apartment,Thane,Kalyan West,Y,763.571943,Unfurnished,2,2.0,2.0,East,High rise (> 10),New construction,1,1,High,0.63
1,New Property,Apartment,Thane,Kalyan West,N,850.0,Unfurnished,2,2.0,2.0,East,High rise (> 10),New construction,1,1,High,0.54
2,New Property,Apartment,Thane,Kalyan West,N,1050.0,Unfurnished,2,2.0,3.0,East,Mid rise (5 to 10),Less than 5 years,1,1,High,0.9


In [3]:
# Check shape
df.shape

(6280, 17)

-----

### Nature of Categorical features:

In [4]:
# Print Data type of features
df.dtypes

Transaction Type      object
Type of Property      object
City                  object
Area                  object
Commercial            object
Covered Area         float64
Furnished Type        object
Bedrooms               int64
Bathrooms            float64
Balconies             object
Facing                object
Floor Level           object
Property Lifespan     object
House Help Room        int64
Store Room             int64
Luxury Category       object
Price (Crores)       float64
dtype: object

- Transaction Type: Nominal feature with no ordering.
- Type of Property: Nominal feature with no ordering.
- City: Nominal feature with no ordering.
- Area:  Nominal feature with no ordering. [High Cardinality: 53 unique values]
- Commercial: Nominal feature with no ordering.
- Furnished Type: Feature with inherent ordering.
- Balconies: Feature with inherent ordering.
- Facing: Nominal feature with no ordering.
- Floor Level: Feature with inherent ordering.
- Property Lifespane: Feature with inherent ordering.
- Luxury Category: Feature with inherent ordering.

In [5]:
# Numerical features
numerical_columns = []
binary_columns = []

# Nominal categorical fetures for one-hot encoding
nominal_columns_to_one_hot_encode = []

# Categorical features with inherent ordering to ordinally encode
ordered_columns_to_ordinal_encode = []

# Define specific ordering for each ordinal column
ordinal_categories = [
    ['Unfurnished', 'Semi-Furnished', 'Furnished'],  # Furnished Type
    ['1.0', '2.0', '3.0', '3+'],  # Balconies
    ['Low rise (< 5)', 'Mid rise (5 to 10)', 'High rise (> 10)'],  # Floor Level
    ['New construction', 'Less than 5 years', '5 to 10 years', '10 to 15 years', '15 to 20 years', 'Above 20 years'],  # Property Lifespan
    ['Basic', 'Medium', 'High']  # Luxury Category
]


------------

### Utilities

In [6]:
def scorer(X, y_transformed, model_name: str, model, preprocessor):
    '''
    This function builds a pipeline with a given regression model and a preconfigured preprocessor, then evaluates it using both R² score (via 10-fold cross-validation) and Mean Absolute Error (MAE) on a 
    hold-out test set.

    Args:
        model_name : Name of the regression model

        model : An instantiated regression model
    '''
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    # Average R2 score
    output.append(scores.mean())
    
    # For MAE
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    output.append(mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)))
    
    return output
    

In [7]:
# ML models
model_dict = {
    # Linear Models
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),

    # Support Vector Machines
    'SVR': SVR(),
    'LinearSVR': LinearSVR(),

    # Tree-Based Models
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'ExtraTreesRegressor': ExtraTreesRegressor(),

    # Boosting Algorithms
    'GradientBoostingRegressor': GradientBoostingRegressor(),   # shallow trees as base learners
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(verbose=-1),
    'CatBoostRegressor': CatBoostRegressor(verbose=0),

    # Nearest Neighbors
    'KNeighborsRegressor': KNeighborsRegressor(),

    # Neural Networks
    'MLPRegressor': MLPRegressor(),

    # Gaussian Processes
    'GaussianProcessRegressor': GaussianProcessRegressor(kernel=RBF())
}


# Strategy
As the Machine learning algorithms only understands numbers, the categorical columns with object/str dtype needs to be transformed into numbers. For this 3 different encoding techniques will be deployed. The best performing combination of ML algorithm + encoding technique will finally be deployed on production for predicting prices of the property.

- Ordinal encoding: Ordinal encoding works well for tree-based models, but not for linear/distance-based models due to artificial ordering.

- One-hot encoding: One-hot encoding works well for linear and distance-based models, watch for high cardinality.

- Target encoding: Target encoding can be powerful for tree-based models with high cardinality features, but must be used with strict data leakage precautions.

------------------

In [8]:
# Independent features
X = df.drop(columns=['Price (Crores)'])

# Dependent feature [Target Variable]
y = df['Price (Crores)']

In [9]:
# Log transform y
y_transformed = np.log1p(y)

#### Strategy 1: Ordinal encoding

In [10]:
# Define specific ordering for each ordinal column
ordinal_categories = [
    list(df['Transaction Type'].value_counts(ascending=True).index), # Transaction Type → based on frequency
    list(df['Type of Property'].value_counts(ascending=True).index), # Type of Property → based on frequency
    list(df['City'].value_counts(ascending=True).index), # City → based on frequency
    list(df['Area'].value_counts(ascending=True).index), # Area → based on frequency
    list(df['Commercial'].value_counts(ascending=True).index), # Commercial → based on frequency
    list(df['Facing'].value_counts(ascending=True).index), # Facing → based on frequency
    ['Unfurnished', 'Semi-Furnished', 'Furnished'],  # Furnished Type
    ['1.0', '2.0', '3.0', '3+'],  # Balconies
    ['Low rise (< 5)', 'Mid rise (5 to 10)', 'High rise (> 10)'],  # Floor Level
    ['New construction', 'Less than 5 years', '5 to 10 years', '10 to 15 years', '15 to 20 years', 'Above 20 years'],  # Property Lifespan
    ['Basic', 'Medium', 'High']  # Luxury Category
]

In [11]:
# Column transformer for preprocessing
preprocessor_for_ordinal_encoding = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Covered Area', 'Bedrooms', 'Bathrooms']),
        ('cat', Pipeline([
            ('ord_enc', OrdinalEncoder(categories=ordinal_categories)),
            ('scaler', StandardScaler())  # Scale ordinals for balanced influence
        ]), ['Transaction Type', 'Type of Property', 'City', 'Area', 'Commercial', 'Facing', 'Furnished Type', 'Balconies', 'Floor Level', 'Property Lifespan', 'Luxury Category'])
    ], 
    remainder='passthrough'
)

In [12]:
# Train models
model_output_ordinal = []
for model_name, model in model_dict.items():
    model_output_ordinal.append(scorer(X, y_transformed, model_name, model, preprocessor_for_ordinal_encoding))

In [13]:
# Results Dataframe
model_df = pd.DataFrame(
    model_output_ordinal,
    columns=['Algorithm','R2 Score','MAE']
).sort_values('MAE').reset_index(drop=True)

# Display
model_df

Unnamed: 0,Algorithm,R2 Score,MAE
0,CatBoostRegressor,0.934032,0.437567
1,XGBRegressor,0.927103,0.456246
2,HistGradientBoostingRegressor,0.930373,0.46173
3,LGBMRegressor,0.92982,0.462912
4,RandomForestRegressor,0.912778,0.494068
5,GradientBoostingRegressor,0.906375,0.519525
6,ExtraTreesRegressor,0.889603,0.553852
7,DecisionTreeRegressor,0.840155,0.650933
8,MLPRegressor,0.851459,0.69241
9,SVR,0.846677,0.721951


##### Result:
- CatBoostRegressor performs the best with R2 score of 0.93 impyling it is able to explain ~93% variance in Price of the properties and mean absolute error of 0.43 Crores.

- As expected the Linear models: Linear regression, Ridge , Lasso and ElasticNet are the least performing as they do not go well with ordinal encoding for nominal categories with no inherent ordering. 

- As expected the tree based models performed good with Ordinal encoding.

------------------

#### Strategy 2: One-hot encoding
- Apply one hot encoding to those categorical columns where there is no inherent ordering present.

- Problem with one hot encoding: If high cardinality column is present then the a large number of columns would be added leading to high dimensionality and sparse data. Sparse data can work fine for some algorithms (like linear models), but others (like decision trees) might struggle if the dataset is huge and sparse. Dimensionality reduction techniques can be considered here to reduce training time (like OneHotEncoding with PCA).
    
- Performance of linear models like linear regression should improve after using OneHotEncoding.

In [14]:
# Define specific ordering for each ordinal column
ordinal_categories = [
    ['Unfurnished', 'Semi-Furnished', 'Furnished'],  # Furnished Type
    ['1.0', '2.0', '3.0', '3+'],  # Balconies
    ['Low rise (< 5)', 'Mid rise (5 to 10)', 'High rise (> 10)'],  # Floor Level
    ['New construction', 'Less than 5 years', '5 to 10 years', '10 to 15 years', '15 to 20 years', 'Above 20 years'],  # Property Lifespan
    ['Basic', 'Medium', 'High']  # Luxury Category
]

In [15]:
# Column transformer for preprocessing
preprocessor_for_onehot_ordinal_encoding = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Covered Area', 'Bedrooms', 'Bathrooms']),
        ('cat_nom', OneHotEncoder(drop='first', sparse_output=False), ['Transaction Type', 'Type of Property', 'City', 'Area', 'Commercial', 'Facing']),  # drop to remove multicollinearity
        ('cat_ord', Pipeline([
            ('ord_enc', OrdinalEncoder(categories=ordinal_categories)),
            ('scaler', StandardScaler())  # Scale ordinals for balanced influence
        ]), ['Furnished Type', 'Balconies', 'Floor Level', 'Property Lifespan', 'Luxury Category'])       
    ], 
    remainder='passthrough'
)

In [16]:
# Train models
model_output_onehot_ordinal = []
for model_name, model in model_dict.items():
    model_output_onehot_ordinal.append(scorer(X, y_transformed, model_name, model, preprocessor_for_onehot_ordinal_encoding))

In [17]:
# Results Dataframe
model_df = pd.DataFrame(
    model_output_onehot_ordinal,
    columns=['Algorithm','R2 Score','MAE']
).sort_values('MAE').reset_index(drop=True)

# Display
model_df

Unnamed: 0,Algorithm,R2 Score,MAE
0,CatBoostRegressor,0.939451,0.417511
1,XGBRegressor,0.931907,0.440708
2,ExtraTreesRegressor,0.928662,0.447016
3,RandomForestRegressor,0.926691,0.460712
4,MLPRegressor,0.928817,0.464162
5,HistGradientBoostingRegressor,0.930985,0.466341
6,LGBMRegressor,0.930253,0.466518
7,SVR,0.927814,0.483408
8,LinearRegression,0.92314,0.498475
9,Ridge,0.922999,0.498538


#### Result:
- CatBoostRegressor remains the best performing algorithm with R2 score of 0.93. The mean absolute error has further reduced down to 0.41 crores.

- High-cardinality one-hot feature 'Area' impact on tree-based models:

    - The presence of a high-cardinality feature like Area exploded the feature space after one-hot encoding, with most added columns being zero for any given row, resulting in sparse data.

    - Single Decision Tree:

        - One tree tries to fit all the data by itself. Overfits rare categories because each unique one-hot column can create splits that perfectly match a few rows.

        - Fragments the data into very small subsets, reducing statistical power in leaves.

        - Wastes tree depth exploring mostly-zero irrelevant features.
        
        - Lacks any averaging mechanism, so high variance leads to poor generalization.

    - AdaBoost and Gradient Boosting (with shallow trees):

        - Typically use shallow base learners (depth-1 stumps or shallow trees).

        - Cannot capture complex patterns in high-dimensional sparse one-hot data.

        - Still overfits rare categories quickly due to many sparse column, leading to memorization rather than generalization.

        - Lack the averaging effect of Random Forest, so variance remains high and performance drops.
    

- On contarary why some tree ensembles still performed well?:

    - Tree-based models like Random Forest, Extra Trees, and XGBoost are more robust to high-cardinality one-hot features.

    - Random Forest / Extra Trees build many deep trees on bootstrapped subsets of data and features. Even if some splits overfit rare one-hot columns, averaging across hundreds of trees reduces variance.

    - XGBoost (Extreme Gradient Boosting) with deep trees handles high-dimensional sparse input better than AdaBoost with shallow stumps.

- Summary of boosting/tree behavior:

    - Shallow boosting + high-cardinality one-hot → performance drop

    - Deep ensemble trees + high-cardinality one-hot → performance usually stable

- Impact on linear models:

    - Linear models like Linear Regression and Ridge Regression improved as expected with one-hot encoding of nominal features.

    - These models just multiply each feature by a weight and sum them, so sparse columns (mostly zeros) are effectively ignored, making them robust to high-dimensional one-hot features.


--------------

#### Strategy 3: OneHotEncoding with PCA
The presence of a high-cardinality categorical feature like Area creates many sparse columns after one-hot encoding, which can cause tree-based models without averaging, such as a single Decision Tree, as well as boosting methods like AdaBoost and Gradient Boosting with shallow trees, to overfit and struggle to generalize. Applying PCA or other dimensionality reduction techniques in this scenario can help reduce sparsity and improve the performance of these models.

In [18]:
# Define specific ordering for each ordinal column
ordinal_categories = [
    ['Unfurnished', 'Semi-Furnished', 'Furnished'],  # Furnished Type
    ['1.0', '2.0', '3.0', '3+'],  # Balconies
    ['Low rise (< 5)', 'Mid rise (5 to 10)', 'High rise (> 10)'],  # Floor Level
    ['New construction', 'Less than 5 years', '5 to 10 years', '10 to 15 years', '15 to 20 years', 'Above 20 years'],  # Property Lifespan
    ['Basic', 'Medium', 'High']  # Luxury Category
]

In [19]:
# Column transformer for preprocessing
preprocessor_for_onehotPCA_ordinal_encoding = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Covered Area', 'Bedrooms', 'Bathrooms']),
        ('cat_nom_low_cardinality', OneHotEncoder(drop='first', sparse_output=False), ['Transaction Type', 'Type of Property', 'City', 'Commercial']),  # drop to remove multicollinearity
        ('cat_nom_high_cardinality', Pipeline([
            ('one-hot', OneHotEncoder(drop='first',sparse_output=False)),  # sparse_output=False outputs a dense NumPy array (all 0/1 stored explicitly), as next pipeline step is PCA that requires a dense array as input (PCA in scikit-learn does not accept sparse matrices.)
            ('pca', PCA(n_components=0.95))  # PCA after one-hot encoding of high cardinality features
            ]), ['Area', 'Facing']), 
        ('cat_ord', Pipeline([
            ('ord_enc', OrdinalEncoder(categories=ordinal_categories)),
            ('scaler', StandardScaler())  # Scale ordinals for balanced influence
        ]), ['Furnished Type', 'Balconies', 'Floor Level', 'Property Lifespan', 'Luxury Category'])       
    ], 
    remainder='passthrough'
)

In [20]:
# Train models
model_output_onehotPCA_ordinal = []
for model_name, model in model_dict.items():
    model_output_onehotPCA_ordinal.append(scorer(X, y_transformed, model_name, model, preprocessor_for_onehotPCA_ordinal_encoding))

In [21]:
# Results Dataframe
model_df = pd.DataFrame(
    model_output_onehotPCA_ordinal,
    columns=['Algorithm','R2 Score','MAE']
).sort_values('MAE').reset_index(drop=True)

# Display
model_df

Unnamed: 0,Algorithm,R2 Score,MAE
0,CatBoostRegressor,0.936254,0.435504
1,ExtraTreesRegressor,0.925133,0.446187
2,XGBRegressor,0.927795,0.452415
3,HistGradientBoostingRegressor,0.930293,0.457147
4,LGBMRegressor,0.930842,0.461177
5,RandomForestRegressor,0.921182,0.475768
6,GradientBoostingRegressor,0.921041,0.487937
7,MLPRegressor,0.918219,0.489049
8,SVR,0.920494,0.499373
9,Ridge,0.909452,0.521613


#### Result:
- The performance of Gradient Boosting has improved.
    - Gradient Boosting builds trees sequentially on residuals. Even with the same depth, trees now operate on denser, lower-dimensional features, which reduces overfitting to rare categories and allows the boosting process to focus on meaningful variance.

- Single Decison Tree still perform poorly as it still has no averaging, so high variance remains — it overfits whatever patterns remain in the PCA-transformed data.

- AdaBoost with shallow learners still cannot model complex interactions in the reduced feature space and hence performs poorly.

- The overall best performance still remains around 0.93 with an MAE of 0.43 for CatBoostRegressor.

--------------------

#### Strategy 4: Target encoding for High cardinality feature

- Target encoding is useful for columns with very high cardinality.

- Encoding is done on the basis of target variable.

- Caution! Target encoding can result in Data leakage problem. First peform train-test split and then do target encoding to prevent data leakage problem. In our case we are safe as we are doing cross-validation.

- Goes well with tree based models. Results are not that good with linear models (but better than ordinal encoding).

In [22]:
# Define specific ordering for each ordinal column
ordinal_categories = [
    ['Unfurnished', 'Semi-Furnished', 'Furnished'],  # Furnished Type
    ['1.0', '2.0', '3.0', '3+'],  # Balconies
    ['Low rise (< 5)', 'Mid rise (5 to 10)', 'High rise (> 10)'],  # Floor Level
    ['New construction', 'Less than 5 years', '5 to 10 years', '10 to 15 years', '15 to 20 years', 'Above 20 years'],  # Property Lifespan
    ['Basic', 'Medium', 'High']  # Luxury Category
]

In [23]:
# Column transformer for preprocessing
preprocessor_for_onehotpca_ordinal_target_encoding = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Covered Area', 'Bedrooms', 'Bathrooms']),
        ('cat_nom_low_cardinality', OneHotEncoder(drop='first', sparse_output=False), ['Transaction Type', 'Type of Property', 'City', 'Commercial']), # drop to remove multicollinearity
        ('cat_nom_medium_cardinality', Pipeline([
            ('one-hot', OneHotEncoder(drop='first',sparse_output=False)),  # sparse_output=False outputs a dense NumPy array (all 0/1 stored explicitly), as next pipeline step is PCA that requires a dense array as input (PCA in scikit-learn does not accept sparse matrices.)
            ('pca', PCA(n_components=0.95))  # PCA after one-hot encoding of high cardinality features, keep enough components to explain 95% variance
            ]), ['Facing']),
        ('cat_nom_high_cardinality', ce.TargetEncoder(), ['Area']),  # Target encoding applied to high cardinality column
        ('cat_ord', Pipeline([
            ('ord_enc', OrdinalEncoder(categories=ordinal_categories)),
            ('scaler', StandardScaler())  # Scale ordinals for balanced influence
        ]), ['Furnished Type', 'Balconies', 'Floor Level', 'Property Lifespan', 'Luxury Category'])       
    ], 
    remainder='passthrough'
)

In [24]:
# Train models
model_output_onehot_ordinal_target = []
for model_name, model in model_dict.items():
    model_output_onehot_ordinal_target.append(scorer(X, y_transformed, model_name, model, preprocessor_for_onehotpca_ordinal_target_encoding))

In [25]:
# Results Dataframe
model_df = pd.DataFrame(
    model_output_onehot_ordinal_target,
    columns=['Algorithm','R2 Score','MAE']
).sort_values('MAE').reset_index(drop=True)

# Display
model_df

Unnamed: 0,Algorithm,R2 Score,MAE
0,CatBoostRegressor,0.940072,0.422087
1,HistGradientBoostingRegressor,0.938291,0.42648
2,RandomForestRegressor,0.934167,0.432124
3,LGBMRegressor,0.937967,0.433236
4,XGBRegressor,0.933978,0.433715
5,GradientBoostingRegressor,0.934308,0.442967
6,ExtraTreesRegressor,0.932601,0.445489
7,SVR,0.919576,0.514771
8,MLPRegressor,0.916932,0.524745
9,Ridge,0.911748,0.541722


#### Results:
CatBoostregressor has achieved the best R2 score of 0.94 explaining 94% of variance in the Price of the properties with an MAE of 0.42 Crores.

---------------

# Summary:

CatBoostRegressor with One-Hot encoding over nominal categorical features and Target encoding over high cardinal feature gave the least Mean absolute error of 0.42 Crores while explaining 94% variance in the Price of the properties.