In [1]:
# Import Libraries
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the stage 6 property data.
# The relative path uses forward slashes, which Python accepts on Windows as well as on
# Linux/macOS; a backslash-separated literal would only resolve on Windows.
df = pd.read_csv('Output Files/stage_6_property_data.csv')

# Preview the first three rows
df.head(3)

Unnamed: 0,City,Area,Type of Property,Transaction Type,Property Lifespan,Commercial,Covered Area,Bedrooms,Bathrooms,Balconies,...,Water Storage,Vaastu Compliant,Visitor Parking,Intercom Facility,Maintenance Staff,Internet/Wi-Fi Connectivity,DTH Television Facility,Piped Gas,Jogging and Strolling Track,Price (Crores)
0,Thane,Kalyan West,Apartment,New Property,New construction,Y,763.571943,2,2.0,2.0,...,1,1,1,1,1,1,1,1,1,0.63
1,Thane,Kalyan West,Apartment,New Property,New construction,N,850.0,2,2.0,2.0,...,1,1,1,1,1,1,1,1,1,0.54
2,Thane,Kalyan West,Apartment,New Property,Less than 5 years,N,1050.0,2,2.0,3.0,...,1,1,1,1,1,1,1,1,1,0.9


In [3]:
# Check shape: (number of rows, number of columns) of the loaded frame, target column included
df.shape

(6280, 47)

In [4]:
# Dependent feature [target variable]: the property price column
y = df['Price (Crores)']

# Independent features: every column except the target
X = df.drop('Price (Crores)', axis=1)

# Work with log(1 + price) as the modelling target
y_transformed = np.log1p(y)

In [5]:
# Hold out 20% of the rows as the final test set; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

# Model Building and Hyperparameter Tuning

In [6]:
# Explicit level ordering for every ordinal column.
# The outer list is positional: its entries line up with the columns handed to the
# OrdinalEncoder (Property Lifespan, Balconies, Furnished Type, Floor Level).
lifespan_levels = [
    'New construction',
    'Less than 5 years',
    '5 to 10 years',
    '10 to 15 years',
    '15 to 20 years',
    'Above 20 years',
]
balcony_levels = ['1.0', '2.0', '3.0', '3+']
furnishing_levels = ['Unfurnished', 'Semi-Furnished', 'Furnished']
floor_levels = ['Low rise (< 5)', 'Mid rise (5 to 10)', 'High rise (> 10)']

ordinal_categories = [lifespan_levels, balcony_levels, furnishing_levels, floor_levels]

In [7]:
# Column transformer for preprocessing: each group of columns gets its own transformer and
# any column not listed below is passed through unchanged (remainder='passthrough').

# Numeric features -> standardised
numeric_cols = [
    'Covered Area', 'Bedrooms', 'Bathrooms',
    'dist_to_csia_airport_km', 'dist_to_thane_station_km', 'dist_to_andheri_station_km',
    'dist_to_csmt_station_km', 'dist_to_nariman_point_km', 'dist_to_andheri_east_comm_km',
    'dist_to_marine_drive_km', 'dist_to_navi_mumbai_airport_km', 'dist_to_vashi_station_km',
    'dist_to_sanjay_gandhi_np_km', 'dist_to_phoenix_mall_kurla_km',
    'dist_to_kokilaben_hospital_km', 'dist_to_dh_ambani_school_km',
]

# Low-cardinality nominal features -> one-hot, first level dropped to remove multicollinearity
low_cardinality_cols = ['City', 'Type of Property', 'Transaction Type', 'Commercial']

# Medium-cardinality nominal feature -> one-hot followed by PCA.
# sparse_output=False makes the encoder emit a dense NumPy array for the PCA step, and
# n_components=0.95 keeps enough components to explain 95% of the variance.
medium_cardinality_cols = ['Facing']
medium_cardinality_steps = Pipeline([
    ('one-hot', OneHotEncoder(drop='first', sparse_output=False)),
    ('pca', PCA(n_components=0.95)),
])

# High-cardinality nominal feature -> target encoding
high_cardinality_cols = ['Area']

# Ordinal features -> integer codes in the order given by ordinal_categories,
# then scaled for balanced influence
ordinal_cols = ['Property Lifespan', 'Balconies', 'Furnished Type', 'Floor Level']
ordinal_steps = Pipeline([
    ('ord_enc', OrdinalEncoder(categories=ordinal_categories)),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat_nom_low_cardinality', OneHotEncoder(drop='first', sparse_output=False), low_cardinality_cols),
        ('cat_nom_medium_cardinality', medium_cardinality_steps, medium_cardinality_cols),
        ('cat_nom_high_cardinality', ce.TargetEncoder(), high_cardinality_cols),
        ('cat_ord', ordinal_steps, ordinal_cols),
    ],
    remainder='passthrough',
)

In [8]:
# K-fold cross validation: 10 shuffled folds with a fixed seed
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Parameter space; the 'regressor__' prefix addresses the CatBoost step of the pipeline
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__depth': [4, 6, 8, 10],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__l2_leaf_reg': [1, 3, 5, 7, 9],
}

# CatBoost regressor trained and evaluated on RMSE.
# NOTE(review): early_stopping_rounds is set, but no eval_set is passed to any fit() call in
# this notebook; presumably the setting has no effect without a validation set -- confirm
# against the CatBoost documentation.
catboost_model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    early_stopping_rounds=50,
    verbose=False,
    random_state=42,
)

# Build the pipeline: preprocessing followed by the regressor, tuned as a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', catboost_model),
])

# Randomized search: 20 random parameter combinations, each scored by R² over the 10 folds,
# run on all available cores
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=20,
    scoring='r2',
    cv=kfold,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

In [9]:
# Run the randomized search on the training split only; the held-out test split is not used here
random_search.fit(X_train, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [10]:
# Pull the winning configuration and its cross-validated score out of the finished search
best_params = random_search.best_params_
best_cv_score = random_search.best_score_

# Report both
print("Best Params:", best_params)
print("Best CV Score:", best_cv_score)

Best Params: {'regressor__n_estimators': 300, 'regressor__learning_rate': 0.1, 'regressor__l2_leaf_reg': 3, 'regressor__depth': 8}
Best CV Score: 0.9394561724411854


In [11]:
# Take the best pipeline found by the search as the final pipeline.
# This binds the search's own best_estimator_ object (no copy is made), so any later
# fit() call on final_pipeline refits random_search.best_estimator_ as well.
final_pipeline = random_search.best_estimator_

In [12]:
# Train the final pipeline on the full dataset (training and test rows together).
# NOTE(review): the MAE check further down calls final_pipeline.fit(X_train, y_train) on this
# same object, which replaces this full-data fit. The model that is evaluated and exported is
# therefore the one trained on X_train only -- confirm which training set is intended.
final_pipeline.fit(X,y_transformed)

#### Check the mean absolute error of the final price prediction model

In [13]:
# Check MAE on the hold-out split.
# Recreate the 80/20 split (same arguments and seed as the earlier split), refit the
# pipeline on the training part and predict the test part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_transformed, test_size=0.2, random_state=42
)

final_pipeline.fit(X_train, y_train)

# Predictions come back on the log1p scale
y_pred = final_pipeline.predict(X_test)

# Undo the log1p transform on both sides so the error is expressed in Crores
mae_crores = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))
print('Mean absolute error of final model is:', round(mae_crores, 2))

Mean absolute error of final model is: 0.41


In [14]:
# R² of the final model on the hold-out split, computed on prices mapped back from the log1p scale
r2_original_scale = r2_score(np.expm1(y_test), np.expm1(y_pred))
print('R² score of final model is:', round(r2_original_scale, 2))

R² score of final model is: 0.93


In [15]:
# Save the training dataset of the final model pipeline.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS
# (backslash separators only resolve on Windows).
# Note: y_train holds log1p-transformed prices, although the saved column keeps the
# 'Price (Crores)' header of the original series.
X_train.to_csv('Output Files/Price Prediction model files/X_train_for_CatBoost.csv', index=False)
y_train.to_csv('Output Files/Price Prediction model files/y_train_for_CatBoost.csv', index=False)

In [16]:
# Save the testing dataset of the final model pipeline.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS
# (backslash separators only resolve on Windows).
# Note: y_test holds log1p-transformed prices, although the saved column keeps the
# 'Price (Crores)' header of the original series.
X_test.to_csv('Output Files/Price Prediction model files/X_test_for_CatBoost.csv', index=False)
y_test.to_csv('Output Files/Price Prediction model files/y_test_for_CatBoost.csv', index=False)

------------------

### Predictions for query using final model

In [17]:
# Make query dataframe: a single property whose feature values are listed positionally,
# in the same order as X.columns
query_values = [
    'Mumbai', 'Goregaon East', 'Apartment', 'New Property', 'New construction', 'N', 590, 1, 1, '1.0', 1, 1, 1, 1,
    8.92, 12.75, 5.59, 25.57, 27.26, 6.07, 25.29, 29.36, 18.76, 6.77, 10.41, 6.09, 12.16,
    'Unfurnished', 'High rise (> 10)', 'East',
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
]
query = pd.DataFrame(data=[query_values], columns=X.columns)
query

Unnamed: 0,City,Area,Type of Property,Transaction Type,Property Lifespan,Commercial,Covered Area,Bedrooms,Bathrooms,Balconies,...,Security,Water Storage,Vaastu Compliant,Visitor Parking,Intercom Facility,Maintenance Staff,Internet/Wi-Fi Connectivity,DTH Television Facility,Piped Gas,Jogging and Strolling Track
0,Mumbai,Goregaon East,Apartment,New Property,New construction,N,590,1,1,1.0,...,1,1,1,1,1,1,1,1,1,1


In [18]:
# Predict for query: the pipeline predicts on the log1p scale, so expm1 maps the value back
# to Crores before rounding to two decimals
predicted_price = round(np.expm1(final_pipeline.predict(query)[0]), 2)
print(f'The estimated price of the query property is approximately {predicted_price} Crores')

The estimated price of the query property is approximatley 0.9 Crores


-----------

# Export the model

In [19]:
from joblib import dump, load

# Dump the final model (the fitted pipeline: preprocessing + CatBoost regressor).
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (backslash separators only resolve on Windows).
dump(final_pipeline, 'Output Files/Price Prediction model files/predict_price_model_catBoost.joblib')


['Output Files\\Price Prediction model files\\predict_price_model_catBoost.joblib']

In [20]:
# Load the exported pipeline back from disk.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
# Only load joblib files from a trusted source: loading can execute arbitrary code.
model = load('Output Files/Price Prediction model files/predict_price_model_catBoost.joblib')

# Predict for query property with the reloaded model (expm1 undoes the log1p target transform)
print(f'The estimated price of the query property is approximately {round(np.expm1(model.predict(query)[0]), 2)} Crores')

The estimated price of the query property is approximatley 0.9 Crores
