In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Display all columns
pd.set_option('display.max_columns', None)

In [2]:
# Load the stage 6 property data
df = pd.read_csv(r'Output Files\stage_6_property_data.csv') 
df.head(3)

Unnamed: 0,City,Area,Type of Property,Transaction Type,Property Lifespan,Commercial,Covered Area,Bedrooms,Bathrooms,Balconies,House Help Room,Store Room,Puja Room,Study,dist_to_csia_airport_km,dist_to_thane_station_km,dist_to_andheri_station_km,dist_to_csmt_station_km,dist_to_nariman_point_km,dist_to_andheri_east_comm_km,dist_to_marine_drive_km,dist_to_navi_mumbai_airport_km,dist_to_vashi_station_km,dist_to_sanjay_gandhi_np_km,dist_to_phoenix_mall_kurla_km,dist_to_kokilaben_hospital_km,dist_to_dh_ambani_school_km,Furnished Type,Floor Level,Facing,Power Back Up,Lift,Club House,Swimming Pool,Park,Parking,Security,Water Storage,Vaastu Compliant,Visitor Parking,Intercom Facility,Maintenance Staff,Internet/Wi-Fi Connectivity,DTH Television Facility,Piped Gas,Jogging and Strolling Track,Price (Crores)
0,Thane,Kalyan West,Apartment,New Property,New construction,Y,763.571943,2,2.0,2.0,1,1,1,1,32.01,16.86,32.22,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33,Unfurnished,High rise (> 10),East,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.63
1,Thane,Kalyan West,Apartment,New Property,New construction,N,850.0,2,2.0,2.0,1,1,1,1,32.01,16.86,32.22,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33,Unfurnished,High rise (> 10),East,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.54
2,Thane,Kalyan West,Apartment,New Property,Less than 5 years,N,1050.0,2,2.0,3.0,1,1,1,1,32.01,16.86,32.22,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33,Unfurnished,Mid rise (5 to 10),East,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.9


In [3]:
# Check shape
df.shape

(6280, 47)

--------------

### Nature of Categorical features:

In [4]:
# Print Data type of features
df.dtypes

City                               object
Area                               object
Type of Property                   object
Transaction Type                   object
Property Lifespan                  object
Commercial                         object
Covered Area                      float64
Bedrooms                            int64
Bathrooms                         float64
Balconies                          object
House Help Room                     int64
Store Room                          int64
Puja Room                           int64
Study                               int64
dist_to_csia_airport_km           float64
dist_to_thane_station_km          float64
dist_to_andheri_station_km        float64
dist_to_csmt_station_km           float64
dist_to_nariman_point_km          float64
dist_to_andheri_east_comm_km      float64
dist_to_marine_drive_km           float64
dist_to_navi_mumbai_airport_km    float64
dist_to_vashi_station_km          float64
dist_to_sanjay_gandhi_np_km       

- Transaction Type: Nominal feature with no ordering.
- Type of Property: Nominal feature with no ordering.
- City: Nominal feature with no ordering.
- Area:  Nominal feature with no ordering. [High Cardinality: 53 unique values]
- Commercial: Nominal feature with no ordering.
- Furnished Type: Feature with inherent ordering.
- Balconies: Feature with inherent ordering.
- Facing: Nominal feature with no ordering.
- Floor Level: Feature with inherent ordering.
- Property Lifespane: Feature with inherent ordering.
- Luxury Category: Feature with inherent ordering.

In [6]:
# Numerical features
numerical_columns = ['Covered Area', 'Bedrooms', 'Bathrooms', 'dist_to_csia_airport_km', 'dist_to_thane_station_km', 'dist_to_andheri_station_km', 'dist_to_csmt_station_km', \
                    'dist_to_nariman_point_km', 'dist_to_andheri_east_comm_km', 'dist_to_marine_drive_km', 'dist_to_navi_mumbai_airport_km', 'dist_to_vashi_station_km', \
                    'dist_to_sanjay_gandhi_np_km', 'dist_to_phoenix_mall_kurla_km', 'dist_to_kokilaben_hospital_km', 'dist_to_dh_ambani_school_km']

# Nominal categorical fetures for one-hot encoding
nominal_columns_to_one_hot_encode = ['Transaction Type', 'Type of Property', 'City', 'Area', 'Commercial', 'Facing']

# Categorical features with inherent ordering to ordinally encode
ordered_columns_to_ordinal_encode = ['Furnished Type', 'Balconies', 'Floor Level', 'Property Lifespan']

# Define specific ordering for each ordinal/ordered column
ordinal_categories = [
    ['Unfurnished', 'Semi-Furnished', 'Furnished'],  # Furnished Type
    ['1.0', '2.0', '3.0', '3+'],  # Balconies
    ['Low rise (< 5)', 'Mid rise (5 to 10)', 'High rise (> 10)'],  # Floor Level
    ['New construction', 'Less than 5 years', '5 to 10 years', '10 to 15 years', '15 to 20 years', 'Above 20 years'],  # Property Lifespan
]


--------------

In [7]:
# Independent features
X = df.drop(columns=['Price (Crores)'])

# Dependent feature [Target Variable]
y = df['Price (Crores)']

# Baseline models

#### Linear Regression

- Standardise the data as Linear models are scale dependant.
- One-Hot encoding for nominal categorical features having no inherent ordering.
- Apply log-transformation to the target Price variable as it is right-skewed.

In [8]:
# log1p transform the right-skewed target variable Price
y_transformed = np.log1p(y)

In [9]:
# ColumnTransformer for pre-processing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat_nom', OneHotEncoder(drop='first'), nominal_columns_to_one_hot_encode),
        ('cat_ord', Pipeline([
            ('ord_enc', OrdinalEncoder(categories=ordinal_categories)),
            ('scaler', StandardScaler())  # Scale ordinals for balanced influence
        ]), ordered_columns_to_ordinal_encode)
    ],
    remainder='passthrough'
)

In [10]:
# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [11]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

# Average score
print('Average R2 score:', round(scores.mean(), 2))

# Standard deviation of scores
print('Standard deviation of scores:', round(scores.std(), 2))

Average R2 score: 0.92
Standard deviation of scores: 0.01


##### Linear regression model has achieved a very good R2 score of 0.92 meaning it can explain 92% variance in the Price of the properties. Also, the standard deviation of scores is 0.01 implying that the scores were consistent.

Check the Mean absolute error for the Linear regression model:

In [12]:
from sklearn.model_selection import train_test_split

# Split the data in training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [13]:
# Fit the training data in pipeline
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat_nom', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Unfurnished', 'Semi-Furnished', ...], ['1.0', '2.0', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [14]:
# Prediction on test data
y_pred = pipeline.predict(X_test)

In [15]:
# Check the mean absolute error
from sklearn.metrics import mean_absolute_error

print('Mean absolute error of the Linear regression model is:', round(mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)), 2))

Mean absolute error of the Linear regression model is: 0.5


------------------

#### Support Vector Machine (SVM)

- SVM is a distance-based model that relies on geometric distances between data points in feature space to find the optimal separating hyperplane. The nominal categorical features with no inherent ordering has to be one-hot encoded.

- If we ordinally encode nominal categories as integers (e.g., Thane=0, Mumbai=1), the model interprets these as points on a continuous numeric scale. This imposes a fake ordering and creates artificial distances (e.g., Mumbai is 1 unit farther from Thane), which don’t reflect any real relationship. As a result, the SVM’s distance calculations and thus the decision boundary gets biased.

In [16]:
# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf', C=1.0, epsilon=0.1))
])

In [17]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

# Average score
print('Average R2 score:', round(scores.mean(), 2))

# Standard deviation of scores
print('Standard deviation of scores:', round(scores.std(), 2))

Average R2 score: 0.93
Standard deviation of scores: 0.01


##### Accuracy has increased slightly using Support Vector Machine regressor.

Check the Mean absolute error for the SVM regression model:

In [18]:
# Split the data in training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [19]:
# Fit the training data in pipeline
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat_nom', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,"[['Unfurnished', 'Semi-Furnished', ...], ['1.0', '2.0', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [20]:
# Predict on test data
y_pred = pipeline.predict(X_test)

In [21]:
# Check the mean absolute error
print('Mean absolute error of the SVM regression model is:', round(mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)), 2))

Mean absolute error of the SVM regression model is: 0.46


The Mean absolute error has come down significantly.

---------------

Performance of the price predictive model can be further improved by trying out:
- Different algorithms.
- Hyperparameter tuning.