In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Display all columns
pd.set_option('display.max_columns', None)

In [2]:
# Load the stage 6 property data
df = pd.read_csv(r'Output Files\stage_6_property_data.csv') 
df.head(3)

Unnamed: 0,City,Area,Type of Property,Transaction Type,Property Lifespan,Commercial,Covered Area,Bedrooms,Bathrooms,Balconies,House Help Room,Store Room,Puja Room,Study,dist_to_csia_airport_km,dist_to_thane_station_km,dist_to_andheri_station_km,dist_to_csmt_station_km,dist_to_nariman_point_km,dist_to_andheri_east_comm_km,dist_to_marine_drive_km,dist_to_navi_mumbai_airport_km,dist_to_vashi_station_km,dist_to_sanjay_gandhi_np_km,dist_to_phoenix_mall_kurla_km,dist_to_kokilaben_hospital_km,dist_to_dh_ambani_school_km,Furnished Type,Floor Level,Facing,Power Back Up,Lift,Club House,Swimming Pool,Park,Parking,Security,Water Storage,Vaastu Compliant,Visitor Parking,Intercom Facility,Maintenance Staff,Internet/Wi-Fi Connectivity,DTH Television Facility,Piped Gas,Jogging and Strolling Track,Price (Crores)
0,Thane,Kalyan West,Apartment,New Property,New construction,Y,763.571943,2,2.0,2.0,1,1,1,1,32.01,16.86,32.22,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33,Unfurnished,High rise (> 10),East,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.63
1,Thane,Kalyan West,Apartment,New Property,New construction,N,850.0,2,2.0,2.0,1,1,1,1,32.01,16.86,32.22,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33,Unfurnished,High rise (> 10),East,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.54
2,Thane,Kalyan West,Apartment,New Property,Less than 5 years,N,1050.0,2,2.0,3.0,1,1,1,1,32.01,16.86,32.22,45.19,47.27,31.16,45.7,30.01,23.31,27.87,30.9,33.78,34.33,Unfurnished,Mid rise (5 to 10),East,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.9


In [3]:
# Check shape
df.shape

(6280, 47)

--------------

In [4]:
# Independent features
X = df.drop(columns=['Price (Crores)'])

# Dependent feature [Target Variable]
y = df['Price (Crores)']

# Baseline models

#### Linear Regression

In [5]:
# log1p transform the right-skewed target variable Price
y_transformed = np.log1p(y)

In [6]:
# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Covered Area', 'Bedrooms', 'Bathrooms', 'dist_to_csia_airport_km', 'dist_to_thane_station_km', 'dist_to_andheri_station_km', 'dist_to_csmt_station_km', \
                                   'dist_to_nariman_point_km', 'dist_to_andheri_east_comm_km', 'dist_to_marine_drive_km', 'dist_to_navi_mumbai_airport_km', 'dist_to_vashi_station_km', \
                                   'dist_to_sanjay_gandhi_np_km', 'dist_to_phoenix_mall_kurla_km', 'dist_to_kokilaben_hospital_km', 'dist_to_dh_ambani_school_km']),
        
        ('cat', OrdinalEncoder(), ['City', 'Area', 'Type of Property', 'Transaction Type', 'Property Lifespan', 'Commercial', 'Balconies', 'Furnished Type', 'Floor Level', 'Facing'])
    ], 
    remainder='passthrough'
)

In [7]:
# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [8]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

# Average score
print('Average R2 score:', round(scores.mean(), 2))

# Standard deviation of scores
print('Standard deviation of scores:', round(scores.std(), 2))

Average R2 score: 0.85
Standard deviation of scores: 0.01


##### Linear regression model has achieved an R2 score of 0.85 meaning it can explain 85% variance in the Price of the properties. Also, the standard deviation of scores is 0.01 implying that the scores were consistent.

Check the Mean absolute error for the Linear regression model:

In [9]:
from sklearn.model_selection import train_test_split

# Split the data in training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [10]:
# Fit the training data in pipeline
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
# Prediction on test data
y_pred = pipeline.predict(X_test)

In [12]:
# Check the mean absolute error
from sklearn.metrics import mean_absolute_error

print('Mean absolute error of the Linear regression model is:', round(mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)), 2))

Mean absolute error of the Linear regression model is: 0.64


##### The baseline Linear regression model is having a high Mean Absolute Error of 0.64 Crores. Note that for property price prediction the 'Mean Absolute Error' is the most important performance metric for the regression model.

------------------

#### Support Vector Machine (SVM)

In [13]:
# Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf', C=1.0, epsilon=0.1))
])

In [14]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

# Average score
print('Average R2 score:', round(scores.mean(), 2))

# Standard deviation of scores
print('Standard deviation of scores:', round(scores.std(), 2))

Average R2 score: 0.88
Standard deviation of scores: 0.01


##### Accuracy has increased slightly using Support Vector Machine regressor.

Check the Mean absolute error for the SVM regression model:

In [15]:
# Split the data in training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [16]:
# Fit the training data in pipeline
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [17]:
# Predict on test data
y_pred = pipeline.predict(X_test)

In [18]:
# Check the mean absolute error
print('Mean absolute error of the SVM regression model is:', round(mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)), 2))

Mean absolute error of the SVM regression model is: 0.63


##### The Mean absolute error is still high.

---------------

Performance of the price predictive model (both R2 accuracy and Mean absolute error) can be further improved by trying out:
- Advanced ML algorithms.
- Hyperparameter tuning.