# Building a baseline model


In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the post-feature-selection property dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="gurgaon_properties_post_feature_selection.csv")

In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,36.0,3.0,2.0,2.0,1.0,850.0,0.0,0.0,0.0,1.0,1.0,0.82
1,0.0,95.0,2.0,2.0,2.0,1.0,1226.0,1.0,0.0,0.0,1.0,2.0,0.95
2,0.0,103.0,2.0,2.0,1.0,1.0,1000.0,0.0,0.0,0.0,1.0,0.0,0.32
3,0.0,99.0,3.0,4.0,4.0,3.0,1615.0,1.0,0.0,1.0,0.0,2.0,1.6
4,0.0,5.0,2.0,2.0,1.0,3.0,582.0,0.0,1.0,0.0,0.0,2.0,0.48


In [57]:
# Separate the target (`price`) from the predictors.
# Predictors to one-hot encode later: sector, balcony, agePossession,
# furnishing type, luxury category, floor category.
y = df['price']
x = df.drop(labels=['price'], axis=1)


In [58]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [59]:
# Show the column names of the loaded frame (reference for the column lists used below).
df.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category', 'price'],
      dtype='object')

In [60]:
# Features that are handed to the one-hot encoder in the preprocessing step.
columns_to_encode = [
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]

In [61]:
# Train on log(1 + price) rather than raw price; predictions are mapped back
# to the price scale with np.expm1 before the error is measured.
y_transformed = np.log1p(y)

In [62]:
# Column-wise preprocessing: standardize the numeric features and one-hot
# encode the categorical ones (dropping the first level of each encoded column).
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room'],
        ),
        ('cat', OneHotEncoder(drop='first'), columns_to_encode),
    ],
    # Any column not named in a transformer above is passed through unchanged.
    remainder='passthrough',
)

In [63]:
# Assemble preprocessing + SVR (RBF kernel) into a single estimator.
# The class is imported under an alias because this cell rebinds the name
# `Pipeline` to the estimator object; without the alias, re-running the cell
# would call the previous pipeline instance and raise
# "TypeError: 'Pipeline' object is not callable".
from sklearn.pipeline import Pipeline as SklearnPipeline
pipeline = SklearnPipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf')),
])
# Later cells refer to the estimator as `Pipeline`, so keep that name bound to it.
Pipeline = pipeline

In [64]:
# Score the pipeline with shuffled 10-fold cross-validation on the
# log-transformed target. No `scoring` argument is given, so cross_val_score
# falls back to the estimator's own default scorer.
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(estimator=Pipeline, X=x, y=y_transformed, cv=kfold)

In [65]:
# Average score across the cross-validation folds.
np.mean(scores)

0.8845360715052786

In [66]:
# Spread of the score across the cross-validation folds.
np.std(scores)

0.014784881452420021

In [67]:
# Hold out 20% of the rows as a test set, using a fixed seed so the split is repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y_transformed, test_size=0.2, random_state=42
)
# Print the shapes in the order: x_train, y_train, x_test, y_test.
print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

(2843, 12) (2843,) (711, 12) (711,)


In [68]:
# Fit the pipeline on the training split (target is on the log1p scale).
Pipeline.fit(X=x_train, y=y_train)

In [69]:
# Predict the held-out rows; the values are still on the log1p scale here.
y_pred = Pipeline.predict(X=x_test)

In [70]:
# Map the log1p-scale predictions back to the original price scale.
# The predictions are recomputed from the fitted model instead of transforming
# `y_pred` in place, so re-running this cell cannot apply expm1 twice.
y_pred = np.expm1(Pipeline.predict(x_test))

In [71]:
# Mean absolute error on the original price scale: the held-out targets are
# converted back from log1p with expm1 to match the back-transformed predictions.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.5324591082613233

### Our baseline model performs well, achieving a **Mean Absolute Error (MAE) of 0.53** on the original price scale for the held-out test set. The mean 10-fold **cross-validation score** (R², the default scorer for a regressor) is **above 0.88**, with a **standard deviation of approximately 0.015 (about 1.5%)**, indicating stable performance even with the default parameter settings of the **SVR algorithm**.
