In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the post-feature-selection properties dataset from the working directory.
df = pd.read_csv("gurgaon_properties_post_feature_selection.csv")

In [3]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_score,price
0,0.0,36.0,3.0,2.0,2.0,1.0,1.0,850.0,0.0,0.0,0.0,1.0,0.82
1,0.0,95.0,2.0,2.0,2.0,2.0,1.0,1226.0,1.0,0.0,0.0,1.0,0.95
2,0.0,103.0,2.0,2.0,1.0,0.0,1.0,1000.0,0.0,0.0,0.0,1.0,0.32
3,0.0,99.0,3.0,4.0,4.0,2.0,3.0,1615.0,1.0,0.0,1.0,0.0,1.6
4,0.0,5.0,2.0,2.0,1.0,2.0,3.0,582.0,0.0,1.0,0.0,0.0,0.48


In [4]:
# Columns handled as categorical features (one-hot encoded downstream).
col_ohe = [
    "property_type",
    "sector",
    "agePossession",
    "furnishing_type",
    "luxury_score",
    "floorNum",
]

In [5]:
# Inspect only the categorical columns.
df.loc[:, col_ohe]

Unnamed: 0,property_type,sector,agePossession,furnishing_type,luxury_score,floorNum
0,0.0,36.0,1.0,0.0,1.0,1.0
1,0.0,95.0,1.0,0.0,1.0,2.0
2,0.0,103.0,1.0,0.0,1.0,0.0
3,0.0,99.0,3.0,1.0,0.0,2.0
4,0.0,5.0,3.0,0.0,0.0,2.0
...,...,...,...,...,...,...
3549,0.0,90.0,3.0,0.0,2.0,2.0
3550,1.0,12.0,3.0,0.0,0.0,1.0
3551,0.0,23.0,0.0,1.0,2.0,2.0
3552,1.0,44.0,0.0,0.0,2.0,2.0


In [10]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import KFold,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [11]:
# Feature matrix: every column except the target.
X = df.drop(columns="price")

In [12]:
# Preview the first three feature rows.
X.iloc[:3]

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_score
0,0.0,36.0,3.0,2.0,2.0,1.0,1.0,850.0,0.0,0.0,0.0,1.0
1,0.0,95.0,2.0,2.0,2.0,2.0,1.0,1226.0,1.0,0.0,0.0,1.0
2,0.0,103.0,2.0,2.0,1.0,0.0,1.0,1000.0,0.0,0.0,0.0,1.0


In [14]:
# Target vector: the price column.
y = df["price"]
# Show the first two target values.
y.iloc[:2]

0    0.82
1    0.95
Name: price, dtype: float64

In [15]:
# Train on log(1 + price); predictions are mapped back with np.expm1 later on.
y_transformed = np.log1p(y)

In [28]:
# Column transformer for preprocessing:
#   * 'num' - standardise the listed numeric features.
#   * 'cat' - one-hot encode the columns named in `col_ohe`.
#   * any column not listed in either group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        # handle_unknown='ignore': a category that is absent from the data the
        # encoder was fitted on (for example a rare sector missing from one
        # cross-validation training fold) is encoded as all zeros instead of
        # raising an error at transform/predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), col_ohe),
    ],
    remainder='passthrough',
)

In [29]:
# Baseline regressor: support vector regression with an RBF kernel.
svr = SVR(kernel="rbf")

In [31]:
# Chain preprocessing and the regressor into a single estimator.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", svr),
    ]
)

In [32]:
# 10-fold cross-validation (shuffled, fixed seed), scored with R^2
# against the log-transformed target.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring="r2",
    cv=kfold,
)

In [33]:
# Average R^2 across the folds.
np.mean(scores)

0.8842548911061348

In [36]:
# Spread of R^2 across the folds.
np.std(scores)

0.014655824854239748

In [34]:
# The R² score of the base model is good, but we can't rely on the R² score alone.

In [35]:
# Calculate the mean absolute error (MAE)

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [44]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Fit preprocessing and the regressor on the training split (log-scale target).
pipeline.fit(X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [47]:
# Predictions for the held-out rows, still on the log1p scale.
y_pred = pipeline.predict(X_test)

In [48]:
# Map the predictions back from the log1p scale to the original price scale.
# They are recomputed from the pipeline rather than transformed in place, so
# re-running this cell cannot apply expm1 a second time to already
# back-transformed values.
y_pred = np.expm1(pipeline.predict(X_test))

In [51]:
# MAE on the original price scale: the log-scale test target is inverted with expm1 first.
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.522409997980398

In [52]:
# Although our R² score is good, this MAE (computed on the original price scale) shows how incorrect and unreliable the model's individual predictions can be.