In [128]:
import numpy as np
import pandas as pd

In [129]:
df = pd.read_csv('gurgaon_properties_post_feature_selection.csv')

In [130]:
df.head()

Unnamed: 0,sector,bedRoom,bathroom,balcony,agePossession,property_type,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,27.0,4,5,4.0,1.0,0.0,3333.0,1,0,2,0.0,2.0,6.0
1,9.0,2,2,3.0,3.0,0.0,1281.0,0,0,1,0.0,1.0,1.0
2,64.0,5,5,4.0,3.0,1.0,5580.0,1,1,2,2.0,2.0,9.25
3,95.0,2,2,2.0,3.0,0.0,850.0,0,0,2,2.0,2.0,0.07
4,71.0,4,4,4.0,1.0,1.0,3950.0,0,0,1,1.0,0.0,8.5


In [131]:
#  one hot encode -> sector, balcony, agePosession, furnishing type, luxury category, floor category

In [132]:
X = df.drop(columns=['price'])
y = df['price']

In [133]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [134]:
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']


In [135]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [154]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(handle_unknown="ignore",drop='first'), columns_to_encode)
    ], 
    remainder='passthrough'
)


In [155]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])


In [156]:
print(X.shape, y_transformed.shape)


(3504, 12) (3504,)


In [157]:
print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['property_type', 'bedRoom',
                                                   'bathroom', 'built_up_area',
                                                   'servant room',
                                                   'store room']),
                                                 ('cat',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'),
                                                  ['sector', 'balcony',
                                                   'agePossession',
                                                   'furnishing_type',
                                                   'luxury_category',
                                  

In [158]:
print(X.isnull().sum())

sector             0
bedRoom            0
bathroom           0
balcony            0
agePossession      0
property_type      0
built_up_area      0
servant room       0
store room         0
furnishing_type    0
luxury_category    0
floor_category     0
dtype: int64


In [159]:
print(y_transformed.isnull().sum())

0


In [160]:
y_transformed.head()

0    1.945910
1    0.693147
2    2.327278
3    0.067659
4    2.251292
Name: price, dtype: float64

In [161]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [162]:
scores.mean()

0.8559706188949914

In [163]:
scores.std()

0.031556167023069966

In [164]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [165]:
pipeline.fit(X_train, Y_train)

In [166]:
y_pred = pipeline.predict(X_test)

In [167]:
y_pred = np.expm1(y_pred)

In [168]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(Y_test), y_pred)

0.6065101274603454