In [71]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [72]:
data = pd.read_csv('gurgaon-post-feature-selection.csv')

In [73]:
data.sample(5)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,study room,store room,furnished_type,luxury_category,floor_category,price
2226,0.0,62.0,3,3,2.0,1.0,1630.0,0,0,0,0,1.0,2.0,2.55
1788,0.0,65.0,3,3,3.0,3.0,1524.0,1,1,0,1,2.0,1.0,2.36
2269,0.0,71.0,3,3,2.0,3.0,1300.0,0,0,0,0,1.0,1.0,1.25
2844,0.0,95.0,2,2,0.0,1.0,758.0,0,0,0,0,1.0,1.0,0.27
1953,0.0,66.0,3,4,3.0,3.0,2200.0,0,0,0,0,1.0,1.0,3.9


In [74]:
# one hot encoding -> sector, balcony, agepossession, furnished_type, luxury_category, floor category

In [75]:
X = data.drop(columns = 'price')
y = data['price']

In [76]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [77]:
# apply log1p transform on price 
y_transformed = np.log1p(y)

In [78]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size = 0.2, random_state = 42)

In [79]:
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnished_type', 'luxury_category', 'floor_category']

In [80]:
# creating columntransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ('numerical', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('category', OneHotEncoder(handle_unknown = 'ignore'), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [81]:
# create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])

In [82]:
# K-Fold cross-validation
kfold = KFold(n_splits = 10, shuffle = True, random_state = 42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2')

In [83]:
scores.mean()

np.float64(0.8835047358222882)

In [84]:
scores.std()

np.float64(0.013756844646369358)

In [85]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size = 0.2, random_state = 42)

In [86]:
pipeline.fit(X_train, y_train)

In [87]:
y_pred = pipeline.predict(X_test)

In [88]:
y_pred = np.expm1(y_pred)

In [89]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test), y_pred)

0.591068452227957