In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
from pathlib import Path

path = Path.cwd().parent
data_path = path / 'data/raw'

In [25]:
from sklearn import set_config

set_config(transform_output="pandas")

In [26]:
df = pd.read_csv(f"{data_path}/final_data.csv")

In [8]:
df.head()

Unnamed: 0,propertytype,bedrooms,bathrooms,balconies,furnished,transactiontype,ageofcons,additionalRooms,region,locality,superbuiltupareasqft,price
0,multistorey apartment,4.0,4.0,4.0,furnished,resale,above 20 years,1,noida,sector 108,3850.0,1.73
1,multistorey apartment,2.0,2.0,2.0,furnished,new property,less than 5 years,1,noida,sector 76,1180.0,1.5
2,multistorey apartment,3.0,3.0,2.0,unfurnished,resale,15 to 20 years,1,noida,sector 29,1800.0,2.5
3,builder floor apartment,3.0,3.0,1.0,unfurnished,resale,above 20 years,1,noida,sector 49,1050.0,0.43
4,multistorey apartment,3.0,3.0,3.0,unfurnished,resale,less than 5 years,0,noida,sector 143,940.0,0.62


In [9]:
df.shape

(47734, 12)

In [27]:
X = df.drop(columns=["price"])
y = df["price"]

In [28]:
# train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
     

In [29]:
print("The size of train data is",X_train.shape)
print("The shape of test data is",X_test.shape)

The size of train data is (38187, 11)
The shape of test data is (9547, 11)


In [13]:
X.head()

Unnamed: 0,propertytype,bedrooms,bathrooms,balconies,furnished,transactiontype,ageofcons,additionalRooms,region,locality,superbuiltupareasqft
0,multistorey apartment,4.0,4.0,4.0,furnished,resale,above 20 years,1,noida,sector 108,3850.0
1,multistorey apartment,2.0,2.0,2.0,furnished,new property,less than 5 years,1,noida,sector 76,1180.0
2,multistorey apartment,3.0,3.0,2.0,unfurnished,resale,15 to 20 years,1,noida,sector 29,1800.0
3,builder floor apartment,3.0,3.0,1.0,unfurnished,resale,above 20 years,1,noida,sector 49,1050.0
4,multistorey apartment,3.0,3.0,3.0,unfurnished,resale,less than 5 years,0,noida,sector 143,940.0


In [30]:
ohe_encode = ['transactiontype','region','propertytype','furnished','ageofcons']
target_encode = ['locality']
robust_scaler = ['bedrooms','bathrooms','balconies','superbuiltupareasqft']

In [31]:
pt = FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=True)
y_train_pt = pt.fit_transform(y_train.values.reshape(-1,1))
y_test_pt = pt.transform(y_test.values.reshape(-1,1))

In [32]:
preprocessor = ColumnTransformer(
    [
        ('ohe_encode', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ohe_encode),
        ('robust_scaler', RobustScaler(), robust_scaler)
    ],remainder="passthrough", n_jobs=-1, force_int_remainder_cols=False,verbose_feature_names_out=False
)


preprocessor.set_output(transform="pandas")

0,1,2
,transformers,"[('ohe_encode', ...), ('robust_scaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,-1
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False


In [33]:
# Creating a pipeline
pipeline = Pipeline([
    ('target_encoder', ce.TargetEncoder(cols=target_encode)),
    ('preprocessor', preprocessor)
    
])

In [34]:
# do data preprocessing

X_train_trans = pipeline.fit_transform(X_train, y_train)

X_test_trans = pipeline.transform(X_test)

In [35]:
from sklearn.svm import SVR

svm = SVR(kernel='rbf')


In [37]:
from sklearn.model_selection import KFold, cross_val_score

# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(svm, X_train_trans, y_train_pt, cv=kfold, scoring='neg_mean_squared_error')

In [39]:
-scores.mean()

np.float64(0.03694583326339191)