In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA


In [4]:
# Load the post-feature-selection dataset (path is relative to the working directory).
data_file = 'gurgaon_properties_post_feature_selection.csv'
df = pd.read_csv(data_file)

In [5]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,sector,bedRoom,bathroom,balcony,agePossession,property_type,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,27.0,4,5,4.0,1.0,0.0,3333.0,1,0,2,0.0,2.0,6.0
1,9.0,2,2,3.0,3.0,0.0,1281.0,0,0,1,0.0,1.0,1.0
2,64.0,5,5,4.0,3.0,1.0,5580.0,1,1,2,2.0,2.0,9.25
3,95.0,2,2,2.0,3.0,0.0,850.0,0,0,2,2.0,2.0,0.07
4,71.0,4,4,4.0,1.0,1.0,3950.0,0,0,1,1.0,0.0,8.5


In [6]:
# Distribution of the numeric furnishing codes before they are relabelled.
df['furnishing_type'].value_counts()

furnishing_type
1    2313
2    1004
0     187
Name: count, dtype: int64

In [7]:
# Swap the numeric furnishing codes for readable labels:
#   0 -> unfurnished, 1 -> semifurnished, 2 -> furnished
# Re-running this cell is harmless: the string labels match none of the keys.
furnishing_labels = {
    0.0: 'unfurnished',
    1.0: 'semifurnished',
    2.0: 'furnished',
}
df['furnishing_type'] = df['furnishing_type'].replace(furnishing_labels)

In [8]:
# Confirm furnishing_type now holds string labels instead of numeric codes.
df.head()

Unnamed: 0,sector,bedRoom,bathroom,balcony,agePossession,property_type,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,27.0,4,5,4.0,1.0,0.0,3333.0,1,0,furnished,0.0,2.0,6.0
1,9.0,2,2,3.0,3.0,0.0,1281.0,0,0,semifurnished,0.0,1.0,1.0
2,64.0,5,5,4.0,3.0,1.0,5580.0,1,1,furnished,2.0,2.0,9.25
3,95.0,2,2,2.0,3.0,0.0,850.0,0,0,furnished,2.0,2.0,0.07
4,71.0,4,4,4.0,1.0,1.0,3950.0,0,0,semifurnished,1.0,0.0,8.5


In [9]:
# Separate the target column from the predictors.
target_column = 'price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [10]:
# Model the target on the log1p scale. Anything predicted by a model trained on
# y_transformed is therefore on that scale too and must be inverted with np.expm1.
y_transformed = np.log1p(y)

#### Ordinal Encoding

In [11]:
# Columns handed to the categorical encoder in the preprocessing step.
columns_to_encode = [
    'property_type',
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]


In [16]:
# Preprocessing: standardise the numeric features and ordinal-encode the categorical ones.
numeric_columns = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

# Categories not seen during fit are encoded as -1 instead of raising at transform time.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', ordinal_encoder, columns_to_encode),
    ],
    remainder='passthrough',  # columns not listed above are forwarded unchanged
)

In [17]:
# Chain the preprocessing and the linear model into a single estimator.
linear_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=linear_steps)

In [18]:
# Score the pipeline with shuffled 10-fold cross-validation (R^2 on the log1p target).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)

In [19]:
# Mean and standard deviation of the 10 per-fold R^2 scores (log1p-scale target).
scores.mean(), scores.std()

(0.6362179749102765, 0.10637896843976678)

In [20]:
# Hold out 20% of the rows for a final check; the target stays on the log1p scale.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X_train, y_train)

In [22]:
# Predictions are on the log1p scale, because y_train comes from y_transformed.
y_pred = pipeline.predict(X_test)

In [23]:
# Map the predictions from the log1p scale back to the original price scale.
# The value is derived from the fitted pipeline rather than from `y_pred` itself,
# so re-running this cell cannot apply expm1 twice and silently inflate the
# predictions used by the error metric.
y_pred = np.expm1(pipeline.predict(X_test))

In [24]:
# MAE on the original price scale: y_test is still log1p-transformed, so invert it here;
# y_pred has already been mapped back with np.expm1.
mean_absolute_error(np.expm1(y_test), y_pred)

1.0168071201865807

In [None]:
def scorer(model_name, model):
    output =[]
    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-Fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y