In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [1]:
from sklearn import set_config

# Configure scikit-learn globally so that transform() outputs are requested
# as pandas containers instead of NumPy arrays.
set_config(transform_output="pandas")

In [4]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("final_imputed_data.csv")

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,property_type,price,carpet_area,bedroom,bathroom,status,transaction_type,additional_room,regions,balconies_imputed,balconies_iter,floorNum_iter,furnished_status_imputed,luxury_score_iter,luxury_category,floor_category
0,flat,0.52,1200.0,3.0,2.0,ready to move,resale,0.0,bangalore,2.0,3.0,4.0,0,10.0,Medium,Mid Floor
1,flat,1.64,1286.0,3.0,3.0,under construction,new property,0.0,bangalore,3.0,3.0,6.0,0,27.0,High,Mid Floor
2,flat,1.41,952.0,2.0,2.0,ready to move,new property,0.0,bangalore,2.0,2.0,4.0,0,8.0,Low,Mid Floor
3,flat,0.67,1128.0,3.0,2.0,ready to move,new property,0.0,bangalore,2.0,3.0,1.0,0,9.0,Low,Low Floor
4,villa,1.05,960.0,3.0,3.0,ready to move,new property,1.0,bangalore,2.0,2.0,4.0,0,15.0,Medium,Mid Floor


In [8]:
# Remove columns that are not used for modelling.
# Re-binding `df` (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: running it a second time, when the columns are
# already gone, is a no-op rather than a KeyError.
df = df.drop(
    columns=["status", "floorNum_iter", "luxury_score_iter", "balconies_imputed"],
    errors="ignore",
)

## Data preparation

In [11]:
# Separate the regression target ("price") from the predictor columns.
y = df["price"]
X = df.drop(columns="price")

In [14]:
# X_train has to exist before it can be previewed. On a fresh kernel
# (Restart & Run All) nothing above this cell defines it, so the hold-out
# split is created here. The same call (same test_size and random_state) is
# repeated before the baseline model is trained, so both yield the same
# partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.head()

Unnamed: 0,property_type,carpet_area,bedroom,bathroom,transaction_type,additional_room,regions,balconies_iter,furnished_status_imputed,luxury_category,floor_category
2112,flat,900.0,2.0,2.0,resale,1.0,bangalore,2.0,0,Medium,Mid Floor
7571,builder-floor,753.2,2.0,2.0,resale,0.0,new delhi,2.0,2,Low,Low Floor
764,flat,1306.0,3.0,3.0,resale,0.0,bangalore,2.0,0,Low,Mid Floor
17291,residential,2160.0,5.0,5.0,resale,0.0,bangalore,4.0,1,Medium,Low Floor
16069,flat,1050.0,3.0,2.0,resale,0.0,bangalore,2.0,2,Low,Low Floor


In [22]:
# Feature groups, one per preprocessing strategy used by the ColumnTransformer.
# Columns not listed in any group are handled by the transformer's `remainder`.
ohe_encode = [
    "property_type",
    "transaction_type",
]
ordinal_encode = [
    "luxury_category",
    "floor_category",
]
binary_encode = ["regions"]
num_encode = [
    "carpet_area",
    "bedroom",
    "bathroom",
    "balconies_iter",
]

In [18]:
# Explicit low-to-high category orderings consumed by the OrdinalEncoder.
luxury_order = ["Low", "Medium", "High"]
floor_order = ["Low Floor", "Mid Floor", "High Floor"]

In [23]:
# Route each feature group through its own encoder / scaler. Columns that are
# not named in any group are forwarded untouched (remainder="passthrough").
preprocessor = ColumnTransformer(
    transformers=[
        # Nominal categoricals -> one-hot columns, first level dropped.
        (
            "ohe",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            ohe_encode,
        ),
        # Ordered categoricals -> integer ranks; the category lists are given
        # in the same order as the columns in `ordinal_encode`.
        (
            "ordinal",
            OrdinalEncoder(categories=[luxury_order, floor_order]),
            ordinal_encode,
        ),
        ("binary", ce.BinaryEncoder(), binary_encode),
        ("num", StandardScaler(), num_encode),
    ],
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    verbose_feature_names_out=False,
)

preprocessor.set_output(transform="pandas")

In [107]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

# End-to-end model: preprocessing followed by an RBF-kernel SVR.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", SVR(kernel="rbf")),
    ]
)

In [104]:
# log1p-transformed copy of the target; this is the target passed to
# cross_val_score in the cross-validation cell.
y_transformed = np.log1p(y)

In [108]:
from sklearn.model_selection import KFold, cross_val_score

# 10-fold shuffled cross-validation with a fixed seed. R^2 is computed against
# the log1p-transformed target, not the original price scale.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_val_score(
    pipeline,
    X,
    y_transformed,
    scoring="r2",
    cv=kfold,
)

In [109]:
# Average R^2 over the cross-validation folds (scored on the log1p target).
scores.mean()

0.7760599549680276

## Train Initial Baseline Model

In [126]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [127]:
from sklearn.preprocessing import FunctionTransformer

# Invertible target transform: log1p forward, expm1 backward, so predictions
# can later be mapped back through pt.inverse_transform.
pt = FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=True)

# The targets are reshaped to a single column before being transformed, so
# both results are column vectors of shape (n_samples, 1).
y_train_pt = pt.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_pt = pt.transform(y_test.to_numpy().reshape(-1, 1))



In [128]:
# y_train_pt is a column vector of shape (n_samples, 1) because the target was
# reshaped to (-1, 1) before being transformed. The regressor expects a 1-D
# target; passing the 2-D array triggers scikit-learn's column_or_1d
# conversion warning. Flatten it explicitly at the call site so the shape of
# y_train_pt itself is left untouched for any other consumer.
pipeline.fit(X_train, np.ravel(y_train_pt))

  y = column_or_1d(y, warn=True)


In [129]:
# Predictions are on the log1p scale, because the pipeline was fitted on the
# transformed target (y_train_pt).
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)


In [130]:
# Map the log-scale predictions back to the original price scale via the
# transformer's inverse function (expm1). The reshape to one column matches
# the 2-D layout the transformer was fitted with.

y_pred_train_org = pt.inverse_transform(y_train_pred.reshape(-1,1))

y_pred_test_org = pt.inverse_transform(y_test_pred.reshape(-1,1))

In [131]:
from sklearn.metrics import mean_absolute_error, r2_score

# Mean absolute error on the original (inverse-transformed) price scale.
# NOTE(review): df.head() shows price values such as 0.52 and 1.64, so the
# "rupees" label may not be the true unit of `price` - confirm.
train_mae = mean_absolute_error(y_train, y_pred_train_org)
test_mae = mean_absolute_error(y_test, y_pred_test_org)

print(f"The train error is {train_mae:.2f} rupees")
print(f"The test error is {test_mae:.2f} rupees")

The train error is 0.63 rupees
The test error is 0.73 rupees


In [132]:
# R^2 on the original price scale, for the training and held-out sets.
train_r2 = r2_score(y_train, y_pred_train_org)
test_r2 = r2_score(y_test, y_pred_test_org)

print(f"The train r2 score is {train_r2:.2f}")
print(f"The test r2 score is {test_r2:.2f}")

The train r2 score is 0.71
The test r2 score is 0.59
