In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv("gurgaon_properties_post_feature_selection_v2.csv")
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [3]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [4]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({
    0.0: "unfurnished",
    1.0: "semifurnished",
    2.0: "furnished"
})

In [5]:
X = df.drop(columns=['price'])
y = df['price']

In [6]:
# Applying log transform
y_transformed = np.log1p(y)

## Ordinal Encoding

In [7]:
df.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [8]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [9]:
# Creating a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

In [10]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [12]:
# kfold cross validation

def apply_cross_validation(pipeline, X, y, scoring='r2'):
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=kfold, scoring=scoring)
    return scores

In [13]:
scores = apply_cross_validation(pipeline, X, y_transformed)

In [14]:
(scores.mean(), scores.std())

(0.7363096633436828, 0.0323800575442993)

In [15]:
def get_mean_absolute_error(model, X, y, test_size=0.2, random_state=42):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))

In [16]:
get_mean_absolute_error(pipeline, X, y_transformed)

0.946382216008936

In [17]:
def scorer(model_name, pipeline):
    output = [model_name]
    
    scores = apply_cross_validation(pipeline, X, y_transformed)    
    output.append(scores.mean())
    
    output.append(get_mean_absolute_error(pipeline, X, y_transformed))
    
    return output

In [18]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

import xgboost
from xgboost import XGBRegressor


models = {
    "linear_reg": LinearRegression(),
    "svr": SVR(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "decision_tree": DecisionTreeRegressor(),
    "random_forest": RandomForestRegressor(),
    "gradient_boosting": GradientBoostingRegressor(),
    "extra_trees": ExtraTreesRegressor(),
    "adaboost": AdaBoostRegressor(),
    "xgboost": XGBRegressor(),
    "mlp": MLPRegressor()
}

In [19]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder='passthrough'
)

models_output = []
for model_name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    models_output.append(scorer(model_name, pipeline))

In [20]:
models_output

[['linear_reg', 0.7363096633436828, 0.946382216008936],
 ['svr', 0.7642012011196353, 0.847263647348393],
 ['ridge', 0.7363125343993552, 0.9463387741853373],
 ['lasso', 0.05943378064493573, 1.528905986892753],
 ['decision_tree', 0.775040289151986, 0.7448570174327465],
 ['random_forest', 0.8829391206067069, 0.5275484512481895],
 ['gradient_boosting', 0.8725438180268734, 0.5758600982386398],
 ['extra_trees', 0.8679367816815453, 0.539975903180785],
 ['adaboost', 0.7560632831805515, 0.8270751293106879],
 ['xgboost', 0.8894876835260124, 0.5040475141482346],
 ['mlp', 0.8050152635104061, 0.7027311175746906]]

In [21]:
models_df = pd.DataFrame(models_output, columns=['model', 'r2', 'mae'])
models_df.sort_values(['mae'])

Unnamed: 0,model,r2,mae
9,xgboost,0.889488,0.504048
5,random_forest,0.882939,0.527548
7,extra_trees,0.867937,0.539976
6,gradient_boosting,0.872544,0.57586
10,mlp,0.805015,0.702731
4,decision_tree,0.77504,0.744857
8,adaboost,0.756063,0.827075
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


## OneHotEncoding

In [22]:
# Creating a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first'), ['sector', 'agePossession', 'furnishing_type'])
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

In [23]:
scores = apply_cross_validation(pipeline, X, y_transformed)

In [24]:
{
    "mean r2_score": scores.mean(),
    "std": scores.std()
}

{'mean r2_score': 0.8546067827628422, 'std': 0.015998393588058008}

In [25]:
get_mean_absolute_error(pipeline, X, y_transformed)

0.6497458331374444

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first'), ['sector', 'agePossession', 'furnishing_type'])
    ],
    remainder='passthrough'
)

models_output_ohe = []

for model_name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    models_output_ohe.append(scorer(model_name, pipeline))



In [27]:
models_df_ohe = pd.DataFrame(models_output_ohe, columns=['model', 'r2', 'mae'])
models_df_ohe.sort_values(['mae'])

Unnamed: 0,model,r2,mae
7,extra_trees,0.893317,0.466486
9,xgboost,0.89585,0.493456
5,random_forest,0.889876,0.506274
10,mlp,0.874388,0.549745
6,gradient_boosting,0.876655,0.569446
0,linear_reg,0.854607,0.649746
2,ridge,0.854673,0.65299
4,decision_tree,0.805136,0.694985
1,svr,0.769741,0.834124
8,adaboost,0.753145,0.858093


### OHE with PCA

In [32]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['sector', 'agePossession', 'furnishing_type'])
    ],
    remainder='passthrough'
)

models_output_ohe_with_pca = []

for model_name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("pca", PCA(n_components=0.95)),
        ("regressor", model)
    ])
    models_output_ohe_with_pca.append(scorer(model_name, pipeline))

In [33]:
models_df_ohe_with_pca = pd.DataFrame(models_output_ohe_with_pca, columns=['model', 'r2', 'mae'])
models_df_ohe_with_pca.sort_values(['mae'])

Unnamed: 0,model,r2,mae
5,random_forest,0.758416,0.651935
7,extra_trees,0.735388,0.702402
4,decision_tree,0.691654,0.756785
9,xgboost,0.619406,0.949518
6,gradient_boosting,0.611482,0.987906
8,adaboost,0.304426,1.336619
1,svr,0.218068,1.361187
10,mlp,0.207729,1.424288
2,ridge,0.062253,1.526707
0,linear_reg,0.062253,1.526707


## Target Encoder

In [30]:
import category_encoders as ce

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

models_output_te = []

for model_name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    models_output_te.append(scorer(model_name, pipeline))

In [34]:
models_df_te = pd.DataFrame(models_output_te, columns=['model', 'r2', 'mae'])
models_df_te.sort_values(['mae'])

Unnamed: 0,model,r2,mae
9,xgboost,0.904798,0.447518
7,extra_trees,0.900989,0.457692
5,random_forest,0.901347,0.461171
6,gradient_boosting,0.888824,0.509788
4,decision_tree,0.821711,0.562958
10,mlp,0.845372,0.614649
8,adaboost,0.818688,0.685114
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


## Hyperparameter Tuning

In [35]:
from sklearn.model_selection import GridSearchCV

In [39]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 500],  # Number of trees in the forest
    'regressor__max_depth': [None, 10, 20, 30],       # Maximum depth of the trees
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [40]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor())
])

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [41]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


800 fits failed out of a total of 1600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
588 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "D:\anaconda\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params(

In [46]:
search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [48]:
search.best_score_

0.9026345675494738

In [59]:
data = [['flat', 'sector 89', 2, 2, '2', 'New Property', 1226.0, 1, 0, 'unfurnished', 'Low', 'Mid Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
           'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DF
one_df = pd.DataFrame(data, columns=columns)
one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 89,2,2,2,New Property,1226.0,1,0,unfurnished,Low,Mid Floor


In [60]:
np.expm1(search.best_estimator_.predict(one_df))

array([0.93668137])

In [57]:
df.iloc[1]

property_type              flat
sector                sector 89
price                      0.95
bedRoom                     2.0
bathroom                    2.0
balcony                       2
agePossession      New Property
built_up_area            1226.0
servant room                1.0
store room                  0.0
furnishing_type     unfurnished
luxury_category             Low
floor_category        Mid Floor
Name: 1, dtype: object

In [61]:
import pickle

In [62]:
with open('pipeline.pkl', 'wb') as file:
    pickle.dump(search.best_estimator_, file)

In [63]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [64]:
get_mean_absolute_error(search.best_estimator_, X, y_transformed)

0.46637575802056175