In [91]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.decomposition import PCA

In [92]:
df = pd.read_csv('/content/drive/MyDrive/House-Price/datasets/Processed/gurgaon_properties_post_feature_selection_v2.csv')

In [93]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [96]:
cat_col = []
for i in df.columns:
    if isinstance(df[i].dtype, object):
        cat_col.append(i)
print(cat_col)

['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony', 'agePossession', 'built_up_area', 'servant room', 'store room', 'furnishing_type', 'luxury_category', 'floor_category']


In [97]:
for i in cat_col:
    print(df[i].value_counts())
    print("**"*20)

flat     2804
house     750
Name: property_type, dtype: int64
****************************************
sohna road    163
sector 85     108
sector 102    107
sector 92     100
sector 69      93
             ... 
sector 80       5
sector 30       5
sector 73       3
sector 88       3
sector 27       2
Name: sector, Length: 104, dtype: int64
****************************************
1.25    80
1.20    64
0.90    63
1.50    63
1.10    59
        ..
6.21     1
4.65     1
4.16     1
2.38     1
0.57     1
Name: price, Length: 461, dtype: int64
****************************************
3.0     1512
2.0      969
4.0      647
5.0      171
1.0      127
6.0       53
9.0       25
7.0       20
8.0       18
10.0      12
Name: bedRoom, dtype: int64
****************************************
3.0     1061
2.0     1032
4.0      804
5.0      285
1.0      150
6.0      116
9.0       39
7.0       36
8.0       19
10.0       9
11.0       2
12.0       1
Name: bathroom, dtype: int64
*********************************

In [98]:
df['furnishing_type'].value_counts()

0.0    2349
1.0    1018
2.0     187
Name: furnishing_type, dtype: int64

### Significance:-

- 0 = Unfurnished
- 1 = Semifurnished
- 2 = Furnished


In [99]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0:"unfurnished",
                                                       1.0:"semifurnished",
                                                       2.0:"furnished"})

In [100]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


### seperating Independent and dependant columns for furthur processing

In [101]:
X = df.drop(columns=['price'])
y = df['price']

In [102]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

# **Ordinal Encoding**

In [103]:
for i in cat_col:
    print(i)

property_type
sector
price
bedRoom
bathroom
balcony
agePossession
built_up_area
servant room
store room
furnishing_type
luxury_category
floor_category


In [104]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession',
                     'furnishing_type', 'luxury_category', 'floor_category']

In [105]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room'] ),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder = 'passthrough'
)

In [106]:
# creating Pipeline

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor', LinearRegression())
])

In [107]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring='r2')

In [108]:
score.mean(),score.std()

(0.7363096633436828, 0.0323800575442993)

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [110]:
pipeline.fit(X_train, y_train)

In [111]:
y_pred = pipeline.predict(X_test)

In [112]:
y_pred = np.expm1(y_pred)

In [113]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089363

# **lets make custom e function for models**

In [114]:
def scorer(model_name, model):
    output = []

    output.append(model_name)

    # creating Pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring='r2')

    output.append(scores.mean())

    # splling a data
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed,
                                                        test_size=0.2,
                                                        random_state=42)

    # fititng a data to pipeline
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    return output

In [115]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost': XGBRegressor()
}

models = {key: value for key, value in model_dict.items()}


In [116]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [117]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089363],
 ['svr', 0.7642021216646014, 0.8472636473483917],
 ['ridge', 0.7363125343993554, 0.9463387741853387],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7750671748209206, 0.7364644435973571],
 ['random forest', 0.8810072685373458, 0.5345546597949038],
 ['extra trees', 0.8682527040592614, 0.5532726545278014],
 ['gradient boosting', 0.8727443208556972, 0.5753298579540989],
 ['adaboost', 0.7610272245431803, 0.8373577271493254],
 ['mlp', 0.8023171295308007, 0.693421369021888],
 ['xgboost', 0.8894876835260124, 0.5040475127230885]]

In [118]:
model_df = pd.DataFrame(model_output, columns=['Name', 'R2_score', 'MAE'])

In [119]:
model_df.sort_values(['MAE'])

Unnamed: 0,Name,R2_score,MAE
10,xgboost,0.889488,0.504048
5,random forest,0.881007,0.534555
6,extra trees,0.868253,0.553273
7,gradient boosting,0.872744,0.57533
9,mlp,0.802317,0.693421
4,decision tree,0.775067,0.736464
8,adaboost,0.761027,0.837358
1,svr,0.764202,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


# **OneHotEncoding**

In [120]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room'] ),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ],
    remainder = 'passthrough'
)

In [121]:
# creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [122]:
# K-Fold Cross-Validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring='r2')

In [123]:
scores.mean()

0.8546091800469252

In [124]:
scores.std()

0.015997419584139923

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,
                                                    test_size=0.2,random_state=42)

In [126]:
pipeline.fit(X_train, y_train)

In [127]:
y_pred = pipeline.predict(X_test)

In [128]:
y_pred = np.expm1(y_pred)

In [129]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497561575485861

# **Custome Function for Model Pipeline**

In [130]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,
                                                        test_size=0.2,random_state=42)
    pipeline.fit(X_train, y_train)

    y_red = pipeline.predict(X_test)
    y_red = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [131]:
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost': XGBRegressor()
}

In [132]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [133]:
model_df = pd.DataFrame(model_output, columns=['Name','R2_Score','MAE'])

In [134]:
model_df.sort_values(['MAE'])

Unnamed: 0,Name,R2_Score,MAE
0,linear_reg,0.854609,0.649756
1,svr,0.769752,0.649756
2,ridge,0.854644,0.649756
3,LASSO,0.059434,0.649756
4,decision tree,0.803426,0.649756
5,random forest,0.891303,0.649756
6,extra trees,0.893575,0.649756
7,gradient boosting,0.876817,0.649756
8,adaboost,0.754089,0.649756
9,mlp,0.878518,0.649756


# **OneHotEncoding With PCA**

In [135]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [136]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [137]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [138]:
scores.mean()

0.06225201431451136

In [139]:
scores.std()

0.019860594071640144

In [140]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [141]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [142]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [143]:
model_df = pd.DataFrame(model_output, columns=['Name','R2_Score','MAE'])

In [144]:
model_df.sort_values(['MAE'])

Unnamed: 0,Name,R2_Score,MAE
5,random forest,0.761804,0.638929
6,extra trees,0.73972,0.693852
4,decision tree,0.696182,0.75729
10,xgboost,0.620664,0.948597
7,gradient boosting,0.610604,0.987906
8,adaboost,0.309961,1.345226
1,svr,0.218073,1.361198
9,mlp,0.214233,1.396805
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707


# **Target Encoder**

In [145]:
!pip install category_encoders



In [146]:
import category_encoders  as ce

columns_to_encode = ['property_type','sector', 'balcony',
                     'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
        transformers = [
            ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
            ('cat', OrdinalEncoder(), columns_to_encode),
            ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
            ('target_enc', ce.TargetEncoder(), ['sector'])
        ],
    remainder='passthrough'
)

In [147]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [148]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [149]:
scores.mean(),scores.std()

(0.829521918225536, 0.01838446337912283)

In [150]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output

In [151]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [152]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [153]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.901709,0.454179
6,extra trees,0.902553,0.459235
7,gradient boosting,0.889142,0.508025
4,decision tree,0.827823,0.584607
9,mlp,0.849718,0.618616
8,adaboost,0.820183,0.691278
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


# **Hyperparameter Tuning**

In [154]:
pip install hyperopt



In [155]:
from sklearn.model_selection import GridSearchCV

In [156]:
# param_grid = {
#     'regressor__n_estimators': [50, 100, 200, 300],
#     'regressor__max_depth': [None, 10, 20, 30],
#     'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
#     'regressor__max_features': ['auto', 'sqrt']
# }

In [None]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__subsample': [0.5, 0.75, 1.0],
    'regressor__colsample_bytree': [0.5, 0.75, 1.0],
    'regressor__gamma': [0, 0.1, 0.2, 0.5],
    'regressor__reg_alpha': [0, 0.1, 0.5, 1.0],
    'regressor__reg_lambda': [0, 0.1, 0.5, 1.0]
}

In [157]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [158]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [159]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [160]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
final_pipe.fit(X,y_transformed)

# **Exporting the model**

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X.head(4)

# **Trying out the predictions**

In [None]:
X.columns

In [None]:
X.iloc[0].values

In [None]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

In [None]:
np.expm1(pipeline.predict(one_df))

In [None]:
X.dtypes

In [None]:

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

search.fit(X, y_transformed)

final_pipe = search.best_estimator_

search.best_params_

search.best_score_

final_pipe.fit(X,y_transformed)

In [85]:
from hyperopt import fmin, tpe, hp, Trials
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
import category_encoders as ce
from sklearn.metrics import r2_score

# Define the search space
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 50),  # Changed to generate integer values
    'max_depth': hp.choice('max_depth', [None, 5, 10, 15, 20]),  # Added 5 and 15 to the choices
    'max_samples': hp.uniform('max_samples', 0.1, 1.0),  # Changed to uniform distribution
    'max_features': hp.choice('max_features', ['auto', 'sqrt', None])  # Added None to the choices
}

# Define a custom scorer
scorer = make_scorer(score_func=r2_score)

# Define a function to optimize
def objective(params):
    # Cast 'n_estimators' to an integer
    params['n_estimators'] = int(params['n_estimators'])

    # Define RandomForestRegressor within the objective function
    regressor = RandomForestRegressor(**params)

    # Update the pipeline
    pipeline.set_params(regressor=regressor)

    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring=scorer, n_jobs=-1)
    return -scores.mean()  # We use negative mean score because fmin seeks minimum

# Initialize Trials to keep track of the results
trials = Trials()

# Run hyperparameter optimization
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Adjust this as needed
    trials=trials
)

# Get the best parameters
best_params = {key: val[best[key]] if isinstance(val, list) else val for key, val in space.items()}

# Print the best parameters
print("Best parameters:", best_params)

# You can also print the best score
print("Best score:", -trials.best_trial['result']['loss'])

100%|██████████| 50/50 [20:13<00:00, 24.27s/trial, best loss: -0.9020483302749905]
Best parameters: {'n_estimators': <hyperopt.pyll.base.Apply object at 0x7a8511170fd0>, 'max_depth': <hyperopt.pyll.base.Apply object at 0x7a8511171060>, 'max_samples': <hyperopt.pyll.base.Apply object at 0x7a8511171270>, 'max_features': <hyperopt.pyll.base.Apply object at 0x7a8511171360>}
Best score: 0.9020483302749905


In [90]:
list(zip(df.columns, df.dtypes))

[('property_type', dtype('O')),
 ('sector', dtype('O')),
 ('price', dtype('float64')),
 ('bedRoom', dtype('float64')),
 ('bathroom', dtype('float64')),
 ('balcony', dtype('O')),
 ('agePossession', dtype('O')),
 ('built_up_area', dtype('float64')),
 ('servant room', dtype('float64')),
 ('store room', dtype('float64')),
 ('furnishing_type', dtype('O')),
 ('luxury_category', dtype('O')),
 ('floor_category', dtype('O'))]