In [1]:
import os
import sys
import zipfile
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

%matplotlib inline

sys.path.append('../')
from src.data_preprocess import DataPreprocessor

pd.set_option('display.max_colwidth', 2000)

%load_ext autoreload
%autoreload 2

In [2]:
# Load the training listings and preview the first two rows.
train_df = pd.read_csv('../input/train.csv')
train_df.head(2)

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,...,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area,price
0,122881,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,hdb 4 rooms,,1988.0,3.0,2.0,1115,...,unspecified,,116.0,https://www.99.co/singapore/hdb/866-yishun-street-81-adxawp85buupmsq7gwdjverc,1.414399,103.837196,0,yishun south,yishun,514500.0
1,259374,hdb flat for sale in 506b serangoon north avenue 4,hougang / punggol / sengkang (d19),hdb-serangoon estate,hdb,99-year leasehold,1992.0,4.0,2.0,1575,...,unspecified,"1, 2, 3, 4, 5, 6 br",,https://www.99.co/singapore/hdb/hdbserangoon-estate-demrpm6ryc3l9buf846erprb,1.372597,103.875625,0,serangoon north,serangoon,995400.0


In [3]:
# Load the auxiliary table of commercial centres.
sg_commercial_centres_df = pd.read_csv('../input/auxiliary-data/sg-commerical-centres.csv')
# Rewrite every row whose `type` is 'IEPB' to 'IEBP'
# (presumably a misspelt category code in the raw file -- TODO confirm).
sg_commercial_centres_df.loc[sg_commercial_centres_df['type']=='IEPB', 'type'] = 'IEBP'
sg_commercial_centres_df.head(2)

Unnamed: 0,name,type,lat,lng,subzone,planning_area
0,Central Business District,CR,1.286768,103.854529,clifford pier,downtown core
1,Jurong Lake District,CR,1.334085,103.734513,lakeside (business),jurong east


In [4]:
# Load the auxiliary table of primary schools and preview the first two rows.
# NOTE(review): in the saved preview both rows show planning_area 'serangoon' although
# their subzones are 'woodlands east' and 'yishun west' -- verify the source CSV.
sg_primary_schools_df = pd.read_csv('../input/auxiliary-data/sg-primary-schools.csv')
sg_primary_schools_df.head(2)

Unnamed: 0,name,lat,lng,subzone,planning_area
0,Admiralty Primary School,1.442941,103.800345,woodlands east,serangoon
1,Ahmad Ibrahim Primary School,1.433849,103.83271,yishun west,serangoon


In [5]:
# Load the auxiliary table of secondary schools and preview the first two rows.
sg_secondary_schools_df = pd.read_csv('../input/auxiliary-data/sg-secondary-schools.csv')
sg_secondary_schools_df.head(2)

Unnamed: 0,name,lat,lng,subzone,planning_area
0,Admiralty Secondary School,1.445912,103.802908,woodlands east,woodlands
1,Ahmad Ibrahim Secondary School,1.436095,103.830055,yishun west,yishun


In [6]:
# Load the auxiliary table of shopping malls and preview the first two rows.
sg_shopping_mall_df = pd.read_csv('../input/auxiliary-data/sg-shopping-malls.csv')
sg_shopping_mall_df.head(2)

Unnamed: 0,name,lat,lng,subzone,planning_area
0,10 AM,1.275568,103.863591,marina south,marina south
1,313@Somerset,1.301013,103.83854,somerset,orchard


In [7]:
# Build the model-ready training frame with DataPreprocessor.data_preprocessing_v2,
# handing it the four auxiliary location tables loaded above (if_auxiliary=True).
# The result still carries `price`, which later cells filter on and use as the target.
# Slow step: the saved progress bars report roughly 37 s and 1 min 33 s for two stages.
train_df_clean = DataPreprocessor.data_preprocessing_v2(
    train_df, 
    test=False, 
    uncertain=False, 
    drop_na=False, 
    remove_original_attributes=True,
    if_auxiliary=True,
    sg_commercial_centres_df=sg_commercial_centres_df,
    sg_primary_schools_df=sg_primary_schools_df,
    sg_secondary_schools_df=sg_secondary_schools_df,
    sg_shopping_mall_df=sg_shopping_mall_df
)

Processed function: Function 'preprocess_available_unit_types' executed in 3.8160s & Processing function: preprocess_planning_area: 100%|██████████| 14/14 [00:37<00:00,  2.67s/it]
Processed function: Function 'preprocess_secondary_school' executed in 19.4976s & Processing function: preprocess_shopping_mall: 100%|██████████| 4/4 [01:33<00:00, 23.38s/it]  


In [8]:
# Keep only the listings whose recorded price is non-zero.
train_df_clean = train_df_clean.loc[train_df_clean['price'] != 0]

In [9]:
# fig = px.box(train_df_clean, x="tenure_cat_0", y="price")
# fig.show()
# fig = px.histogram(train_df_clean, x="price")
# fig.show()

In [10]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from hyperopt import fmin, hp, tpe, STATUS_OK, STATUS_FAIL, Trials
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

  from pandas import MultiIndex, Int64Index


In [11]:
# Two views of the cleaned training data: every row, and only the rows without NaN
# (the random-forest search further down is fed the NaN-free variant).
train_df_model = train_df_clean
train_df_model_nonNa = train_df_model.dropna()

# Features are all columns except the target; everything is cast to float.
X = train_df_model.drop(columns='price').astype(float)
y = train_df_model['price'].astype(float)
X_nonNa = train_df_model_nonNa.drop(columns='price').astype(float)
y_nonNa = train_df_model_nonNa['price'].astype(float)

In [12]:
import warnings

# Suppress every warning category from this point on.
warnings.filterwarnings('ignore')

# ---- XGBoost ----
# hyperopt search space; the sampled values arrive as floats and are cast to
# int/float inside HPOpt.xgb_reg before the regressor is built.
xgb_reg_params = dict(
    n_estimators=hp.quniform('n_estimators', 100, 500, 10),
    max_depth=hp.quniform("max_depth", 5, 100, 5),
    gamma=hp.uniform('gamma', 2, 10),
    reg_alpha=hp.quniform('reg_alpha', 10, 300, 5),
    colsample_bytree=hp.uniform('colsample_bytree', 0.0, 1),
    min_child_weight=hp.quniform('min_child_weight', 5, 50, 1),
)
# Keyword arguments forwarded to the regressor's fit() by HPOpt.train_reg.
xgb_fit_params = dict(
    eval_metric='rmse',
    early_stopping_rounds=10,
    verbose=False,
)
# Bundle handed to HPOpt.process: space, fit kwargs, loss (RMSE) and score (R^2).
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
    'score_func': lambda y, pred: r2_score(y, pred),
}

# ---- Random forest ----
# Continuous draws; HPOpt.random_forest_reg truncates each one with int().
random_forest_reg_params = dict(
    n_estimators=hp.uniform('n_estimators', 100, 500),
    max_depth=hp.uniform('max_depth', 5, 50),
    min_samples_leaf=hp.uniform('min_samples_leaf', 1, 5),
    min_samples_split=hp.uniform('min_samples_split', 2, 6),
)
# No extra fit() keyword arguments for the random forest.
random_forest_fit_params = dict()
random_forest_para = {
    'reg_params': random_forest_reg_params,
    'fit_params': random_forest_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
    'score_func': lambda y, pred: r2_score(y, pred),
}



class HPOpt(object):
    """Hyperopt driver that tunes a regressor against one fixed train/validation split.

    The objective methods (``xgb_reg``, ``random_forest_reg``) build an estimator
    from a sampled parameter set, fit it on the training part and report the
    loss/score computed on the validation part.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        """Store the split; ``x_test``/``y_test`` serve as the validation set."""
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Minimise the objective method called ``fn_name`` with ``hyperopt.fmin``.

        Args:
            fn_name: name of a method on this object used as the objective.
            space: search space passed to ``fmin`` (also handed to the objective).
            trials: hyperopt ``Trials`` object that records the evaluations.
            algo: hyperopt suggestion algorithm.
            max_evals: number of evaluations to run.

        Returns:
            ``(result, trials)`` where ``result`` is what ``fmin`` returned.
            If ``fmin`` raises, the exception is NOT propagated: a dict
            ``{'status': STATUS_FAIL, 'exception': <message>}`` is returned
            instead, so callers must check for a dict before indexing.
        """
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials

    def xgb_reg(self, para):
        """Objective: build an ``XGBRegressor`` from ``para['reg_params']`` and evaluate it."""
        space = para['reg_params']
        # Sampled values are cast explicitly to the numeric type used for each argument.
        reg = xgb.XGBRegressor(
            n_estimators=int(space['n_estimators']),
            max_depth=int(space['max_depth']),
            gamma=float(space['gamma']),
            reg_alpha=int(space['reg_alpha']),
            colsample_bytree=float(space['colsample_bytree']),
            min_child_weight=int(space['min_child_weight']),
            n_jobs=4
        )
        return self.train_reg(reg, para)

    def random_forest_reg(self, para):
        """Objective: build a ``RandomForestRegressor`` from ``para['reg_params']`` and evaluate it."""
        space = para['reg_params']
        # int() truncates the continuous draws to whole numbers.
        reg = RandomForestRegressor(
            n_estimators=int(space['n_estimators']),
            max_depth=int(space['max_depth']),
            min_samples_leaf=int(space['min_samples_leaf']),
            min_samples_split=int(space['min_samples_split']),
            n_jobs=-1
        )
        return self.train_reg(reg, para)

#     def ctb_reg(self, para):
#         reg = ctb.CatBoostRegressor(**para['reg_params'])
#         return self.train_reg(reg, para)

    def train_reg(self, reg, para):
        """Fit ``reg`` on the training split and score it on the validation split.

        Args:
            reg: estimator exposing ``fit`` and ``predict``.
            para: dict with ``'fit_params'`` (extra ``fit`` keyword arguments),
                ``'loss_func'`` and ``'score_func'`` (both called as
                ``f(y_test, predictions)``).

        Returns:
            ``{'loss': ..., 'score': ..., 'status': STATUS_OK}``.

        Raises:
            Whatever ``fit``/``predict`` raise, except the ``TypeError`` produced
            by an estimator whose ``fit`` does not accept ``eval_set``.
        """
        fit_params = para['fit_params']
        try:
            # Prefer fitting with an evaluation set (train + validation).
            reg.fit(self.x_train, self.y_train,
                    eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],
                    **fit_params)
        except TypeError:
            # fit() has no `eval_set` keyword (e.g. RandomForestRegressor): fit plainly.
            # Only TypeError is caught so that genuine training failures surface
            # immediately instead of being swallowed and triggering a second fit.
            reg.fit(self.x_train, self.y_train,
                    **fit_params)
        pred = reg.predict(self.x_test)
        loss = para['loss_func'](self.y_test, pred)
        score = para['score_func'](self.y_test, pred)
        return {'loss': loss, 'score': score,'status': STATUS_OK}

# X_train, X_test, y_train, y_test = train_test_split(X, y)
# obj = HPOpt(X_train, X_test, y_train, y_test)
# xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=500)
# best loss: 2271478.595400419
# X_train, X_test, y_train, y_test = train_test_split(X_nonNa, y_nonNa)
# obj = HPOpt(X_train, X_test, y_train, y_test)
# random_forest_opt = obj.process(fn_name='random_forest_reg', space=random_forest_para, trials=Trials(), algo=tpe.suggest, max_evals=100)

In [13]:
# Load the test listings and preview one row.
test_df = pd.read_csv('../input/test.csv')
display(test_df.head(1))

# Same preprocessing call as for the training data, but with test=True.
# reset_index() is applied first so the frame carries an `index` column that is
# used below to sort the rows.
test_df_clean = DataPreprocessor.data_preprocessing_v2(
    test_df.reset_index(), 
    test=True, 
    uncertain=False, 
    drop_na=False, 
    remove_original_attributes=True,
    if_auxiliary=True,
    sg_commercial_centres_df=sg_commercial_centres_df,
    sg_primary_schools_df=sg_primary_schools_df,
    sg_secondary_schools_df=sg_secondary_schools_df,
    sg_shopping_mall_df=sg_shopping_mall_df
)
# Sort by the carried `index` column (presumably restoring the original row order
# in case preprocessing reordered rows -- TODO confirm), then drop that helper column.
test_df_clean_ordered=test_df_clean.sort_values("index").reset_index(drop=True).drop(columns=['index'])
display(test_df_clean_ordered.head(1))

# NOTE(review): `test_df_true` is read from example-submission.csv; confirm its
# `Predicted` column holds real targets before treating it as ground truth.
test_df_true = pd.read_csv('../input/example-submission.csv')
display(test_df_true.head(1))

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area
0,777912,1 bed condo for sale in the gazania,17 how sun drive,the gazania,condo,freehold,2022.0,1.0,1.0,463,,unfurnished,"studio, 1, 2, 3, 4, 5 br",250.0,https://www.99.co/singapore/condos-apartments/the-gazania,1.344334,103.87869,0,upper paya lebar,serangoon


Processed function: Function 'preprocess_available_unit_types' executed in 1.6690s & Processing function: preprocess_planning_area: 100%|██████████| 14/14 [00:18<00:00,  1.30s/it]
Processed function: Function 'preprocess_secondary_school' executed in 11.0512s & Processing function: preprocess_shopping_mall: 100%|██████████| 4/4 [00:52<00:00, 13.06s/it]  


Unnamed: 0,built_year,num_beds,num_baths,size_sqft,lat,lng,property_type_cat_0,tenure_cat_0,tenure_cat_1,tenure_cat_2,...,IHL_cat_0,pri_sch_dist,number_of_close_pri_sch,pri_sch_cat_0,sec_sch_dist,number_of_close_sec_sch,sec_sch_cat_0,shopping_mall_dist,number_of_close_shopping_mall,shopping_mall_cat_0
0,2022.0,1.0,1.0,463,1.344334,103.87869,5.0,0.0,0.0,1.0,...,4.0,0.342815,0,144.0,0.345402,0,115.0,0.988207,0,98.0


Unnamed: 0,Id,Predicted
0,0,955900.0


In [18]:
from sklearn.metrics import mean_squared_error
from IPython.display import clear_output

# Repeated XGBoost hyper-parameter search.
# Each round: draw a fresh hold-out split, let hyperopt tune on it, then judge the
# winning settings with repeated 10-fold cross-validation on the full data.
# The loop ends when the cross-validated RMSE beats `best_score` or after the
# round with counter == 100.
best_score = 1500000   # target: stop once the cross-validated RMSE drops below this
score_temp = np.inf    # lowest cross-validated RMSE seen so far
counter = 0            # number of completed, non-terminating rounds
best_settings = {}     # hyper-parameters that produced `score_temp`
while True:
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    obj = HPOpt(X_train, X_test, y_train, y_test)
    xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=150)
    if isinstance(xgb_opt, dict):
        # HPOpt.process returns a status dict instead of (best, trials) when fmin raised;
        # surface that error rather than failing later with an opaque KeyError on [0].
        raise RuntimeError(f"hyperopt search failed: {xgb_opt.get('exception')}")
    space = xgb_opt[0]
    xgb_regressor = xgb.XGBRegressor(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=float(space['gamma']),
        reg_alpha=int(space['reg_alpha']),
        min_child_weight=int(space['min_child_weight']),
        colsample_bytree=float(space['colsample_bytree']))

    # Evaluate the tuned settings: 10 folds x 3 repeats, scored as negative MSE.
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(xgb_regressor, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
    # Root of the mean fold MSE (scores are negated MSE values).
    rmse_score = np.sqrt(np.mean(-scores))
    clear_output(wait=True)
    if rmse_score < score_temp:
        best_settings = space
        score_temp = rmse_score
        print(f"score update = {score_temp}")
        if np.round(score_temp, 3) < np.round(best_score, 3):
            # Target reached: record the achieved RMSE and stop searching.
            best_score = score_temp
            break
    print(f"current score = {score_temp} vs best score = {best_score}")
    if counter == 100: break
    counter += 1
    print(f"counter = {counter}")

counter = 38
 49%|████▊     | 73/150 [03:20<05:24,  4.21s/trial, best loss: 1734482.7244767763]

In [17]:
# RMSE derived from the cross-validation `scores` left in the kernel by the search
# cell (square root of the mean of the negated neg-MSE fold scores).
np.sqrt(np.mean(-scores))

2236287.250292887

In [61]:
# from sklearn.metrics import mean_squared_error
# from IPython.display import clear_output

# best_score = 1700000
# score = np.inf
# score_temp = np.inf
# counter = 0
# best_settings = {}
# while best_score < score:
#     # X_train, X_test, y_train, y_test = train_test_split(X, y)
#     X_train, y_train = X, y
#     X_test = test_df_clean_ordered
#     y_test = test_df_true["Predicted"]
    
#     obj = HPOpt(X_train, X_test, y_train, y_test)
#     xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=150)
#     space = xgb_opt[0]
#     xgb_regressor = xgb.XGBRegressor(
#         n_estimators =int(space['n_estimators']), 
#         max_depth = int(space['max_depth']), 
#         gamma = float(space['gamma']),
#         reg_alpha = int(space['reg_alpha']),
#         min_child_weight=int(space['min_child_weight']),
#         colsample_bytree=float(space['colsample_bytree']))

#     # define model evaluation method
#     # cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
#     # evaluate model
#     # scores = cross_val_score(xgb_regressor, X, y, scoring='r2', cv=cv, n_jobs=-1)
#     rmse_scroe = np.sqrt(mean_squared_error(y_test, xgb_regressor.fit(X,y).predict(X_test)))
#     clear_output(wait=True)
#     if rmse_scroe < score_temp:
#         best_settings = xgb_opt[0]
#         score_temp = rmse_scroe
#         print(f"score update = {score_temp}")
#         if np.round(score_temp, 3) < np.round(best_score, 3):
#             best_score = score_temp
#             break
#     # if np.mean(scores) > score_temp:
#     #     best_settings = xgb_opt[0]
#     #     score_temp = np.mean(scores)
#     #     print(f"score update = {score_temp}")
#     #     if np.round(score_temp, 3) > np.round(best_score, 3):
#     #         best_score = score_temp
#     #         break
#     if counter % 5 == 0:
#         print(f"current score = {score_temp} vs best score = {best_score}")
#     if counter == 100: break
#     counter += 1
#     print(f"counter = {counter}")

current score = 1761279.2758206131 vs best score = 1700000


In [62]:
# Refit the current `xgb_regressor` on all training rows and compute the RMSE of its
# test-set predictions against `test_df_true["Predicted"]`.
# NOTE(review): `test_df_true` was loaded from example-submission.csv, so this number
# is only meaningful if that file holds real targets -- confirm.
# X_train / y_train are assigned here but not used in this cell.
X_train, y_train = X, y
X_test = test_df_clean_ordered
y_test = test_df_true["Predicted"]
rmse_scroe = np.sqrt(mean_squared_error(y_test, xgb_regressor.fit(X,y).predict(X_test)))
rmse_scroe

1825912.6790148737

In [63]:
# Show the hyper-parameters retained by the search loop.
best_settings

{'colsample_bytree': 0.6195632033944107,
 'gamma': 9.46917152912689,
 'max_depth': 25.0,
 'min_child_weight': 6.0,
 'n_estimators': 110.0,
 'reg_alpha': 50.0}

In [39]:
# from sklearn.model_selection import RandomizedSearchCV
# params = { 'max_depth': [9, 10, 11],
#            'learning_rate': [0.01,0.015, 0.02],
#            'subsample': np.arange(0.7, 0.8, 0.05),
#            'colsample_bytree': np.arange(0.5, 0.9, 0.1),
#            'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
#            'n_estimators': [230, 240, 250]}
# xgbr = xgb.XGBRegressor(seed = 20)
# clf = RandomizedSearchCV(estimator=xgbr,
#                          param_distributions=params,
#                          scoring='neg_mean_squared_error',
#                          n_iter=25,
#                          verbose=1)
# clf.fit(X, y)
# print("Best parameters:", clf.best_params_)
# print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

In [65]:
# best!!
# params = {'colsample_bytree': 0.7425090821378345,
#  'gamma': 8.545654866461705,
#  'max_depth': 35,
#  'min_child_weight': 12,
#  'n_estimators': 420,
#  'reg_alpha': 120}

# params = {'colsample_bytree': 0.49052308187216853,
#  'gamma': 4.2350263144815425,
#  'max_depth': 15,
#  'min_child_weight': 9,
#  'n_estimators': 130,
#  'reg_alpha': 95}

# {'colsample_bytree': 0.6848985371564676,
#  'gamma': 2.662303553904137,
#  'max_depth': 55,
#  'min_child_weight': 9,
#  'n_estimators': 150,
#  'reg_alpha': 215}

# Hard-coded copy of the `best_settings` values displayed above, with the
# whole-number entries written as ints.
params = {'colsample_bytree': 0.6195632033944107,
 'gamma': 9.46917152912689,
 'max_depth': 25,
 'min_child_weight': 6,
 'n_estimators': 110,
 'reg_alpha': 50}

# Fit on every training row; the trailing fit() call also displays the estimator.
xgb_regressor = xgb.XGBRegressor(**params)
xgb_regressor.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6195632033944107,
             enable_categorical=False, gamma=9.46917152912689, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=25,
             min_child_weight=6, missing=nan, monotone_constraints='()',
             n_estimators=110, n_jobs=8, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=50, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [66]:
# xgb_regressor.fit(X, y)
# from sklearn.tree import DecisionTreeRegressor
# from explainerdashboard import RegressionExplainer
# explainer = RegressionExplainer(xgb_regressor, X, y)
# from explainerdashboard import ExplainerDashboard
# ExplainerDashboard(explainer).run()

In [67]:
# test_df_clean1 = DataPreprocessor.data_preprocessing_v1(test_df, test=True, uncertain=False, drop_na=False, remove_original_attributes=True)
# test_df_clean1

In [43]:
# NOTE(review): this repeats, with identical arguments, the test-set preprocessing
# call made in the earlier cell that first built `test_df_clean`; it recomputes the
# same frame (the saved progress output shows it takes over a minute).
test_df_clean = DataPreprocessor.data_preprocessing_v2(
    test_df.reset_index(), 
    test=True, 
    uncertain=False, 
    drop_na=False, 
    remove_original_attributes=True,
    if_auxiliary=True,
    sg_commercial_centres_df=sg_commercial_centres_df,
    sg_primary_schools_df=sg_primary_schools_df,
    sg_secondary_schools_df=sg_secondary_schools_df,
    sg_shopping_mall_df=sg_shopping_mall_df
)

Processed function: Function 'preprocess_available_unit_types' executed in 1.6840s & Processing function: preprocess_planning_area: 100%|██████████| 14/14 [00:17<00:00,  1.22s/it]
Processed function: Function 'preprocess_secondary_school' executed in 10.2190s & Processing function: preprocess_shopping_mall: 100%|██████████| 4/4 [00:54<00:00, 13.69s/it]  


In [68]:
# Sort the preprocessed test rows by the carried `index` column, renumber them and
# drop that helper column; the trailing expression displays the resulting frame.
test_df_clean_ordered=test_df_clean.sort_values("index").reset_index(drop=True).drop(columns=['index'])
test_df_clean_ordered

Unnamed: 0,built_year,num_beds,num_baths,size_sqft,lat,lng,property_type_cat_0,tenure_cat_0,tenure_cat_1,tenure_cat_2,...,IHL_cat_0,pri_sch_dist,number_of_close_pri_sch,pri_sch_cat_0,sec_sch_dist,number_of_close_sec_sch,sec_sch_cat_0,shopping_mall_dist,number_of_close_shopping_mall,shopping_mall_cat_0
0,2022.0,1.0,1.0,463,1.344334,103.878690,5.0,0.0,0.0,1.0,...,4.0,0.342815,0,144.0,0.345402,0,115.0,0.988207,0,98.0
1,2017.0,3.0,3.0,1033,1.380281,103.943878,5.0,0.0,1.0,0.0,...,1.0,1.078614,0,139.0,1.397316,0,141.0,0.282580,1,90.0
2,2007.0,1.0,3.0,570,1.294668,103.850074,5.0,0.0,1.0,0.0,...,4.0,0.345021,0,149.0,0.565801,0,117.0,0.213473,12,83.0
3,2017.0,3.0,2.0,1216,1.373120,103.746094,0.0,0.0,1.0,0.0,...,0.0,0.704299,0,27.0,1.067386,0,15.0,0.504271,1,15.0
4,1973.0,3.0,2.0,936,1.341468,103.849047,0.0,0.0,1.0,0.0,...,4.0,0.420901,0,86.0,0.360447,0,4.0,1.015136,0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6961,2009.0,5.0,6.0,10000,1.249534,103.844281,14.0,0.0,1.0,0.0,...,4.0,2.963650,0,162.0,3.874357,0,132.0,2.889320,0,149.0
6962,2010.0,5.0,7.0,6973,1.332492,103.800004,13.0,0.0,0.0,1.0,...,2.0,0.743945,0,170.0,0.322473,0,125.0,0.967857,0,129.0
6963,2025.0,3.0,3.0,1141,1.294106,103.836735,5.0,0.0,0.0,1.0,...,4.0,1.443341,0,169.0,0.872009,0,121.0,0.532290,3,124.0
6964,1975.0,3.0,1.0,818,1.303736,103.911438,0.0,0.0,1.0,0.0,...,4.0,0.148005,1,12.0,0.749617,0,2.0,0.729364,4,77.0


In [79]:
# Hyper-parameters used for the submission model.
params = dict(
    colsample_bytree=0.9988638693168667,
    gamma=4.626267990192529,
    max_depth=10,
    min_child_weight=29,
    n_estimators=440,
    reg_alpha=148,
)

# Earlier candidate settings, kept for reference:
# params = {'colsample_bytree': 0.49052308187216853,
#  'gamma': 4.2350263144815425,
#  'max_depth': 15,
#  'min_child_weight': 9,
#  'n_estimators': 130,
#  'reg_alpha': 95}

# params = {'colsample_bytree': 0.6848985371564676,
#  'gamma': 2.662303553904137,
#  'max_depth': 55,
#  'min_child_weight': 9,
#  'n_estimators': 150,
#  'reg_alpha': 215}

# Train on every training row (fit() hands back the fitted estimator).
xgb_regressor = xgb.XGBRegressor(**params).fit(X, y)

# Test features in their original row order, cast to float like the training features.
test_df_model = test_df_clean_ordered  # .drop(['lat', 'lng'], axis=1,inplace=False)
X_test = test_df_model.astype(float)

# Submission frame: the positional index becomes `Id`, the prediction column `Predicted`.
output = (
    pd.DataFrame(xgb_regressor.predict(X_test))
    .reset_index()
    .rename(columns={"index": "Id", 0: "Predicted"})
)

In [80]:
# Write the submission file (columns Id, Predicted) without the DataFrame index.
output.to_csv('./auxiliary/submission.csv', index=False)

In [81]:
# Preview the first rows of the submission frame.
output.head()

Unnamed: 0,Id,Predicted
0,0,1123513.0
1,1,1591084.0
2,2,1360562.0
3,3,684877.6
4,4,586035.1


In [None]:
from IPython.display import clear_output

# Repeated random-forest hyper-parameter search on the NaN-free data.
# Each round: draw a fresh hold-out split, let hyperopt tune on it, then judge the
# winning settings with repeated 10-fold cross-validation (mean R^2).
# The loop ends when the mean R^2 beats `best_score` or after the round with
# counter == 100.
best_score = 0.8317589525052792  # target: stop once the mean CV R^2 exceeds this
score_temp = 0                   # highest mean CV R^2 seen so far
counter = 0                      # number of completed, non-terminating rounds
best_settings = {}               # hyper-parameters that produced `score_temp`
while True:
    X_train, X_test, y_train, y_test = train_test_split(X_nonNa, y_nonNa)
    obj = HPOpt(X_train, X_test, y_train, y_test)
    random_forest_opt = obj.process(fn_name='random_forest_reg', space=random_forest_para, trials=Trials(), algo=tpe.suggest, max_evals=100)
    if isinstance(random_forest_opt, dict):
        # HPOpt.process returns a status dict instead of (best, trials) when fmin raised;
        # surface that error rather than failing later with an opaque KeyError on [0].
        raise RuntimeError(f"hyperopt search failed: {random_forest_opt.get('exception')}")
    print(random_forest_opt)
    space = random_forest_opt[0]
    random_forest_regressor = RandomForestRegressor(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        min_samples_leaf=int(space['min_samples_leaf']),
        min_samples_split=int(space['min_samples_split']),
        n_jobs=-1
    )

    # Evaluate the forest that was just configured, on the same NaN-free data the
    # search used (previously this scored the unrelated `xgb_regressor` on X, y).
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(random_forest_regressor, X_nonNa, y_nonNa, scoring='r2', cv=cv, n_jobs=-1)
    clear_output(wait=True)
    if np.mean(scores) > score_temp:
        best_settings = space
        score_temp = np.mean(scores)
        print(f"score update = {score_temp}")
        if np.round(score_temp, 3) > np.round(best_score, 3):
            # Target reached: record the achieved score and stop searching.
            best_score = score_temp
            break
    if counter % 5 == 0:
        print(f"current score = {score_temp} vs best score = {best_score}")
    if counter == 100: break
    counter += 1
    print(f"counter = {counter}")

In [None]:
# Recorded parameter set whose keys match the random-forest search space
# (presumably pasted from a previous search result -- TODO confirm); the values are
# the raw float draws, which the search code truncates with int().
{'max_depth': 36.51513768750169, 'min_samples_leaf': 1.9899352241424428, 'min_samples_split': 2.0022872258452344, 'n_estimators': 197.17077017731373}