In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import optuna
import os

# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Separate features and target
X = train.drop(columns=["price_doc", "id"], errors='ignore')
y = train["price_doc"]

# Handle Outliers (Example: Remove extreme price outliers)
q1 = y.quantile(0.01)
q99 = y.quantile(0.99)
outlier_mask = (y >= q1) & (y <= q99)
X = X[outlier_mask]
y = y[outlier_mask]

# Impute missing values
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical imputation
num_imputer = SimpleImputer(strategy='median')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])
test[numerical_features] = num_imputer.transform(test[numerical_features])

# Categorical encoding and imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])
test[categorical_features] = cat_imputer.transform(test[categorical_features])

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Feature Scaling
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# XGBoost for Feature Selection
xgb_fs_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    n_estimators=100
)
xgb_fs_model.fit(X, y)

# Get feature importance scores
feature_importances = pd.Series(xgb_fs_model.feature_importances_, index=X.columns)

# Define a function to evaluate the number of top features
def evaluate_top_k_features(k):
    top_features = feature_importances.nlargest(k).index
    X_k = X[top_features]
    X_train, X_val, y_train, y_val = train_test_split(X_k, y, test_size=0.2, random_state=42)

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.05),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
            "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        }
        model = xgb.XGBRegressor(
            objective="reg:squarederror", tree_method="hist", random_state=42, **params
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=30)

    return study.best_params, study.best_value

# Evaluate different top-k features
best_k = None
best_rmse = float("inf")
best_model = None
test_predictions_best = None

for k in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:  # Test different numbers of features
    print(f"Evaluating top-{k} features...")
    params, rmse = evaluate_top_k_features(k)
    print(f"Top-{k} features: Best RMSE = {rmse:.4f} with params = {params}")
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_k = k
        # Train the model with the best parameters on all data
        top_features = feature_importances.nlargest(k).index
        X_train_full = X[top_features]
        X_train, X_val, y_train, y_val = train_test_split(X_train_full, y, test_size=0.2, random_state=42)

        best_model = xgb.XGBRegressor(
            objective="reg:squarederror", tree_method="hist", random_state=42, **params
        )
        best_model.fit(X_train_full, y)
        test_predictions_best = best_model.predict(test[top_features])

# Save the final submission file using the sample_submission structure
final_submission = sample_submission.copy()
final_submission["price_doc"] = test_predictions_best

# Prepare Submission Directory
if not os.path.exists("submissions5"):
    os.makedirs("submissions5")

# Save the final submission file
final_submission_file = f"submissions5/final_submission_top_{best_k}_features_rmse_{best_rmse:.4f}.csv"
final_submission.to_csv(final_submission_file, index=False)

print(f"Final submission file saved as {final_submission_file}")


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-26 11:19:06,759] A new study created in memory with name: no-name-ac9c236b-cc41-4c46-b2ca-64355bed7c47


Evaluating top-10 features...


[I 2024-11-26 11:19:39,097] Trial 0 finished with value: 11853122.48945553 and parameters: {'n_estimators': 723, 'max_depth': 14, 'learning_rate': 0.04240300716253512, 'subsample': 0.698879965625048, 'colsample_bytree': 0.850835095080857, 'gamma': 3.7820923127655064, 'reg_alpha': 0.2949933753586558, 'reg_lambda': 0.1496604493706618}. Best is trial 0 with value: 11853122.48945553.
[I 2024-11-26 11:19:39,989] Trial 1 finished with value: 11712574.575729195 and parameters: {'n_estimators': 273, 'max_depth': 3, 'learning_rate': 0.047936228341382975, 'subsample': 0.9241106695685668, 'colsample_bytree': 0.745871238421505, 'gamma': 0.786446555149708, 'reg_alpha': 0.40607833231843826, 'reg_lambda': 0.6455466002430947}. Best is trial 1 with value: 11712574.575729195.
[I 2024-11-26 11:19:42,544] Trial 2 finished with value: 11798499.169527147 and parameters: {'n_estimators': 466, 'max_depth': 7, 'learning_rate': 0.028620497414232022, 'subsample': 0.6755251961650299, 'colsample_bytree': 0.9306006

Top-10 features: Best RMSE = 11623925.9813 with params = {'n_estimators': 538, 'max_depth': 14, 'learning_rate': 0.006171866168655084, 'subsample': 0.7624987554572065, 'colsample_bytree': 0.8067352944959315, 'gamma': 4.0747682395302025, 'reg_alpha': 0.8642525133472556, 'reg_lambda': 0.5285025509778825}


[I 2024-11-26 11:25:07,825] A new study created in memory with name: no-name-077c648b-f0ee-46bc-97c5-01cbf5155ae6


Evaluating top-20 features...


[I 2024-11-26 11:25:11,454] Trial 0 finished with value: 11681588.610352216 and parameters: {'n_estimators': 115, 'max_depth': 11, 'learning_rate': 0.039080453544491224, 'subsample': 0.6821465651355672, 'colsample_bytree': 0.8642146885565127, 'gamma': 2.4146871229275275, 'reg_alpha': 0.8385757190602112, 'reg_lambda': 0.725117647686715}. Best is trial 0 with value: 11681588.610352216.
[I 2024-11-26 11:25:15,174] Trial 1 finished with value: 11677271.747147456 and parameters: {'n_estimators': 762, 'max_depth': 5, 'learning_rate': 0.011638774394022382, 'subsample': 0.8393594161314082, 'colsample_bytree': 0.7278838108616051, 'gamma': 1.5421458900186762, 'reg_alpha': 0.1847866178476777, 'reg_lambda': 0.33941162235919575}. Best is trial 1 with value: 11677271.747147456.
[I 2024-11-26 11:25:23,933] Trial 2 finished with value: 13373690.073102754 and parameters: {'n_estimators': 374, 'max_depth': 10, 'learning_rate': 0.002402936012968041, 'subsample': 0.9765307675197737, 'colsample_bytree': 0.

Top-20 features: Best RMSE = 11568479.1261 with params = {'n_estimators': 474, 'max_depth': 14, 'learning_rate': 0.007250437825168963, 'subsample': 0.935796495240531, 'colsample_bytree': 0.6890057360034052, 'gamma': 0.868725789069803, 'reg_alpha': 0.23334306689435536, 'reg_lambda': 0.15294204491984847}


[I 2024-11-26 11:39:55,178] A new study created in memory with name: no-name-fc04b9d5-5bac-4dfb-a60e-4deca90b3497


Evaluating top-30 features...


[I 2024-11-26 11:39:58,277] Trial 0 finished with value: 11755595.320199074 and parameters: {'n_estimators': 728, 'max_depth': 3, 'learning_rate': 0.04624731607044253, 'subsample': 0.8959770363968204, 'colsample_bytree': 0.6727703338634022, 'gamma': 1.9118002196579935, 'reg_alpha': 0.6936064774999169, 'reg_lambda': 0.36284742974647555}. Best is trial 0 with value: 11755595.320199074.
[I 2024-11-26 11:40:02,400] Trial 1 finished with value: 11656767.53480433 and parameters: {'n_estimators': 416, 'max_depth': 7, 'learning_rate': 0.020786892800811804, 'subsample': 0.8502974946569559, 'colsample_bytree': 0.7712304540103672, 'gamma': 3.7561721589870656, 'reg_alpha': 0.30139910682671456, 'reg_lambda': 0.861748587657731}. Best is trial 1 with value: 11656767.53480433.
[I 2024-11-26 11:40:04,889] Trial 2 finished with value: 11841029.632259183 and parameters: {'n_estimators': 503, 'max_depth': 3, 'learning_rate': 0.005003219697465913, 'subsample': 0.7553758052490211, 'colsample_bytree': 0.6856

Top-30 features: Best RMSE = 11549081.1322 with params = {'n_estimators': 971, 'max_depth': 15, 'learning_rate': 0.015714972703938433, 'subsample': 0.9971904070152132, 'colsample_bytree': 0.6000296527419543, 'gamma': 4.7597832411231185, 'reg_alpha': 0.5433321567982826, 'reg_lambda': 0.020590360542651953}


[I 2024-11-26 12:08:38,433] A new study created in memory with name: no-name-bba3c9a2-7f8f-4d4a-a130-893008eab01a


Evaluating top-40 features...


[I 2024-11-26 12:08:56,759] Trial 0 finished with value: 11834950.25527424 and parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.04784388303701776, 'subsample': 0.7977790851565285, 'colsample_bytree': 0.6654475051814644, 'gamma': 1.0390880290615967, 'reg_alpha': 0.1197661311579451, 'reg_lambda': 0.3042930922499131}. Best is trial 0 with value: 11834950.25527424.
[I 2024-11-26 12:09:05,196] Trial 1 finished with value: 11659397.732650861 and parameters: {'n_estimators': 870, 'max_depth': 6, 'learning_rate': 0.007384508158893317, 'subsample': 0.8845419868250178, 'colsample_bytree': 0.920737937555878, 'gamma': 2.482360558709379, 'reg_alpha': 0.8316793195657638, 'reg_lambda': 0.3227641273328905}. Best is trial 1 with value: 11659397.732650861.
[I 2024-11-26 12:09:33,008] Trial 2 finished with value: 11634180.248693928 and parameters: {'n_estimators': 631, 'max_depth': 11, 'learning_rate': 0.010273937754231181, 'subsample': 0.6942909358840507, 'colsample_bytree': 0.76420

Top-40 features: Best RMSE = 11561193.6036 with params = {'n_estimators': 385, 'max_depth': 14, 'learning_rate': 0.016198241112386914, 'subsample': 0.9652722865093808, 'colsample_bytree': 0.7466296428684015, 'gamma': 0.5364537713151437, 'reg_alpha': 0.9836032923948145, 'reg_lambda': 0.4731375044085797}
Evaluating top-50 features...


[I 2024-11-26 12:36:31,249] Trial 0 finished with value: 11664327.90989416 and parameters: {'n_estimators': 641, 'max_depth': 5, 'learning_rate': 0.011475516960465618, 'subsample': 0.8866874241531582, 'colsample_bytree': 0.6199711497746673, 'gamma': 2.1601400599092124, 'reg_alpha': 0.7956009875857676, 'reg_lambda': 0.3182484166167898}. Best is trial 0 with value: 11664327.90989416.
[I 2024-11-26 12:36:36,404] Trial 1 finished with value: 11766519.11485902 and parameters: {'n_estimators': 803, 'max_depth': 4, 'learning_rate': 0.03792502327798109, 'subsample': 0.6456276545529517, 'colsample_bytree': 0.9241718200312993, 'gamma': 0.9338464619756937, 'reg_alpha': 0.17089179724248238, 'reg_lambda': 0.11334994759003314}. Best is trial 0 with value: 11664327.90989416.
[I 2024-11-26 12:37:32,616] Trial 2 finished with value: 11705940.260072103 and parameters: {'n_estimators': 456, 'max_depth': 13, 'learning_rate': 0.04637703880542672, 'subsample': 0.7966765385044818, 'colsample_bytree': 0.96374

Top-50 features: Best RMSE = 11546757.2269 with params = {'n_estimators': 893, 'max_depth': 15, 'learning_rate': 0.0043751929695513125, 'subsample': 0.6940344252284879, 'colsample_bytree': 0.8741690269806949, 'gamma': 1.9827279380898586, 'reg_alpha': 0.372873675496563, 'reg_lambda': 0.05501784565981067}


[I 2024-11-26 13:06:38,711] A new study created in memory with name: no-name-5f770ab2-badf-48a9-ab85-8121993c5457


Evaluating top-60 features...


[I 2024-11-26 13:06:43,712] Trial 0 finished with value: 11760150.650082411 and parameters: {'n_estimators': 863, 'max_depth': 3, 'learning_rate': 0.04406039015898655, 'subsample': 0.6840867924367637, 'colsample_bytree': 0.6299916220161503, 'gamma': 0.4268344792045847, 'reg_alpha': 0.05387638270741879, 'reg_lambda': 0.2544174566861723}. Best is trial 0 with value: 11760150.650082411.
[I 2024-11-26 13:07:08,502] Trial 1 finished with value: 11762811.044765618 and parameters: {'n_estimators': 805, 'max_depth': 9, 'learning_rate': 0.034291204908382764, 'subsample': 0.953529487049541, 'colsample_bytree': 0.8640342721068899, 'gamma': 2.2008216615718794, 'reg_alpha': 0.12729050093197403, 'reg_lambda': 0.013642626593088458}. Best is trial 0 with value: 11760150.650082411.
[I 2024-11-26 13:07:17,443] Trial 2 finished with value: 11657735.63113191 and parameters: {'n_estimators': 624, 'max_depth': 7, 'learning_rate': 0.009350637943081629, 'subsample': 0.6076082849000619, 'colsample_bytree': 0.6

Top-60 features: Best RMSE = 11571899.2274 with params = {'n_estimators': 292, 'max_depth': 12, 'learning_rate': 0.015137915285793137, 'subsample': 0.7258799729320937, 'colsample_bytree': 0.8370362408656667, 'gamma': 2.8798569071946396, 'reg_alpha': 0.1735570587916092, 'reg_lambda': 0.9908783037322321}
Evaluating top-70 features...


[I 2024-11-26 13:20:30,414] Trial 0 finished with value: 11592645.245011128 and parameters: {'n_estimators': 359, 'max_depth': 11, 'learning_rate': 0.01091144074078695, 'subsample': 0.6973489520090123, 'colsample_bytree': 0.898854914212389, 'gamma': 0.9914294761363746, 'reg_alpha': 0.9582074122208283, 'reg_lambda': 0.25877007326867696}. Best is trial 0 with value: 11592645.245011128.
[I 2024-11-26 13:20:46,061] Trial 1 finished with value: 11732470.927159296 and parameters: {'n_estimators': 941, 'max_depth': 7, 'learning_rate': 0.015475724890943874, 'subsample': 0.7641222816312061, 'colsample_bytree': 0.8370830538412912, 'gamma': 4.422364549022956, 'reg_alpha': 0.6941772803275915, 'reg_lambda': 0.47440798508475357}. Best is trial 0 with value: 11592645.245011128.
[I 2024-11-26 13:21:08,503] Trial 2 finished with value: 11776928.543104714 and parameters: {'n_estimators': 932, 'max_depth': 8, 'learning_rate': 0.024992089870414125, 'subsample': 0.8771776538977325, 'colsample_bytree': 0.93

Top-70 features: Best RMSE = 11559738.1509 with params = {'n_estimators': 644, 'max_depth': 12, 'learning_rate': 0.005618357720174925, 'subsample': 0.7072140668881807, 'colsample_bytree': 0.9166594729730219, 'gamma': 2.4260219580577465, 'reg_alpha': 0.10206884328213371, 'reg_lambda': 0.15890319071920778}
Evaluating top-80 features...


[I 2024-11-26 13:48:18,659] A new study created in memory with name: no-name-b82453bf-c029-412e-978e-e6ca6ee4aa58
[I 2024-11-26 13:48:50,233] Trial 0 finished with value: 11743790.59517634 and parameters: {'n_estimators': 523, 'max_depth': 10, 'learning_rate': 0.02811774554828561, 'subsample': 0.7151429330510728, 'colsample_bytree': 0.9922410100739827, 'gamma': 0.1441716930621162, 'reg_alpha': 0.05981487003097863, 'reg_lambda': 0.37030418668849874}. Best is trial 0 with value: 11743790.59517634.
[I 2024-11-26 13:49:23,790] Trial 1 finished with value: 11640596.644970194 and parameters: {'n_estimators': 440, 'max_depth': 11, 'learning_rate': 0.018454525225894483, 'subsample': 0.8997424676018171, 'colsample_bytree': 0.707083407947934, 'gamma': 0.707311839186443, 'reg_alpha': 0.5724976733886997, 'reg_lambda': 0.8451359451014407}. Best is trial 1 with value: 11640596.644970194.
[I 2024-11-26 13:49:25,760] Trial 2 finished with value: 11685796.676544413 and parameters: {'n_estimators': 163,

Top-80 features: Best RMSE = 11560115.4433 with params = {'n_estimators': 678, 'max_depth': 15, 'learning_rate': 0.008974509844067908, 'subsample': 0.7898094774202821, 'colsample_bytree': 0.9034302916337132, 'gamma': 2.408130519984519, 'reg_alpha': 0.91954931048039, 'reg_lambda': 0.597416762454551}
Evaluating top-90 features...


[I 2024-11-26 14:34:35,979] A new study created in memory with name: no-name-07077c68-2b5d-4f2f-83e8-0b6c270b5ab6
[I 2024-11-26 14:35:02,813] Trial 0 finished with value: 12109369.657416519 and parameters: {'n_estimators': 267, 'max_depth': 11, 'learning_rate': 0.005702824700373133, 'subsample': 0.6046103431736545, 'colsample_bytree': 0.8438351234688313, 'gamma': 2.5337193789777697, 'reg_alpha': 0.026174429011803224, 'reg_lambda': 0.8462665610430956}. Best is trial 0 with value: 12109369.657416519.
[I 2024-11-26 14:35:33,107] Trial 1 finished with value: 11712230.33618503 and parameters: {'n_estimators': 218, 'max_depth': 12, 'learning_rate': 0.04237174535160784, 'subsample': 0.6745853153586069, 'colsample_bytree': 0.9862343564482955, 'gamma': 1.8291296013587022, 'reg_alpha': 0.19681601083316969, 'reg_lambda': 0.5736716615467293}. Best is trial 1 with value: 11712230.33618503.
[I 2024-11-26 14:36:54,236] Trial 2 finished with value: 11635101.720193718 and parameters: {'n_estimators': 5

Top-90 features: Best RMSE = 11557940.0108 with params = {'n_estimators': 370, 'max_depth': 14, 'learning_rate': 0.010213130303822225, 'subsample': 0.6765603269622176, 'colsample_bytree': 0.9189316792044802, 'gamma': 3.6780229374628495, 'reg_alpha': 0.6409528347226071, 'reg_lambda': 0.31479542014006545}
Evaluating top-100 features...


[I 2024-11-26 15:12:10,743] A new study created in memory with name: no-name-76ed4ffa-2753-4922-8e34-366c3d3bef29
[I 2024-11-26 15:13:34,868] Trial 0 finished with value: 11657826.47039754 and parameters: {'n_estimators': 288, 'max_depth': 14, 'learning_rate': 0.0447945815278724, 'subsample': 0.7173322182423058, 'colsample_bytree': 0.8221114304827797, 'gamma': 4.279839980691496, 'reg_alpha': 0.08456985361237912, 'reg_lambda': 0.02459865739905387}. Best is trial 0 with value: 11657826.47039754.
[I 2024-11-26 15:13:43,636] Trial 1 finished with value: 11667521.27685338 and parameters: {'n_estimators': 630, 'max_depth': 5, 'learning_rate': 0.011411909214257668, 'subsample': 0.6261826269733218, 'colsample_bytree': 0.8381039760942728, 'gamma': 1.4235459829204278, 'reg_alpha': 0.5007518136644165, 'reg_lambda': 0.22664618710067752}. Best is trial 0 with value: 11657826.47039754.
[I 2024-11-26 15:14:20,350] Trial 2 finished with value: 11629668.432691434 and parameters: {'n_estimators': 576, '

Top-100 features: Best RMSE = 11510181.6748 with params = {'n_estimators': 638, 'max_depth': 13, 'learning_rate': 0.009032888165546774, 'subsample': 0.9591515613470262, 'colsample_bytree': 0.6844171793158738, 'gamma': 4.87906163439442, 'reg_alpha': 0.40782446819729457, 'reg_lambda': 0.2841053248975898}
Final submission file saved as submissions5/final_submission_top_100_features_rmse_11510181.6748.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import optuna
import os

# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Separate features and target
X = train.drop(columns=["price_doc", "id"], errors='ignore')
y = train["price_doc"]

# Handle Outliers (Example: Remove extreme price outliers)
q1 = y.quantile(0.01)
q99 = y.quantile(0.99)
outlier_mask = (y >= q1) & (y <= q99)
X = X[outlier_mask]
y = y[outlier_mask]

# Impute missing values
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical imputation
num_imputer = SimpleImputer(strategy='median')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])
test[numerical_features] = num_imputer.transform(test[numerical_features])

# Categorical encoding and imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])
test[categorical_features] = cat_imputer.transform(test[categorical_features])

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Feature Scaling
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# XGBoost for Feature Selection
xgb_fs_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    n_estimators=100
)
xgb_fs_model.fit(X, y)

# Get feature importance scores
feature_importances = pd.Series(xgb_fs_model.feature_importances_, index=X.columns)

# Define a function to evaluate the number of top features
def evaluate_top_k_features(k):
    top_features = feature_importances.nlargest(k).index
    X_k = X[top_features]
    X_train, X_val, y_train, y_val = train_test_split(X_k, y, test_size=0.2, random_state=42)

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.05),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
            "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        }
        model = xgb.XGBRegressor(
            objective="reg:squarederror", tree_method="hist", random_state=42, **params
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=30)

    return study.best_params, study.best_value

# Evaluate different top-k features
best_k = None
best_rmse = float("inf")
best_model = None
test_predictions_best = None

for k in [110, 120, 130, 140, 150, 160, 170, 180, 190, 200]:  # Test different numbers of features
    print(f"Evaluating top-{k} features...")
    params, rmse = evaluate_top_k_features(k)
    print(f"Top-{k} features: Best RMSE = {rmse:.4f} with params = {params}")
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_k = k
        # Train the model with the best parameters on all data
        top_features = feature_importances.nlargest(k).index
        X_train_full = X[top_features]
        X_train, X_val, y_train, y_val = train_test_split(X_train_full, y, test_size=0.2, random_state=42)

        best_model = xgb.XGBRegressor(
            objective="reg:squarederror", tree_method="hist", random_state=42, **params
        )
        best_model.fit(X_train_full, y)
        test_predictions_best = best_model.predict(test[top_features])

# Save the final submission file using the sample_submission structure
final_submission = sample_submission.copy()
final_submission["price_doc"] = test_predictions_best

# Prepare Submission Directory
if not os.path.exists("submissions5"):
    os.makedirs("submissions5")

# Save the final submission file
final_submission_file = f"submissions5/final_submission_top_{best_k}_features_rmse_{best_rmse:.4f}.csv"
final_submission.to_csv(final_submission_file, index=False)

print(f"Final submission file saved as {final_submission_file}")


Evaluating top-110 features...


[I 2024-11-26 17:44:12,103] A new study created in memory with name: no-name-8ca6ff2b-554c-4ef7-bd51-df2777f80649
[I 2024-11-26 17:45:30,412] Trial 0 finished with value: 11562127.074038265 and parameters: {'n_estimators': 373, 'max_depth': 13, 'learning_rate': 0.016135069206283093, 'subsample': 0.969509695908804, 'colsample_bytree': 0.6449701497452142, 'gamma': 4.762292087863329, 'reg_alpha': 0.05642548077680365, 'reg_lambda': 0.2940976783748539}. Best is trial 0 with value: 11562127.074038265.
[I 2024-11-26 17:47:40,959] Trial 1 finished with value: 11678641.720181199 and parameters: {'n_estimators': 512, 'max_depth': 14, 'learning_rate': 0.0366336098021867, 'subsample': 0.6898289423523121, 'colsample_bytree': 0.739817455197801, 'gamma': 3.650466701483353, 'reg_alpha': 0.5210372178060307, 'reg_lambda': 0.2927125754594764}. Best is trial 0 with value: 11562127.074038265.
[I 2024-11-26 17:49:28,055] Trial 2 finished with value: 11545626.993324464 and parameters: {'n_estimators': 249, '

Top-110 features: Best RMSE = 11505893.1348 with params = {'n_estimators': 966, 'max_depth': 14, 'learning_rate': 0.005433020622158103, 'subsample': 0.9467626338208265, 'colsample_bytree': 0.6653981017646022, 'gamma': 4.5147629818758155, 'reg_alpha': 0.08064963132741576, 'reg_lambda': 0.1282605544799642}
Evaluating top-120 features...


[I 2024-11-26 18:46:43,783] A new study created in memory with name: no-name-7ab2e142-2ea3-4075-9beb-e6a9e29a74d8
[I 2024-11-26 18:46:51,810] Trial 0 finished with value: 11711270.137047278 and parameters: {'n_estimators': 750, 'max_depth': 3, 'learning_rate': 0.022868730504128937, 'subsample': 0.7131948228705065, 'colsample_bytree': 0.7504294905672823, 'gamma': 2.138752833816548, 'reg_alpha': 0.7333531217368262, 'reg_lambda': 0.01278973129440275}. Best is trial 0 with value: 11711270.137047278.
[I 2024-11-26 18:47:29,966] Trial 1 finished with value: 11588387.91966546 and parameters: {'n_estimators': 630, 'max_depth': 9, 'learning_rate': 0.008920122662599297, 'subsample': 0.9136108042778269, 'colsample_bytree': 0.9322696785528558, 'gamma': 2.485741585850434, 'reg_alpha': 0.5650388636154775, 'reg_lambda': 0.6731036809007764}. Best is trial 1 with value: 11588387.91966546.
[I 2024-11-26 18:49:00,784] Trial 2 finished with value: 11661431.27568049 and parameters: {'n_estimators': 792, 'm

Top-120 features: Best RMSE = 11512710.1295 with params = {'n_estimators': 589, 'max_depth': 13, 'learning_rate': 0.00632400532912893, 'subsample': 0.8875038586752022, 'colsample_bytree': 0.8076848572010236, 'gamma': 0.3888453931576725, 'reg_alpha': 0.79702474696123, 'reg_lambda': 0.01984297155665038}
Evaluating top-130 features...


[I 2024-11-26 19:36:55,273] A new study created in memory with name: no-name-ed34569b-ca9b-4d47-9bea-21c11a73a453
[I 2024-11-26 19:37:03,900] Trial 0 finished with value: 11608190.919754405 and parameters: {'n_estimators': 133, 'max_depth': 9, 'learning_rate': 0.03222064132199402, 'subsample': 0.6865307963013069, 'colsample_bytree': 0.7639399132861783, 'gamma': 1.7777397523739809, 'reg_alpha': 0.6955201612076404, 'reg_lambda': 0.5193695749789631}. Best is trial 0 with value: 11608190.919754405.
[I 2024-11-26 19:37:12,075] Trial 1 finished with value: 11736964.83978931 and parameters: {'n_estimators': 364, 'max_depth': 6, 'learning_rate': 0.04667944031850577, 'subsample': 0.7557128593459789, 'colsample_bytree': 0.8649089894825406, 'gamma': 0.9769025885326782, 'reg_alpha': 0.4557153060517317, 'reg_lambda': 0.9157515818516372}. Best is trial 0 with value: 11608190.919754405.
[I 2024-11-26 19:37:16,640] Trial 2 finished with value: 11676519.464775335 and parameters: {'n_estimators': 177, '

Top-130 features: Best RMSE = 11484932.8166 with params = {'n_estimators': 773, 'max_depth': 14, 'learning_rate': 0.009539656254387099, 'subsample': 0.9108227873644054, 'colsample_bytree': 0.6945334324915096, 'gamma': 2.578619413748529, 'reg_alpha': 0.32633217383265145, 'reg_lambda': 0.1710638849711445}
Evaluating top-140 features...


[I 2024-11-26 20:49:44,692] A new study created in memory with name: no-name-38a4c2ad-802a-4424-9783-ec960d07ae18
[I 2024-11-26 20:50:21,684] Trial 0 finished with value: 13182769.181031574 and parameters: {'n_estimators': 671, 'max_depth': 8, 'learning_rate': 0.0014268159796936794, 'subsample': 0.9455342344542663, 'colsample_bytree': 0.9709475173696474, 'gamma': 3.483851824110684, 'reg_alpha': 0.31437894838384506, 'reg_lambda': 0.08354897151170049}. Best is trial 0 with value: 13182769.181031574.
[I 2024-11-26 20:50:24,239] Trial 1 finished with value: 11712862.12496737 and parameters: {'n_estimators': 130, 'max_depth': 3, 'learning_rate': 0.033516927592254386, 'subsample': 0.9993079919566437, 'colsample_bytree': 0.6272856291826648, 'gamma': 2.4209775199540036, 'reg_alpha': 0.3437788861606348, 'reg_lambda': 0.544199842651634}. Best is trial 1 with value: 11712862.12496737.
[I 2024-11-26 20:51:19,736] Trial 2 finished with value: 11645217.146465927 and parameters: {'n_estimators': 818,

Top-140 features: Best RMSE = 11510546.8400 with params = {'n_estimators': 893, 'max_depth': 14, 'learning_rate': 0.007578832562438026, 'subsample': 0.9007020052237277, 'colsample_bytree': 0.6727194492968123, 'gamma': 4.478405016202928, 'reg_alpha': 0.7385616702611096, 'reg_lambda': 0.6224639516976149}
Evaluating top-150 features...


[I 2024-11-26 22:11:01,307] A new study created in memory with name: no-name-a411d8e3-4352-4bef-b49e-a091e72ebe46
[I 2024-11-26 22:11:20,107] Trial 0 finished with value: 12350610.489394372 and parameters: {'n_estimators': 152, 'max_depth': 10, 'learning_rate': 0.008759062308912297, 'subsample': 0.7117316504442052, 'colsample_bytree': 0.8724552932834984, 'gamma': 2.1752940520704076, 'reg_alpha': 0.6087081317160373, 'reg_lambda': 0.9534250137104007}. Best is trial 0 with value: 12350610.489394372.
[I 2024-11-26 22:13:20,259] Trial 1 finished with value: 11574498.160440294 and parameters: {'n_estimators': 980, 'max_depth': 11, 'learning_rate': 0.02440394616340625, 'subsample': 0.9812339822288213, 'colsample_bytree': 0.6652244702894762, 'gamma': 1.047594684558209, 'reg_alpha': 0.6170940970958124, 'reg_lambda': 0.8987021658581495}. Best is trial 1 with value: 11574498.160440294.
[I 2024-11-26 22:13:41,442] Trial 2 finished with value: 11654152.15227246 and parameters: {'n_estimators': 151,

Top-150 features: Best RMSE = 11500293.0076 with params = {'n_estimators': 489, 'max_depth': 14, 'learning_rate': 0.009930748156263335, 'subsample': 0.9081035902877045, 'colsample_bytree': 0.7187344546691806, 'gamma': 3.15386306832049, 'reg_alpha': 0.4478357197315154, 'reg_lambda': 0.30584449580007506}
Evaluating top-160 features...


[I 2024-11-26 23:12:41,136] A new study created in memory with name: no-name-6bf5608c-3f19-4a1a-a756-8a05f71ba40f
[I 2024-11-26 23:13:47,452] Trial 0 finished with value: 11641464.152118945 and parameters: {'n_estimators': 651, 'max_depth': 10, 'learning_rate': 0.036126055009715496, 'subsample': 0.8022313569460447, 'colsample_bytree': 0.8197234863854148, 'gamma': 4.315904083827556, 'reg_alpha': 0.3170584112718491, 'reg_lambda': 0.4529677031996606}. Best is trial 0 with value: 11641464.152118945.
[I 2024-11-26 23:14:01,862] Trial 1 finished with value: 11695768.494212097 and parameters: {'n_estimators': 906, 'max_depth': 4, 'learning_rate': 0.03021577914524595, 'subsample': 0.9474318286289168, 'colsample_bytree': 0.6755423254946641, 'gamma': 2.8001216494391534, 'reg_alpha': 0.19777517761255214, 'reg_lambda': 0.511723414847199}. Best is trial 0 with value: 11641464.152118945.
[I 2024-11-26 23:19:25,432] Trial 2 finished with value: 11568123.532629406 and parameters: {'n_estimators': 656,

Top-160 features: Best RMSE = 11514088.1282 with params = {'n_estimators': 794, 'max_depth': 14, 'learning_rate': 0.006745997349808073, 'subsample': 0.8134417706774906, 'colsample_bytree': 0.8485036447436455, 'gamma': 3.9924393914659593, 'reg_alpha': 0.4567901264875281, 'reg_lambda': 0.4344458637593278}
Evaluating top-170 features...


[I 2024-11-27 00:29:07,045] A new study created in memory with name: no-name-a838c976-76ff-4989-9385-a8e9845b837e
[I 2024-11-27 00:30:13,076] Trial 0 finished with value: 11672202.654310504 and parameters: {'n_estimators': 932, 'max_depth': 9, 'learning_rate': 0.029576917079233007, 'subsample': 0.6054247760669629, 'colsample_bytree': 0.7711294530109262, 'gamma': 1.4707706090803292, 'reg_alpha': 0.8119797267633556, 'reg_lambda': 0.44910958481031593}. Best is trial 0 with value: 11672202.654310504.
[I 2024-11-27 00:30:38,502] Trial 1 finished with value: 11831551.893755706 and parameters: {'n_estimators': 920, 'max_depth': 6, 'learning_rate': 0.04016325658855304, 'subsample': 0.6305333923251254, 'colsample_bytree': 0.9748252622900989, 'gamma': 4.87725726883414, 'reg_alpha': 0.9982592261965898, 'reg_lambda': 0.07999007599683139}. Best is trial 0 with value: 11672202.654310504.
[I 2024-11-27 00:31:01,387] Trial 2 finished with value: 11615889.27264895 and parameters: {'n_estimators': 297, 

Top-170 features: Best RMSE = 11500545.9154 with params = {'n_estimators': 993, 'max_depth': 15, 'learning_rate': 0.008381988931921125, 'subsample': 0.6617064347483874, 'colsample_bytree': 0.6014170829154828, 'gamma': 2.050582610968801, 'reg_alpha': 0.18181939350210746, 'reg_lambda': 0.8606409260407952}
Evaluating top-180 features...


[I 2024-11-27 02:00:56,965] A new study created in memory with name: no-name-87fafbe2-6df9-4f51-8047-5c5c37bd1cce
[I 2024-11-27 02:01:28,238] Trial 0 finished with value: 11693498.769906325 and parameters: {'n_estimators': 856, 'max_depth': 7, 'learning_rate': 0.022623492848552904, 'subsample': 0.7179573848657256, 'colsample_bytree': 0.7136753859605762, 'gamma': 0.3313020840949199, 'reg_alpha': 0.7935123221612084, 'reg_lambda': 0.08688825471534078}. Best is trial 0 with value: 11693498.769906325.
[I 2024-11-27 02:01:36,068] Trial 1 finished with value: 11706790.014057484 and parameters: {'n_estimators': 446, 'max_depth': 3, 'learning_rate': 0.03976047448219046, 'subsample': 0.855189877647649, 'colsample_bytree': 0.9329165004153519, 'gamma': 2.942958206166174, 'reg_alpha': 0.0066241526106661786, 'reg_lambda': 0.6995206904745587}. Best is trial 0 with value: 11693498.769906325.
[I 2024-11-27 02:04:38,041] Trial 2 finished with value: 11562932.926883126 and parameters: {'n_estimators': 81

Top-180 features: Best RMSE = 11504912.6203 with params = {'n_estimators': 986, 'max_depth': 14, 'learning_rate': 0.00452717139253509, 'subsample': 0.6850521380601174, 'colsample_bytree': 0.702550953976305, 'gamma': 0.20964625840740525, 'reg_alpha': 0.06948269856327328, 'reg_lambda': 0.4165429364862752}
Evaluating top-190 features...


[I 2024-11-27 03:25:59,180] A new study created in memory with name: no-name-47fb3ac3-d90f-4d79-b6ae-3eefc07a8d17
[I 2024-11-27 03:27:06,277] Trial 0 finished with value: 11661760.029440949 and parameters: {'n_estimators': 558, 'max_depth': 10, 'learning_rate': 0.04253876794680194, 'subsample': 0.893568757236596, 'colsample_bytree': 0.7921818019109343, 'gamma': 3.3312768849900736, 'reg_alpha': 0.13351225017009982, 'reg_lambda': 0.6016886744997723}. Best is trial 0 with value: 11661760.029440949.
[I 2024-11-27 03:28:36,617] Trial 1 finished with value: 17455942.698841203 and parameters: {'n_estimators': 152, 'max_depth': 15, 'learning_rate': 0.0015153912112042468, 'subsample': 0.6106346771251367, 'colsample_bytree': 0.6018562073765931, 'gamma': 2.182969146002536, 'reg_alpha': 0.10508613858973248, 'reg_lambda': 0.8190393411434175}. Best is trial 0 with value: 11661760.029440949.
[I 2024-11-27 03:29:00,799] Trial 2 finished with value: 11728192.08059442 and parameters: {'n_estimators': 78

Top-190 features: Best RMSE = 11476342.2368 with params = {'n_estimators': 725, 'max_depth': 15, 'learning_rate': 0.010226887130093398, 'subsample': 0.8140286256566845, 'colsample_bytree': 0.6874697388201212, 'gamma': 0.7388217185974744, 'reg_alpha': 0.7715128316223478, 'reg_lambda': 0.7549000911002367}
Evaluating top-200 features...


[I 2024-11-27 05:12:35,853] A new study created in memory with name: no-name-e213ffee-c6b4-48eb-b5f9-708fd2b139b0
[I 2024-11-27 05:12:42,729] Trial 0 finished with value: 11706737.658791846 and parameters: {'n_estimators': 299, 'max_depth': 3, 'learning_rate': 0.016327814416724908, 'subsample': 0.7239029541104568, 'colsample_bytree': 0.7065262257962206, 'gamma': 2.877378032717828, 'reg_alpha': 0.033263290188197736, 'reg_lambda': 0.3391454411609095}. Best is trial 0 with value: 11706737.658791846.
[I 2024-11-27 05:12:55,983] Trial 1 finished with value: 13831569.849037213 and parameters: {'n_estimators': 515, 'max_depth': 4, 'learning_rate': 0.0015560439300566606, 'subsample': 0.9941370405898702, 'colsample_bytree': 0.9951990603648336, 'gamma': 3.458526056447699, 'reg_alpha': 0.03626156006787007, 'reg_lambda': 0.5034886778170442}. Best is trial 0 with value: 11706737.658791846.
[I 2024-11-27 05:13:07,347] Trial 2 finished with value: 11724406.658323942 and parameters: {'n_estimators': 5

Top-200 features: Best RMSE = 11523689.8968 with params = {'n_estimators': 611, 'max_depth': 14, 'learning_rate': 0.02673119351299593, 'subsample': 0.9565736188062032, 'colsample_bytree': 0.8200301399481642, 'gamma': 2.3803458355586997, 'reg_alpha': 0.8391484206455042, 'reg_lambda': 0.8275267768741771}
Final submission file saved as submissions5/final_submission_top_190_features_rmse_11476342.2368.csv


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import optuna
import os

# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Separate features and target
X = train.drop(columns=["price_doc", "id"], errors='ignore')
y = train["price_doc"]

# Handle Outliers (Example: Remove extreme price outliers)
q1 = y.quantile(0.01)
q99 = y.quantile(0.99)
outlier_mask = (y >= q1) & (y <= q99)
X = X[outlier_mask]
y = y[outlier_mask]

# Impute missing values
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical imputation
num_imputer = SimpleImputer(strategy='median')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])
test[numerical_features] = num_imputer.transform(test[numerical_features])

# Categorical encoding and imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])
test[categorical_features] = cat_imputer.transform(test[categorical_features])

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Feature Scaling
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# XGBoost for Feature Selection
xgb_fs_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    n_estimators=100
)
xgb_fs_model.fit(X, y)

# Get feature importance scores
feature_importances = pd.Series(xgb_fs_model.feature_importances_, index=X.columns)

# Define a function to evaluate the number of top features
def evaluate_top_k_features(k):
    top_features = feature_importances.nlargest(k).index
    X_k = X[top_features]
    X_train, X_val, y_train, y_val = train_test_split(X_k, y, test_size=0.2, random_state=42)

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.05),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
            "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        }
        model = xgb.XGBRegressor(
            objective="reg:squarederror", tree_method="hist", random_state=42, **params
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        return rmse

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=30)

    return study.best_params, study.best_value

# Evaluate different top-k features
best_k = None
best_rmse = float("inf")
best_model = None
test_predictions_best = None

for k in [210, 220, 230, 240, 250, 260]:  # Test different numbers of features
    print(f"Evaluating top-{k} features...")
    params, rmse = evaluate_top_k_features(k)
    print(f"Top-{k} features: Best RMSE = {rmse:.4f} with params = {params}")
    
    if rmse < best_rmse:
        best_rmse = rmse
        best_k = k
        # Train the model with the best parameters on all data
        top_features = feature_importances.nlargest(k).index
        X_train_full = X[top_features]
        X_train, X_val, y_train, y_val = train_test_split(X_train_full, y, test_size=0.2, random_state=42)

        best_model = xgb.XGBRegressor(
            objective="reg:squarederror", tree_method="hist", random_state=42, **params
        )
        best_model.fit(X_train_full, y)
        test_predictions_best = best_model.predict(test[top_features])

# Save the final submission file using the sample_submission structure
final_submission = sample_submission.copy()
final_submission["price_doc"] = test_predictions_best

# Prepare Submission Directory
if not os.path.exists("submissions5"):
    os.makedirs("submissions5")

# Save the final submission file
final_submission_file = f"submissions5/final_submission_top_{best_k}_features_rmse_{best_rmse:.4f}.csv"
final_submission.to_csv(final_submission_file, index=False)

print(f"Final submission file saved as {final_submission_file}")


Evaluating top-210 features...


[I 2024-11-27 09:02:49,560] A new study created in memory with name: no-name-d383ef31-26f1-4c7d-bdcb-03332a886e25
[I 2024-11-27 09:02:55,826] Trial 0 finished with value: 11705457.488878509 and parameters: {'n_estimators': 179, 'max_depth': 5, 'learning_rate': 0.015699759609221233, 'subsample': 0.6870732612041776, 'colsample_bytree': 0.6944747539902663, 'gamma': 1.5118248994991235, 'reg_alpha': 0.43378349618444456, 'reg_lambda': 0.1810878949457877}. Best is trial 0 with value: 11705457.488878509.
[I 2024-11-27 09:06:52,704] Trial 1 finished with value: 11594441.658483226 and parameters: {'n_estimators': 419, 'max_depth': 14, 'learning_rate': 0.040982631460510316, 'subsample': 0.9878512347069321, 'colsample_bytree': 0.9269083563529666, 'gamma': 3.6250106019413844, 'reg_alpha': 0.16532307140558145, 'reg_lambda': 0.9971676903728091}. Best is trial 1 with value: 11594441.658483226.
[I 2024-11-27 09:09:05,166] Trial 2 finished with value: 11666555.837646687 and parameters: {'n_estimators': 

Top-210 features: Best RMSE = 11516324.4145 with params = {'n_estimators': 870, 'max_depth': 13, 'learning_rate': 0.03372867302760106, 'subsample': 0.9342355661317092, 'colsample_bytree': 0.8757942561491007, 'gamma': 4.015532119062435, 'reg_alpha': 0.6141549709651437, 'reg_lambda': 0.25872728482560003}
Evaluating top-220 features...


[I 2024-11-27 11:01:22,687] A new study created in memory with name: no-name-eef9fbe9-b9f3-4ab9-b1d4-143421f241dd
[I 2024-11-27 11:01:43,937] Trial 0 finished with value: 11672064.427503519 and parameters: {'n_estimators': 871, 'max_depth': 4, 'learning_rate': 0.010308075066636693, 'subsample': 0.7131997041023364, 'colsample_bytree': 0.7754169791262724, 'gamma': 3.076628548944322, 'reg_alpha': 0.3426111946321627, 'reg_lambda': 0.6868532729399124}. Best is trial 0 with value: 11672064.427503519.
[I 2024-11-27 11:11:29,636] Trial 1 finished with value: 11491328.481578592 and parameters: {'n_estimators': 926, 'max_depth': 14, 'learning_rate': 0.004479816749385142, 'subsample': 0.8361588804189354, 'colsample_bytree': 0.846621400035282, 'gamma': 1.4928382763365577, 'reg_alpha': 0.5872981384085251, 'reg_lambda': 0.10348588777796464}. Best is trial 1 with value: 11491328.481578592.
[I 2024-11-27 11:11:46,978] Trial 2 finished with value: 11597019.501777604 and parameters: {'n_estimators': 148

Top-220 features: Best RMSE = 11491328.4816 with params = {'n_estimators': 926, 'max_depth': 14, 'learning_rate': 0.004479816749385142, 'subsample': 0.8361588804189354, 'colsample_bytree': 0.846621400035282, 'gamma': 1.4928382763365577, 'reg_alpha': 0.5872981384085251, 'reg_lambda': 0.10348588777796464}
Evaluating top-230 features...


[I 2024-11-27 12:58:02,665] A new study created in memory with name: no-name-5e7b235f-59b0-4186-8580-d3a420b9372e
[I 2024-11-27 13:00:41,941] Trial 0 finished with value: 11582648.1599822 and parameters: {'n_estimators': 511, 'max_depth': 12, 'learning_rate': 0.02034458101237879, 'subsample': 0.6879692240461502, 'colsample_bytree': 0.9460042823079738, 'gamma': 1.0088184135745686, 'reg_alpha': 0.9143125047510989, 'reg_lambda': 0.978454054751973}. Best is trial 0 with value: 11582648.1599822.
[I 2024-11-27 13:02:17,099] Trial 1 finished with value: 11614448.64753556 and parameters: {'n_estimators': 286, 'max_depth': 12, 'learning_rate': 0.023890879881616173, 'subsample': 0.857558652018729, 'colsample_bytree': 0.7398196523493298, 'gamma': 3.932660898207951, 'reg_alpha': 0.2907429865632274, 'reg_lambda': 0.055591836981563114}. Best is trial 0 with value: 11582648.1599822.
[I 2024-11-27 13:02:36,864] Trial 2 finished with value: 12093788.660868803 and parameters: {'n_estimators': 170, 'max_

Top-230 features: Best RMSE = 11543891.5911 with params = {'n_estimators': 822, 'max_depth': 11, 'learning_rate': 0.004056920502964974, 'subsample': 0.8998400558535996, 'colsample_bytree': 0.7877449376824399, 'gamma': 3.774232341026812, 'reg_alpha': 0.08487935064496657, 'reg_lambda': 0.20392063205648078}
Evaluating top-240 features...


[I 2024-11-27 14:56:11,783] A new study created in memory with name: no-name-f4da809c-2c90-42f0-9c58-2bbf289bc0eb
[I 2024-11-27 14:57:16,050] Trial 0 finished with value: 11677708.349662323 and parameters: {'n_estimators': 836, 'max_depth': 8, 'learning_rate': 0.0331851384762922, 'subsample': 0.9731035208295651, 'colsample_bytree': 0.9412533077717107, 'gamma': 3.887715344968876, 'reg_alpha': 0.18612760356991298, 'reg_lambda': 0.9712548575696189}. Best is trial 0 with value: 11677708.349662323.
[I 2024-11-27 15:03:53,710] Trial 1 finished with value: 11618447.727141434 and parameters: {'n_estimators': 700, 'max_depth': 14, 'learning_rate': 0.04341807112364698, 'subsample': 0.9807692174293988, 'colsample_bytree': 0.8420241830862227, 'gamma': 1.3709885257859555, 'reg_alpha': 0.04975610859585167, 'reg_lambda': 0.9867454367837436}. Best is trial 1 with value: 11618447.727141434.
[I 2024-11-27 15:07:37,214] Trial 2 finished with value: 11542431.889408281 and parameters: {'n_estimators': 350,

Top-240 features: Best RMSE = 11527835.1995 with params = {'n_estimators': 995, 'max_depth': 12, 'learning_rate': 0.006686795095652565, 'subsample': 0.8260902071130293, 'colsample_bytree': 0.6702608157030406, 'gamma': 3.8034219603226456, 'reg_alpha': 0.42702112101980466, 'reg_lambda': 0.17762395969900624}
Evaluating top-250 features...


[I 2024-11-27 16:37:22,160] A new study created in memory with name: no-name-42e68bca-da17-4d1f-92b4-1bb878473419
[I 2024-11-27 16:37:39,196] Trial 0 finished with value: 11652229.806747235 and parameters: {'n_estimators': 466, 'max_depth': 5, 'learning_rate': 0.009999320308035027, 'subsample': 0.8902691899358455, 'colsample_bytree': 0.7785017805547635, 'gamma': 0.25860206794787677, 'reg_alpha': 0.7637957729280413, 'reg_lambda': 0.44978705299727084}. Best is trial 0 with value: 11652229.806747235.
[I 2024-11-27 16:37:46,893] Trial 1 finished with value: 11658243.315904103 and parameters: {'n_estimators': 179, 'max_depth': 5, 'learning_rate': 0.027573309128358956, 'subsample': 0.8629383588535193, 'colsample_bytree': 0.7260362915345069, 'gamma': 2.5616085500343164, 'reg_alpha': 0.6277860233717247, 'reg_lambda': 0.8977698764009324}. Best is trial 0 with value: 11652229.806747235.
[I 2024-11-27 16:38:17,462] Trial 2 finished with value: 11750205.44419666 and parameters: {'n_estimators': 73

Top-250 features: Best RMSE = 11510927.9200 with params = {'n_estimators': 247, 'max_depth': 15, 'learning_rate': 0.01780488757040003, 'subsample': 0.7744864530937439, 'colsample_bytree': 0.9971584251588652, 'gamma': 1.6158704147017455, 'reg_alpha': 0.9773849216361268, 'reg_lambda': 0.30519995887239276}
Evaluating top-260 features...


[I 2024-11-27 17:41:44,272] A new study created in memory with name: no-name-048147cd-d326-4870-9a6c-4988df1480d7
[I 2024-11-27 17:43:39,954] Trial 0 finished with value: 11634677.131602582 and parameters: {'n_estimators': 219, 'max_depth': 13, 'learning_rate': 0.03513357254453264, 'subsample': 0.6730468454203806, 'colsample_bytree': 0.9380941522653625, 'gamma': 0.22208876937933886, 'reg_alpha': 0.7568062907191622, 'reg_lambda': 0.1641728120816286}. Best is trial 0 with value: 11634677.131602582.
[I 2024-11-27 17:53:36,894] Trial 1 finished with value: 11496239.623931287 and parameters: {'n_estimators': 723, 'max_depth': 15, 'learning_rate': 0.01348227576757648, 'subsample': 0.6896765016596557, 'colsample_bytree': 0.9765448577986259, 'gamma': 0.596140543603319, 'reg_alpha': 0.8722264645148121, 'reg_lambda': 0.44658648035967174}. Best is trial 1 with value: 11496239.623931287.
[I 2024-11-27 17:54:06,685] Trial 2 finished with value: 11666212.874075463 and parameters: {'n_estimators': 99

Top-260 features: Best RMSE = 11496239.6239 with params = {'n_estimators': 723, 'max_depth': 15, 'learning_rate': 0.01348227576757648, 'subsample': 0.6896765016596557, 'colsample_bytree': 0.9765448577986259, 'gamma': 0.596140543603319, 'reg_alpha': 0.8722264645148121, 'reg_lambda': 0.44658648035967174}
Final submission file saved as submissions5/final_submission_top_220_features_rmse_11491328.4816.csv
