In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm
from xgboost import XGBRegressor
import optuna
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('data/train.csv',index_col=0)
test = pd.read_csv('data/test.csv')

In [3]:
train.isna().sum()

cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
cont11    0
cont12    0
cont13    0
cont14    0
target    0
dtype: int64

In [4]:
test.isna().sum()

id        0
cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
cont11    0
cont12    0
cont13    0
cont14    0
dtype: int64

In [5]:
X_train,X_val,y_train,y_val = train_test_split(train.drop('target',axis=1),train['target'],test_size=0.2)
X_train.shape,X_val.shape,y_train.shape,y_val.shape

((240000, 14), (60000, 14), (240000,), (60000,))

In [None]:
def rf_objective(trial):
    rf_params = {
      'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
      'max_depth': trial.suggest_int('max_depth', 2, 32),
      'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
      'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
      'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }

    rf_model=RandomForestRegressor(**rf_params)    
    rf_model.fit(X_train,y_train)

    rf_rmse = np.sqrt(mean_squared_error(y_val,rf_model.predict(X_val)))
    return rf_rmse

rf_study = optuna.create_study(direction='minimize')
rf_study.optimize(rf_objective, n_trials=50,n_jobs=-1)

In [6]:
rf_best_params = {
'n_estimators': 775, 
'max_depth': 32, 
'min_samples_split': 3, 
'min_samples_leaf': 3, 
'bootstrap': True
}

rf_best_model=RandomForestRegressor(**rf_best_params)    
rf_best_model.fit(X_train,y_train)
np.sqrt(mean_squared_error(y_val,rf_best_model.predict(X_val)))

0.7055657687699354

In [8]:
sample_submission_df = pd.read_csv('data/sample_submission.csv')
sample_submission_df['target'] = rf_best_model.predict(test.drop('id',axis=1))
sample_submission_df.to_csv('data/submission-RF.csv', index=False)
sample_submission_df.head()

Unnamed: 0,id,target
0,0,8.039456
1,2,7.790411
2,6,7.953302
3,7,8.10041
4,10,8.031611


In [None]:
def xgb_objective(trial):
    xgb_params = {
     'min_child_weight': trial.suggest_int('min_child_weight', 0, 10),
     'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
     'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
     'max_depth': trial.suggest_int('max_depth', 4, 10),
     'subsample': trial.suggest_float('subsample', 0, 1.0),
     'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1.0),
     'random_state': 42,
     'gamma': trial.suggest_float('gamma', 0, 1),
     'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
     'reg_lambda': trial.suggest_float('reg_lambda', 0, 1)
    }

    xgb_model=XGBRegressor(**xgb_params)    
    xgb_model.fit(X_train,y_train)

    xgb_rmse = np.sqrt(mean_squared_error(y_val,rf_best_model.predict(X_val)))
    return xgb_rmse

xgb_study = optuna.create_study(direction='minimize')
xgb_study.optimize(xgb_objective, n_trials=50,n_jobs=-1)

[I 2024-04-11 14:32:28,325] A new study created in memory with name: no-name-40d8c477-d43d-4a5a-89e6-4bfb0e558157
[I 2024-04-11 14:43:06,087] Trial 5 finished with value: 0.7055657687699354 and parameters: {'min_child_weight': 9, 'learning_rate': 0.08573061910161554, 'n_estimators': 835, 'max_depth': 4, 'subsample': 0.7937017757861644, 'colsample_bytree': 0.01954373399252951, 'gamma': 0.34430422755048806, 'reg_alpha': 0.29754460320610987, 'reg_lambda': 0.7324263965958584}. Best is trial 5 with value: 0.7055657687699354.
[I 2024-04-11 14:43:10,593] Trial 1 finished with value: 0.7055657687699354 and parameters: {'min_child_weight': 5, 'learning_rate': 0.0526307684313507, 'n_estimators': 698, 'max_depth': 4, 'subsample': 0.07054104936228422, 'colsample_bytree': 0.8725787053711869, 'gamma': 0.5761603483153325, 'reg_alpha': 0.40463782002842197, 'reg_lambda': 0.03459086999513139}. Best is trial 5 with value: 0.7055657687699354.
[I 2024-04-11 14:54:11,330] Trial 8 finished with value: 0.7055