In [1]:
import optuna
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold 
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error


# Single seed reused for fold shuffling and model initialisation below.
seed = 27

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Loading Data

In [2]:
# Locations of the competition CSVs inside the Kaggle input mount.
train_path = '/kaggle/input/playground-series-s4e4/train.csv'
test_path = '/kaggle/input/playground-series-s4e4/test.csv'

train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Keep the test ids aside; they are needed again when the submission is built.
test_ids = test['id']

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train.describe()

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
count,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0
mean,45307.0,0.517098,0.401679,0.135464,0.789035,0.340778,0.169422,0.225898,9.696794
std,26158.441658,0.118217,0.098026,0.038008,0.457671,0.204428,0.100909,0.130203,3.176221
min,0.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,22653.5,0.445,0.345,0.11,0.419,0.1775,0.0865,0.12,8.0
50%,45307.0,0.545,0.425,0.14,0.7995,0.33,0.166,0.225,9.0
75%,67960.5,0.6,0.47,0.16,1.0675,0.463,0.2325,0.305,11.0
max,90614.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [4]:
# Preview the first five training rows; 'Sex' is still a letter code at this point.
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


# Preprocessing

In [5]:
# Integer-encode the three 'Sex' categories, using one shared mapping for both splits.
sex_codes = {'M': 0, 'F': 1, 'I': 2}
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes)

# Continuous measurements to standardise; the encoded 'Sex' column is left untouched.
features_to_scale = [
    'Length', 'Diameter', 'Height',
    'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight',
]

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(train[features_to_scale])
train[features_to_scale] = scaler.transform(train[features_to_scale])
test[features_to_scale] = scaler.transform(test[features_to_scale])

In [6]:
# Target: the ring count. Features: every remaining column except the row id.
y = train['Rings']
X = train.drop(columns=['id', 'Rings'])

# The test split has no target column, so only the id is removed.
X_test = test.drop(columns=['id'])

# Tuning with Optuna

In [7]:
def objective(trial):
    """Optuna objective: mean cross-validated RMSLE of an XGBRegressor.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``XGBRegressor`` on each training fold of a shuffled 10-fold ``KFold``
    split of the module-level ``X`` / ``y``, and scores it on the held-out
    fold with root mean squared log error.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean RMSLE over the 10 validation folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, log=True)
    }

    # Fixed random_state: every trial is evaluated on the same fold assignment.
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)

    scores = []
    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

        model = XGBRegressor(**params, random_state=seed)
        model.fit(X_train_fold, y_train_fold)

        # mean_squared_log_error raises ValueError on negative predictions, and
        # an uncaught exception in the objective stops the whole study. Clamp
        # at zero so a single poor configuration is scored instead of aborting
        # the search; non-negative predictions are left unchanged.
        y_pred = np.clip(model.predict(X_valid_fold), 0, None)
        score = np.sqrt(mean_squared_log_error(y_valid_fold, y_pred))

        scores.append(score)

    return np.mean(scores)

In [8]:
# Seed the sampler with the notebook-wide seed so hyperparameter suggestions are
# drawn from a fixed random stream, in line with the seeded KFold and
# XGBRegressor inside the objective. TPESampler is Optuna's default sampler for
# single-objective studies, so only the seeding changes.
# NOTE: with n_jobs=-1 trials run concurrently and finish in a nondeterministic
# order, so two runs can still differ; set n_jobs=1 for fully reproducible tuning.
sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction='minimize', study_name="XGBRegressor", sampler=sampler)
study.optimize(objective, n_trials=250, n_jobs=-1)

[I 2024-04-01 10:38:43,137] A new study created in memory with name: XGBRegressor
[I 2024-04-01 10:39:58,326] Trial 3 finished with value: 0.15009069179673293 and parameters: {'n_estimators': 196, 'learning_rate': 0.051027289813334806, 'gamma': 0.6412134317567473, 'reg_alpha': 0.021118290585924605, 'reg_lambda': 0.01623336090697498, 'max_depth': 11, 'min_child_weight': 6, 'subsample': 0.9867926524778027, 'colsample_bytree': 0.6291354693830742}. Best is trial 3 with value: 0.15009069179673293.
[I 2024-04-01 10:40:42,306] Trial 4 finished with value: 0.16119144772345534 and parameters: {'n_estimators': 219, 'learning_rate': 0.40094160961541436, 'gamma': 0.7685508382056957, 'reg_alpha': 0.3408746976668166, 'reg_lambda': 0.0010987268112821163, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.9252075169618438, 'colsample_bytree': 0.6804811966213334}. Best is trial 3 with value: 0.15009069179673293.
[I 2024-04-01 10:41:14,574] Trial 0 finished with value: 0.14932548146264413 and paramet

In [9]:
# Visualise how the objective value evolved across the study's trials.
optuna.visualization.plot_optimization_history(study)

In [10]:
# Per-hyperparameter slice plots: sampled value against the resulting objective value.
optuna.visualization.plot_slice(study)

# Training the Final Model

In [11]:
# Hyperparameters of the trial with the lowest objective value.
best_params = study.best_params

# Refit on the complete training set with those settings and the shared seed.
model = XGBRegressor(random_state=seed, **best_params)
model.fit(X, y)

# Predictions for the held-out test features.
preds = model.predict(X_test)

In [12]:
# The file name embeds the best objective value so submissions from different runs can be told apart.
submission_path = f'XGBRegressor_{study.best_value:.6f}.csv'

# Pair each test id with its predicted ring count and write it out without the index.
submission = pd.DataFrame({'id': test_ids, 'Rings': preds})
submission.to_csv(submission_path, index=False)
submission.head()

Unnamed: 0,id,Rings
0,90615,9.996573
1,90616,9.742382
2,90617,10.176211
3,90618,10.310677
4,90619,7.646965
