In [1]:
import optuna
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import KFold 
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error


warnings.filterwarnings('ignore')
seed = 27

# Loading Data

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

test_ids = test['id']

In [3]:
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


# Preprocessing

In [4]:
train['Sex'] = train['Sex'].map({'M': 0, 'F': 1, 'I': 2})
test['Sex'] = test['Sex'].map({'M': 0, 'F': 1, 'I': 2})

scaler = StandardScaler()
features_to_scale = ['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']
train[features_to_scale] = scaler.fit_transform(train[features_to_scale])
test[features_to_scale] = scaler.transform(test[features_to_scale])

In [5]:
X = train.drop(['id', 'Rings'], axis=1)
y = train['Rings']

X_test = test.drop('id', axis=1)

# Tuning with Optuna

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        "min_child_weight": trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, log=True)
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    scores = []
    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

        model = XGBRegressor(**params, random_state=27)
        model.fit(X_train_fold, y_train_fold)

        y_pred = model.predict(X_valid_fold)
        score = np.sqrt(mean_squared_log_error(y_valid_fold, y_pred))

        scores.append(score)

    return np.mean(scores)

In [7]:
study = optuna.create_study(direction='minimize', study_name="XGBRegressor")
study.optimize(objective, n_trials=100, n_jobs=-1)

[I 2024-04-01 03:18:30,902] A new study created in memory with name: XGBRegressor
[I 2024-04-01 03:18:36,731] Trial 2 finished with value: 0.15330425088947203 and parameters: {'n_estimators': 65, 'learning_rate': 0.09288673015852558, 'gamma': 0.010412858287555199, 'reg_alpha': 0.13462578187146843, 'reg_lambda': 0.17542471703145876, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.6749939888660631, 'colsample_bytree': 0.5493508026250103}. Best is trial 2 with value: 0.15330425088947203.
[I 2024-04-01 03:18:40,473] Trial 3 finished with value: 0.15211673010666424 and parameters: {'n_estimators': 83, 'learning_rate': 0.23848265685967324, 'gamma': 0.09729032601223381, 'reg_alpha': 0.02443331986625056, 'reg_lambda': 0.6308237295884384, 'max_depth': 7, 'min_child_weight': 1, 'subsample': 0.5671303217699711, 'colsample_bytree': 0.7963217917284253}. Best is trial 3 with value: 0.15211673010666424.
[I 2024-04-01 03:18:47,602] Trial 0 finished with value: 0.1500541804066923 and parameters: 

In [8]:
optuna.visualization.plot_optimization_history(study)

In [9]:
optuna.visualization.plot_slice(study)

# Training the Final Model

In [10]:
best_params = study.best_params

model = XGBRegressor(**best_params, random_state=27)
model.fit(X, y)

preds = model.predict(X_test)

In [11]:
submission = pd.DataFrame({'id': test_ids, 'Rings': preds})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,Rings
0,90615,9.918823
1,90616,9.740312
2,90617,10.137782
3,90618,10.456995
4,90619,7.643459
