In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import Ridge
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer
import xgboost
import lightgbm
import catboost

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import optuna

In [41]:
COMPUTE_TEST_PRED = True

path_to_train = os.path.join('data','train.csv')
path_to_test = os.path.join('data','test.csv')

train_df = pd.read_csv(path_to_train)
test_df = pd.read_csv(path_to_test)

def clean_and_feature_eng(df):
    data = df.dropna()
    data = pd.get_dummies(data, columns=['Sex'], drop_first=True, dtype=float)
    data.fillna(data.mean(), inplace=True)
    return data

train_df = clean_and_feature_eng(train_df)
test_df = clean_and_feature_eng(test_df)

In [42]:
train_df.columns

Index(['id', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1',
       'Whole weight.2', 'Shell weight', 'Rings', 'Sex_I', 'Sex_M'],
      dtype='object')

In [43]:
oof, predictions_test = {}, {}

numeric_features = [
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Whole weight.1',
    'Whole weight.2',
    'Shell weight',
    'Sex_I',
    'Sex_M']

numeric_vars = numeric_features + ['Rings']

log_features = []
for col in numeric_features:
    train_df[f'log_{col}'] = np.log1p(train_df[col])
    test_df[f'log_{col}'] = np.log1p(test_df[col])
    log_features.append(f'log_{col}')


In [73]:
def cross_validate(model, label, features=numeric_features, n_repeats=1):
    scores = []
    oof_preds = np.full_like(train_df.Rings, np.nan, dtype=float)
    kFold = StratifiedKFold(n_splits=5, shuffle=True)

    for fold, (idx_train, idx_val) in enumerate(kFold.split(train_df, train_df.Rings)):
        X_train = train_df.iloc[idx_train][features]
        X_val = train_df.iloc[idx_val][features]
        y_train = train_df.iloc[idx_train].Rings
        y_val = train_df.iloc[idx_val].Rings

        y_pred = np.zeros_like(y_val, dtype=float)
        
        for i in range(n_repeats):
            m = clone(model)

            if n_repeats > 1:
                mm = m
                if isinstance(mm, TransformedTargetRegressor):
                    mm = mm.regressor
                mm.set_params(random_state=i)
            
            m.fit(X_train, y_train)
            y_pred += m.predict(X_val)

        y_pred /= n_repeats
        y_pred = y_pred.clip(1, 29)

        score = mean_squared_log_error(y_val, y_pred, squared=False)

        scores.append(score)
        oof_preds[idx_val] = y_pred
        print(f'fold #: {fold}, score: {score:.4f}')

    oof[label] = oof_preds
    # print(f'scores: {[f"{score:.4f}" for score in scores]}')
    print(f'overall RMSLE: {np.array(scores).mean():.4f}')

    if COMPUTE_TEST_PRED:
        y_pred = np.zeros(len(test_df), dtype=float)
        for i in range(n_repeats):
            m = clone(m)
            if n_repeats > 1:
                mm = m
                if isinstance(mm, TransformedTargetRegressor):
                    mm = mm.regressor
                mm.set_params(random_state=i)
            m.fit(train_df[features], train_df.Rings)
            y_pred += m.predict(test_df[features])
        y_pred /= n_repeats
        y_pred = y_pred.clip(1, 29)
        predictions_test[label] = y_pred

custom_column_transformer = ColumnTransformer(
    [('ohe', OneHotEncoder(), ['Sex'])],
    remainder='passthrough')

In [74]:
%%time
model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=3),
    TransformedTargetRegressor(
        Ridge(0.01),
        func=np.log1p,
        inverse_func=np.expm1
    )
)
cross_validate(model, 'Poly-Ridge', numeric_features+log_features)

fold #: 0, score: 0.1546
fold #: 1, score: 0.1527
fold #: 2, score: 0.1528
fold #: 3, score: 0.1534
fold #: 4, score: 0.1543
overall RMSLE: 0.1536
CPU times: user 1min 25s, sys: 6.72 s, total: 1min 32s
Wall time: 11.1 s


In [75]:
%%time
model = make_pipeline(
    StandardScaler(),
    Nystroem(n_components=500),
    TransformedTargetRegressor(
        Ridge(0.1),
        func=np.log1p,
        inverse_func=np.expm1
    )
)
cross_validate(model, 'Poly-Ridge', numeric_features+log_features)

fold #: 0, score: 0.1521
fold #: 1, score: 0.1520
fold #: 2, score: 0.1511
fold #: 3, score: 0.1523
fold #: 4, score: 0.1511
overall RMSLE: 0.1517
CPU times: user 1min 3s, sys: 10.6 s, total: 1min 13s
Wall time: 7.85 s


In [40]:
%%time
model = make_pipeline(
    StandardScaler(),
    TransformedTargetRegressor(
        KNeighborsRegressor(n_neighbors=50),
        func=np.log1p,
        inverse_func=np.expm1
    )
)
cross_validate(model, 'KNN', log_features)

fold #: 0; RMSLE: 0.15543
fold #: 1; RMSLE: 0.15408
fold #: 2; RMSLE: 0.15479
fold #: 3; RMSLE: 0.15406
fold #: 4; RMSLE: 0.15503
overall RMSLE: 0.15468
CPU times: user 4.39 s, sys: 27.3 ms, total: 4.41 s
Wall time: 4.42 s


In [47]:
%%time
model = make_pipeline(
    TransformedTargetRegressor(
        RandomForestRegressor(n_estimators=200, min_samples_leaf=7),
        func=np.log1p,
        inverse_func=np.expm1
    )
)
cross_validate(model, 'RandomForest', log_features)

fold #: 0; RMSLE: 0.15058
fold #: 1; RMSLE: 0.14954
fold #: 2; RMSLE: 0.14891
fold #: 3; RMSLE: 0.15189
fold #: 4; RMSLE: 0.15045
overall RMSLE: 0.15027
CPU times: user 2min 54s, sys: 370 ms, total: 2min 55s
Wall time: 2min 56s


In [48]:
%%time
model = make_pipeline(
    TransformedTargetRegressor(
        GradientBoostingRegressor(n_estimators=200, min_samples_leaf=7),
        func = np.log1p,
        inverse_func = np.expm1
    )
)
cross_validate(model, 'GradientBoosted', log_features)

fold #: 0; RMSLE: 0.15223
fold #: 1; RMSLE: 0.15134
fold #: 2; RMSLE: 0.14967
fold #: 3; RMSLE: 0.15079
fold #: 4; RMSLE: 0.15071
overall RMSLE: 0.15095
CPU times: user 57.8 s, sys: 277 ms, total: 58 s
Wall time: 58.8 s


In [49]:
%%time
model = make_pipeline(
    TransformedTargetRegressor(ExtraTreesRegressor(n_estimators=200, min_samples_leaf=7),
    func = np.log1p,
    inverse_func = np.expm1
    )
)
cross_validate(model, 'ExtraTrees', log_features)

fold #: 0; RMSLE: 0.14996
fold #: 1; RMSLE: 0.14904
fold #: 2; RMSLE: 0.15131
fold #: 3; RMSLE: 0.15129
fold #: 4; RMSLE: 0.15069
overall RMSLE: 0.15046
CPU times: user 58.6 s, sys: 210 ms, total: 58.8 s
Wall time: 59.1 s


In [52]:
%%time
xgb_params = {'grow_policy': 'depthwise',
             'n_estimators': 300,
             'learning_rate': 0.12696,
             'res_lambda': 41.46839,
             'min_child_weight': 3.76337,
             'colsample_bytree': 0.73179,
             'objective': 'reg:squarederror',
             'tree_method': 'hist',
             'gamma': 0,
             'enable_categorical': True}

model = TransformedTargetRegressor(xgboost.XGBRegressor(**xgb_params), func = np.log1p, inverse_func = np.expm1)
cross_validate(model, 'XGBoost', log_features, n_repeats=5)

fold #: 0; RMSLE: 0.14929
fold #: 1; RMSLE: 0.14718
fold #: 2; RMSLE: 0.14748
fold #: 3; RMSLE: 0.14956
fold #: 4; RMSLE: 0.14939
overall RMSLE: 0.14858
CPU times: user 1min 11s, sys: 22.6 s, total: 1min 33s
Wall time: 9.44 s


In [54]:
%%time
lgbm_params = {'n_estimators': 1000,
               'learning_rate': 0.03049275229875912,
               'colsample_bytree': 0.6702510959773988,
               'reg_lambda': 0.18167530315433994,
               'min_child_samples': 102,
               'num_leaves': 46,
               'verbose': -1} # 0.14803

model = TransformedTargetRegressor(
    lightgbm.LGBMRegressor(**lgbm_params),
    func=np.log1p,
    inverse_func=np.expm1)
cross_validate(model, 'lightGBM', numeric_features, n_repeats=5)

fold #: 0; RMSLE: 0.14931
fold #: 1; RMSLE: 0.14781
fold #: 2; RMSLE: 0.14842
fold #: 3; RMSLE: 0.14753
fold #: 4; RMSLE: 0.14755
overall RMSLE: 0.14813
CPU times: user 7min 31s, sys: 1min 36s, total: 9min 8s
Wall time: 54.4 s


In [76]:
%%time
lgbm_params = {'n_estimators': 1000,
               'learning_rate': 0.038622511348472645,
               'max_bin': 2048,
               'colsample_bytree': 0.5757189042456357,
               'reg_lambda': 0.09664116733307193,
               'min_child_samples': 87,
               'num_leaves': 43,
               'verbose': -1}

model = TransformedTargetRegressor(
    lightgbm.LGBMRegressor(**lgbm_params),
    func=np.sqrt,
    inverse_func=np.square)
cross_validate(model, 'LightGBM-sqrt', numeric_features, n_repeats=5)

fold #: 0, score: 0.1482
fold #: 1, score: 0.1483
fold #: 2, score: 0.1477
fold #: 3, score: 0.1479
fold #: 4, score: 0.1466
overall RMSLE: 0.1477
CPU times: user 8min 26s, sys: 2min 18s, total: 10min 45s
Wall time: 1min 7s


In [79]:
cb_params = {'grow_policy': 'SymmetricTree',
             'n_estimators': 1000,
             'learning_rate': 0.12909209153923812,
             'l2_leaf_reg': 24.368207730414547,
             'max_depth': 8,
             'colsample_bylevel': 0.5018730598197876,
             'boost_from_average': True,
             'loss_function': 'RMSE',
             'verbose': 0} # 0.14879

model = TransformedTargetRegressor(catboost.CatBoostRegressor(**cb_params),
                                                 func=np.log1p,
                                                 inverse_func=np.expm1)
cross_validate(model, 'Catboost', log_features, n_repeats=5)

fold #: 0, score: 0.1486
fold #: 1, score: 0.1483
fold #: 2, score: 0.1479
fold #: 3, score: 0.1488
fold #: 4, score: 0.1489
overall RMSLE: 0.1485


In [80]:
cb_params = {'grow_policy': 'Lossguide',
             'n_estimators': 1000,
             'learning_rate': 0.12444528932682379,
             'max_bin': 2048,
             'l2_leaf_reg': 41.57232155127747,
             'min_child_samples': 75,
             'colsample_bylevel': 0.9931075066636142,
             'subsample': 0.9885992818939339,
             'random_strength': 0.09223106939759793,
             'boost_from_average': True,
             'loss_function': 'RMSE',
             'bootstrap_type': 'Bernoulli',
             'verbose': False} # 0.14815
model = TransformedTargetRegressor(catboost.CatBoostRegressor(**cb_params),
                                                 func=np.log1p,
                                                 inverse_func=np.expm1)
cross_validate(model, 'Catboost-LG', log_features, n_repeats=5)
# Overall: 0.14800 Catboost-LG   14 min with max_bin=254 (default)
# Overall: 0.14762 Catboost-LG   17 min with max_bin=2048

fold #: 0, score: 0.1481
fold #: 1, score: 0.1464
fold #: 2, score: 0.1477
fold #: 3, score: 0.1490
fold #: 4, score: 0.1476
overall RMSLE: 0.1478


In [None]:
%%time
hgb_params = {'max_iter': 300,
              'max_leaf_nodes': 43,
              'early_stopping': False,
              'learning_rate': 0.08019987638525192,
              'min_samples_leaf': 37} # 0.14916

model = make_pipeline(custom_column_transformer,
                      TransformedTargetRegressor(
                          HistGradientBoostingRegressor(**hgb_params),
                          func=np.log1p,
                          inverse_func=np.expm1))

cross_validate(model, 'HGB', numeric_features + ['Sex'])