In [None]:
from functools import partial

import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

from matplotlib import pyplot as plt

import optuna

In [None]:
# Read the competition training table and preview its first rows.
TRAIN_CSV_PATH = "../input/petfinder-pawpularity-score/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df.head()

## Stratified Split

In [None]:
# Reference: https://www.kaggle.com/tolgadincer/continuous-target-stratification
#
# StratifiedKFold needs discrete class labels, so the continuous Pawpularity
# score is cut into 10 bins that are used only as stratification labels.

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS)

# The bin labels live in their own Series: no temporary column is added to
# train_df, and the name `target` is left alone because a later cell uses it
# for the list of label columns.
pawpularity_bins = pd.cut(train_df["Pawpularity"], bins=10, labels=False)

train_df["kfold"] = -1
kfold_col = train_df.columns.get_loc("kfold")
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, pawpularity_bins)):
    # split() yields positional row indices, so assign by position (iloc)
    # instead of relying on the frame having a default 0..n-1 label index.
    train_df.iloc[val_idx, kfold_col] = fold

# Every row must have landed in exactly one validation fold.
assert (train_df["kfold"] >= 0).all(), "some rows were not assigned to a fold"
train_df.head()

## Verifying Stratified Split

### Target distribution

In [None]:
# Normalised histogram of the raw target over the whole training table.
fig, ax = plt.subplots()
ax.hist(train_df["Pawpularity"], bins=100, density=True)
ax.set_xlabel('Target')
ax.set_ylabel('Frequency')
plt.show()

### Distribution of each split

In [None]:
# One histogram panel per fold; shared axes make the per-fold target
# distributions directly comparable. The panel count follows N_FOLDS so the
# figure stays correct if the number of folds is changed.
fig, axs = plt.subplots(
    1, N_FOLDS, sharex=True, sharey=True, figsize=(3 * N_FOLDS, 4), squeeze=False
)
axs = axs[0]  # squeeze=False always returns a 2-D grid, even for a single panel
for i, ax in enumerate(axs):
    fold_target = train_df.loc[train_df["kfold"] == i, "Pawpularity"]
    ax.hist(fold_target, bins=100, density=True, label=f"Fold-{i}")
    # handlelength=0 hides the legend patch so only the fold name is shown
    ax.legend(frameon=False, handlelength=0)
axs[0].set_ylabel("Frequency")
axs[N_FOLDS // 2].set_xlabel("Target")  # label only the middle panel
plt.tight_layout()
plt.show()

## Regression using LightGBM

In [None]:
# Input columns fed to the model, one per line for easy editing.
features = [
    "Subject Focus",
    "Eyes",
    "Face",
    "Near",
    "Action",
    "Accessory",
    "Group",
    "Collage",
    "Human",
    "Occlusion",
    "Info",
    "Blur",
]

# Label column, kept as a one-element list so that `df[target]` selects a
# single-column DataFrame.
target = ["Pawpularity"]

In [None]:
def oof_score(models, train_df):
    """Return the out-of-fold RMSE of per-fold models.

    ``models[i]`` is scored on the rows of ``train_df`` whose ``kfold`` value
    is ``i``; the predictions and labels of all folds are pooled before the
    RMSE is computed.

    Parameters
    ----------
    models : sequence
        One fitted model per fold, ordered by fold index. Each must expose
        ``predict(X)``.
    train_df : pandas.DataFrame
        Frame holding a ``kfold`` column plus the feature and label columns.

    Returns
    -------
    float
        Square root of the mean squared error over the pooled predictions.

    Notes
    -----
    The feature and label column names are read from the module-level
    ``features`` and ``target`` variables, not from arguments.
    """
    fold_preds, fold_targets = [], []
    for fold, model in enumerate(models):
        # Select the fold's rows once and reuse them for inputs and labels.
        fold_rows = train_df[train_df["kfold"] == fold]
        fold_preds.append(np.asarray(model.predict(fold_rows[features])).ravel())
        fold_targets.append(fold_rows[target].to_numpy().ravel())

    # Concatenate plain 1-D arrays. Wrapping the per-fold DataFrames in an
    # object array gives a different shape/dtype depending on whether all
    # folds happen to have the same number of rows.
    preds = np.concatenate(fold_preds)
    targets = np.concatenate(fold_targets)
    return np.sqrt(mean_squared_error(targets, preds))

In [None]:
def objective(trial, train_df, features, target):
    """Optuna objective: train one LightGBM model per fold and score them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    train_df : pandas.DataFrame
        Training frame with a ``kfold`` column assigning each row to a fold.
    features : list of str
        Feature column names used to fit the models.
    target : list of str
        Label column name(s) used to fit the models.

    Returns
    -------
    float
        Out-of-fold RMSE as computed by ``oof_score`` (lower is better).

    Notes
    -----
    ``oof_score`` reads the module-level ``features``/``target`` rather than
    the arguments given here, so both must name the same columns.
    """
    # Sampled first and passed to lgb.train as num_boost_round. When left
    # inside `params`, lgb.train treats `n_estimators` as an alias that
    # overrides num_boost_round, which made the explicit value misleading.
    num_boost_round = trial.suggest_int("n_estimators", 64, 8192)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.25, log=True),
        'num_leaves': trial.suggest_int("num_leaves", 4, 16),
        'max_depth': trial.suggest_int("max_depth", 4, 16),
        'feature_fraction': trial.suggest_float("feature_fraction", 0.1, 1.0),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform
        'lambda_l1': trial.suggest_float("lambda_l1", 1e-8, 100.0, log=True),
        'lambda_l2': trial.suggest_float("lambda_l2", 1e-8, 100.0, log=True),
        'seed': 42,
        'deterministic': True,
        'verbose': -1
    }

    models = []
    for fold in range(N_FOLDS):
        train = train_df[train_df["kfold"] != fold]
        val = train_df[train_df["kfold"] == fold]

        x_train, y_train = train[features], train[target]
        x_val, y_val = val[features], val[target]

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_val = lgb.Dataset(x_val, y_val)

        model = lgb.train(
            params,
            lgb_train,
            num_boost_round=num_boost_round,
            valid_sets=(lgb_train, lgb_val),
            # Callbacks replace the early_stopping_rounds / verbose_eval
            # keyword arguments, which newer LightGBM releases no longer accept.
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=0),  # period=0: no per-iteration logging
            ],
        )
        models.append(model)
    return oof_score(models, train_df)

In [None]:
# Bind the data arguments so Optuna only has to supply the trial.
opt_fun = partial(
    objective,
    train_df=train_df,
    features=features,
    target=target
)

# A seeded sampler makes the sequence of suggested hyper-parameters
# repeatable across "Restart & Run All" executions.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(opt_fun, n_trials=500)
print(study.best_params)

In [None]:
# Show the lowest objective value found and the hyper-parameters that produced it.
study.best_value, study.best_params

In [None]:
# Start from the tuned hyper-parameters and re-add the fixed settings that
# `objective` used but Optuna does not record in best_params. Including the
# seed / deterministic / verbose entries keeps the final models on the same
# configuration that produced study.best_value.
params = study.best_params.copy()
params.update({
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'seed': 42,
    'deterministic': True,
    'verbose': -1
})
params

In [None]:
# Train one final model per fold with the selected hyper-parameters.

# The tuned `n_estimators` is the number of boosting rounds. It is passed
# explicitly as num_boost_round (falling back to 5000 if absent) and removed
# from the params dict, where lgb.train would treat it as an overriding alias.
num_boost_round = params.get("n_estimators", 5000)
train_params = {key: value for key, value in params.items() if key != "n_estimators"}

models = []
for fold in range(N_FOLDS):
    train = train_df[train_df["kfold"] != fold]
    val = train_df[train_df["kfold"] == fold]

    x_train, y_train = train[features], train[target]
    x_val, y_val = val[features], val[target]

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_val = lgb.Dataset(x_val, y_val)

    model = lgb.train(
        train_params,
        lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=(lgb_train, lgb_val),
        # Callbacks replace the early_stopping_rounds / verbose_eval keyword
        # arguments, which newer LightGBM releases no longer accept.
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),  # report metrics every 100 rounds
        ],
    )
    models.append(model)

In [None]:
# Read the competition test table and preview its first rows.
TEST_CSV_PATH = "../input/petfinder-pawpularity-score/test.csv"
test_df = pd.read_csv(TEST_CSV_PATH)
test_df.head()

In [None]:
# Read the sample submission, which is later filled in with predictions.
SAMPLE_SUBMISSION_CSV_PATH = "../input/petfinder-pawpularity-score/sample_submission.csv"
sample_sub_df = pd.read_csv(SAMPLE_SUBMISSION_CSV_PATH)
sample_sub_df.head()

In [None]:
# Average the test predictions of the per-fold models. The divisor is the
# number of models actually averaged, so the mean stays correct even if
# `models` and N_FOLDS get out of sync between cell runs.
fold_preds = [model.predict(test_df[features]) for model in models]
preds = sum(fold_preds) / len(fold_preds)

# Write the averaged predictions into the sample submission and save it.
sample_sub_df["Pawpularity"] = preds
sample_sub_df.to_csv('submission.csv', index=False)