In [None]:
import numpy as np
import pandas as pd

# Single integer seed shared by the stochastic steps of this notebook.
rng = 1337

# Load the training table from the competition input directory.
train_df = pd.read_csv(
    '../input/petfinder-pawpularity-score/train.csv'
)

In [None]:
# Preview the first five rows of the training table.
train_df.head(n=5)

In [None]:
# Per-column value counts for every column between the first and the last one
# (the slice 1:-1 skips both ends of the frame).
# `pd.Series.value_counts` is used instead of the top-level `pd.value_counts`,
# which is deprecated in recent pandas releases; the resulting table is the same.
train_df.iloc[:, 1:-1].apply(pd.Series.value_counts)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

# Distribution of the target: density histogram + KDE, with a fitted normal
# curve overlaid for comparison.
# `sns.distplot` is deprecated, so the same three layers are drawn explicitly
# with `histplot` and a manual `scipy.stats.norm` fit.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(train_df['Pawpularity'], stat='density', kde=True, ax=ax)

# Maximum-likelihood normal fit, evaluated over the visible x-range.
mu, sigma = norm.fit(train_df['Pawpularity'])
x_min, x_max = ax.get_xlim()
x_grid = np.linspace(x_min, x_max, 200)
ax.plot(x_grid, norm.pdf(x_grid, mu, sigma), color='black',
        label=f'normal fit (mu={mu:.2f}, sigma={sigma:.2f})')

ax.set(title='Pawpularity distribution', xlabel='Pawpularity', ylabel='Density')
ax.legend()
plt.show()

In [None]:
# Feature matrix: every column except the identifier and the target.
X = train_df.drop(columns=['Id', 'Pawpularity'])
# Regression target.
y = train_df['Pawpularity']

In [None]:
import lightgbm as lgbm
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

# Number of cross-validation folds used to score each trial.
N_SPLITS = 4

def objective(trial, X, y):
    """Optuna objective: mean cross-validated RMSE of a LightGBM regressor.

    Samples one hyper-parameter configuration from `trial`, trains an
    `LGBMRegressor` with early stopping on each of `N_SPLITS` folds and
    returns the mean validation RMSE over the folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to store user attributes.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : pandas.Series
        Target values aligned row-by-row with `X`.

    Returns
    -------
    float
        Mean RMSE over the validation folds (lower is better).

    Side effects
    ------------
    Stores the per-fold best iteration counts on the trial under the
    `'best_iteration'` user attribute (as a plain list of ints).
    """
    param_grid = {
        # Upper bound only; early stopping decides the effective tree count.
        "n_estimators": 10000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 10, 300, step=5),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20, step=1),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 110, step=2),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 110, step=2),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_freq": 1,
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.05
        ),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.05
        ),
        'random_state': rng,
        'verbose': -1
    }

    # NOTE(review): StratifiedKFold treats every distinct target value as a
    # class; confirm that stratifying directly on the raw target is intended.
    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=rng)

    cv_scores = []
    best_iterations = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        # The splitter yields positional indices, so use .iloc for the target
        # as well; plain y[...] is label-based and only coincides with
        # positions on a default RangeIndex.
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgbm.LGBMRegressor(objective='rmse', **param_grid)
        # Early stopping is requested through the callback API; the fit-level
        # `early_stopping_rounds=` / `verbose=` keywords are no longer accepted
        # by current LightGBM releases.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='rmse',
            callbacks=[lgbm.early_stopping(stopping_rounds=150, verbose=False)],
        )
        preds = model.predict(X_val)
        # RMSE computed as sqrt(MSE) so it does not depend on the `squared=`
        # keyword, which newer scikit-learn versions dropped.
        cv_scores.append(float(np.sqrt(mean_squared_error(y_val, preds))))
        best_iterations.append(int(model.best_iteration_))

    # A plain list of ints stays JSON-serialisable, which persistent Optuna
    # storages require for user attributes.
    trial.set_user_attr('best_iteration', best_iterations)
    return float(np.mean(cv_scores))

# Seed the sampler with the notebook-wide seed so the sequence of suggested
# hyper-parameters is repeatable across runs (TPE is also Optuna's default
# sampler, so the search algorithm itself is unchanged).
sampler = optuna.samplers.TPESampler(seed=rng)
# The tuned model is a regressor, so the study is labelled accordingly.
study = optuna.create_study(
    direction='minimize',
    study_name="LGBM Regressor",
    sampler=sampler,
)


def func(trial):
    """Bind the training data to `objective` so Optuna can call it with a trial only."""
    return objective(trial, X, y)


study.optimize(func, n_trials=200, show_progress_bar=True)

In [None]:
# Summarise the winning trial: score, sampled parameters and per-fold tree counts.
best_trial = study.best_trial

print(f"\tBest value (rmse): {study.best_value:.5f}")
print("\tBest params:")

for param_name in study.best_params:
    print(f"\t\t{param_name}: {study.best_params[param_name]}")

print(f"\t\tn_estimators_used: {best_trial.user_attrs['best_iteration']}")

In [None]:
# Objective value of every trial in execution order.
# Trials without a recorded value (e.g. failed ones) are skipped so that a
# single bad trial cannot break the plot.
scored_trials = [t for t in study.trials if t.value is not None]

fig, ax = plt.subplots(figsize=(16, 8))
ax.plot([t.number for t in scored_trials], [t.value for t in scored_trials])
ax.set(title='Optuna optimisation history', xlabel='Trial', ylabel='CV RMSE')
plt.show()

## Test submission

In [None]:
# Load the test table and drop the identifier column, mirroring how the
# training features were built.
test_df = pd.read_csv(
    '../input/petfinder-pawpularity-score/test.csv'
)
X_test = test_df.drop(columns=['Id'])

In [None]:
# Refit on the full training set with the configuration that won the search.
# `study.best_params` only contains the *sampled* values, so the settings that
# were fixed inside the objective have to be restored here; without
# `bagging_freq` the tuned `bagging_fraction` would not be applied.
best_iterations = study.best_trial.user_attrs['best_iteration']
final_params = {
    **study.best_params,
    'bagging_freq': 1,
    'random_state': rng,
    'verbose': -1,
    # No validation set is available for early stopping on the full data, so
    # use the average number of trees the best trial needed across its folds.
    'n_estimators': max(1, int(round(float(np.mean(best_iterations))))),
}

model = lgbm.LGBMRegressor(objective='rmse', **final_params)
model.fit(X, y)
preds = model.predict(X_test)

In [None]:
# Start from the provided sample submission and overwrite its target column
# with the model predictions.
sample_submission = pd.read_csv(
    '../input/petfinder-pawpularity-score/sample_submission.csv'
)
submission = sample_submission.assign(Pawpularity=preds)
submission

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)