In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [3]:
print('Loading data...')
# load or create your dataset: both splits are read as tab-separated files without a header row
df_train, df_test = (
    pd.read_csv(path, header=None, sep='\t')
    for path in ('data/regression.train', 'data/regression.test')
)

Loading data...


In [4]:
# Column 0 is used as the regression target; all remaining columns are the features.
y_train, y_test = df_train[0], df_test[0]
X_train, X_test = (
    frame.drop(labels=0, axis='columns')
    for frame in (df_train, df_test)
)

In [5]:
print('Starting training...')
# train
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
# Early stopping is requested through the callback API: LightGBM 4.0 removed the
# `early_stopping_rounds` keyword from fit(), so passing it raises a TypeError there.
# Training stops once the validation scores have not improved for 5 rounds.
# (On releases whose fit() no longer prints per-iteration scores by default, append
# lgb.log_evaluation() to the callbacks list to get the log back.)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training...
[1]	valid_0's l2: 0.242763	valid_0's l1: 0.491735
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.237895	valid_0's l1: 0.486563
[3]	valid_0's l2: 0.233277	valid_0's l1: 0.481489
[4]	valid_0's l2: 0.22925	valid_0's l1: 0.476848
[5]	valid_0's l2: 0.226155	valid_0's l1: 0.47305
[6]	valid_0's l2: 0.222963	valid_0's l1: 0.469049
[7]	valid_0's l2: 0.220364	valid_0's l1: 0.465556
[8]	valid_0's l2: 0.217872	valid_0's l1: 0.462208
[9]	valid_0's l2: 0.215328	valid_0's l1: 0.458676
[10]	valid_0's l2: 0.212743	valid_0's l1: 0.454998
[11]	valid_0's l2: 0.210805	valid_0's l1: 0.452047
[12]	valid_0's l2: 0.208945	valid_0's l1: 0.449158
[13]	valid_0's l2: 0.206986	valid_0's l1: 0.44608
[14]	valid_0's l2: 0.205513	valid_0's l1: 0.443554
[15]	valid_0's l2: 0.203728	valid_0's l1: 0.440643
[16]	valid_0's l2: 0.201865	valid_0's l1: 0.437687
[17]	valid_0's l2: 0.200639	valid_0's l1: 0.435454
[18]	valid_0's l2: 0.199522	valid_0's l1: 0.433288
[19]	valid_0

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.05, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=20, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [6]:
print('Starting predicting...')
# predict, limiting the model to the best iteration recorded on the estimator
best_iter = gbm.best_iteration_
y_pred = gbm.predict(X_test, num_iteration=best_iter)
# eval: RMSE is the square root of the mean squared error
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)

Starting predicting...
The rmse of prediction is: 0.4441153344254208


In [7]:
# feature importances
# .tolist() converts the NumPy integers to plain Python ints, so the printed list stays
# free of NumPy scalar wrappers (e.g. `np.int32(23)`) on NumPy >= 2.
print('Feature importances:', gbm.feature_importances_.tolist())

Feature importances: [23, 7, 0, 33, 5, 56, 9, 1, 1, 21, 2, 5, 1, 19, 9, 6, 1, 10, 4, 10, 0, 31, 61, 4, 48, 102, 52, 79]


In [8]:
# self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    """Compute the Root Mean Squared Logarithmic Error as a custom eval metric.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        ``('RMSLE', value, False)``: the metric name, the square root of the mean
        squared difference between ``log1p(y_pred)`` and ``log1p(y_true)``, and
        ``False`` because a lower value is better.
    """
    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    score = np.sqrt(np.mean(np.power(log_gap, 2)))
    return 'RMSLE', score, False

In [9]:
print('Starting training with custom eval function...')
# train
# Early stopping goes through the callback API: LightGBM 4.0 removed the
# `early_stopping_rounds` keyword from fit(), so passing it raises a TypeError there.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training with custom eval function...
[1]	valid_0's l2: 0.242763	valid_0's RMSLE: 0.344957
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.237895	valid_0's RMSLE: 0.341693
[3]	valid_0's l2: 0.233277	valid_0's RMSLE: 0.338462
[4]	valid_0's l2: 0.22925	valid_0's RMSLE: 0.335656
[5]	valid_0's l2: 0.226155	valid_0's RMSLE: 0.333431
[6]	valid_0's l2: 0.222963	valid_0's RMSLE: 0.331104
[7]	valid_0's l2: 0.220364	valid_0's RMSLE: 0.329193
[8]	valid_0's l2: 0.217872	valid_0's RMSLE: 0.327337
[9]	valid_0's l2: 0.215328	valid_0's RMSLE: 0.325433
[10]	valid_0's l2: 0.212743	valid_0's RMSLE: 0.323523
[11]	valid_0's l2: 0.210805	valid_0's RMSLE: 0.321986
[12]	valid_0's l2: 0.208945	valid_0's RMSLE: 0.320523
[13]	valid_0's l2: 0.206986	valid_0's RMSLE: 0.319027
[14]	valid_0's l2: 0.205513	valid_0's RMSLE: 0.317796
[15]	valid_0's l2: 0.203728	valid_0's RMSLE: 0.316383
[16]	valid_0's l2: 0.201865	valid_0's RMSLE: 0.314827
[17]	valid_0's l2: 0.200639	valid_0's 

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.05, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=20, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [10]:
# another self-defined eval metric
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
# Relative Absolute Error (RAE)
def rae(y_true, y_pred):
    """Compute the Relative Absolute Error as a custom eval metric.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Returns
    -------
    tuple
        ``('RAE', value, False)``: the metric name, the total absolute error of the
        predictions divided by the total absolute deviation of ``y_true`` from its
        own mean, and ``False`` because a lower value is better.

    Notes
    -----
    The denominator is zero when every value in ``y_true`` is identical, in which
    case the division yields ``nan`` or ``inf``.
    """
    # Coerce to arrays so plain Python sequences work too; list - list is not defined.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    abs_error = np.sum(np.abs(y_pred - y_true))
    baseline_error = np.sum(np.abs(np.mean(y_true) - y_true))
    return 'RAE', abs_error / baseline_error, False

In [11]:
print('Starting training with multiple custom eval functions...')
# train
# The lambda returns one (name, value, is_higher_better) tuple per metric so both are
# evaluated each round. Early stopping goes through the callback API: LightGBM 4.0
# removed the `early_stopping_rounds` keyword from fit(), so passing it raises a
# TypeError there.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=lambda y_true, y_pred: [rmsle(y_true, y_pred), rae(y_true, y_pred)],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training with multiple custom eval functions...
[1]	valid_0's l2: 0.242763	valid_0's RMSLE: 0.344957	valid_0's RAE: 0.991146
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.237895	valid_0's RMSLE: 0.341693	valid_0's RAE: 0.98072
[3]	valid_0's l2: 0.233277	valid_0's RMSLE: 0.338462	valid_0's RAE: 0.970493
[4]	valid_0's l2: 0.22925	valid_0's RMSLE: 0.335656	valid_0's RAE: 0.961139
[5]	valid_0's l2: 0.226155	valid_0's RMSLE: 0.333431	valid_0's RAE: 0.953484
[6]	valid_0's l2: 0.222963	valid_0's RMSLE: 0.331104	valid_0's RAE: 0.945419
[7]	valid_0's l2: 0.220364	valid_0's RMSLE: 0.329193	valid_0's RAE: 0.938379
[8]	valid_0's l2: 0.217872	valid_0's RMSLE: 0.327337	valid_0's RAE: 0.931631
[9]	valid_0's l2: 0.215328	valid_0's RMSLE: 0.325433	valid_0's RAE: 0.92451
[10]	valid_0's l2: 0.212743	valid_0's RMSLE: 0.323523	valid_0's RAE: 0.917099
[11]	valid_0's l2: 0.210805	valid_0's RMSLE: 0.321986	valid_0's RAE: 0.911151
[12]	valid_0's l2: 0.208945	valid_0'

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.05, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=20, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [12]:
print('Starting predicting...')
# predict, limiting the model to the best iteration recorded on the estimator
best_iter = gbm.best_iteration_
y_pred = gbm.predict(X_test, num_iteration=best_iter)
# eval: each metric returns (name, value, is_higher_better); only the value is reported
rmsle_value = rmsle(y_test, y_pred)[1]
print('The rmsle of prediction is:', rmsle_value)
rae_value = rae(y_test, y_pred)[1]
print('The rae of prediction is:', rae_value)

Starting predicting...
The rmsle of prediction is: 0.3110323289863278
The rae of prediction is: 0.8645881044669875


In [13]:
# other scikit-learn modules
# Base regressor handed to the grid search below; only num_leaves is fixed here,
# the searched hyper-parameters come from the parameter grid.
estimator = lgb.LGBMRegressor(num_leaves=31)

In [19]:
# Candidate values for the grid search: every learning_rate is combined with every
# n_estimators value.
param_grid = dict(
    learning_rate=[0.01, 0.1, 0.001, 1],
    n_estimators=[20, 40, 60],
)

In [20]:
# 3-fold cross-validated search over every combination in param_grid; fit() returns
# the fitted search object, which is bound to gbm.
gbm = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3).fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 40}
