In [1]:
from fastai.tabular.all import *

from sklearn.model_selection import KFold
import lightgbm as lgb

In [2]:
import pickle

In [3]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of `y_pred` measured against `y_true`.

    Every residual is divided by its true value before being squared and
    averaged, so zeros in `y_true` are not guarded against.
    """
    pct_error = (y_true - y_pred) / y_true
    mean_sq_pct_error = np.mean(np.square(pct_error))
    return np.sqrt(mean_sq_pct_error)
def feval_rmspe(y_pred, lgb_train):
    """Custom evaluation function handed to `lgb.train` via `feval`.

    Pulls the labels out of `lgb_train` with `get_label()` and scores `y_pred`
    against them with `rmspe`. Returns the triple LightGBM expects from a custom
    metric: (metric name, value, is_higher_better); False = lower is better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

In [4]:
# Load the training table (features plus target) from CSV. Later cells read its
# 'row_id', 'time_id', 'stock_id' and 'target' columns.
train_df = pd.read_csv('train_with_features.csv')

In [5]:
def train_models(train, n_splits = 10):
    """Train one LightGBM regressor per K-fold split and print the out-of-fold RMSPE.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the 'row_id', 'time_id', 'target' and 'stock_id' columns.
        Every column other than 'row_id', 'time_id' and 'target' is used as a
        feature; 'stock_id' is cast to int and treated as categorical.
    n_splits : int, default 10
        Number of shuffled K-fold splits, and therefore of models returned.

    Returns
    -------
    list
        The boosters returned by ``lgb.train``, one per fold, in fold order.
    """
    # Hyperparameters (optimized)
    seed = 29
    params = {
        'learning_rate': 0.1,
        'lambda_l1': 2,
        'lambda_l2': 7,
        'num_leaves': 800,
        'min_sum_hessian_in_leaf': 20,
        'feature_fraction': 0.8,
        'feature_fraction_bynode': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 42,
        'min_data_in_leaf': 700,
        'max_depth': 4,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }

    # `early_stopping_rounds=` / `verbose_eval=` were removed from lgb.train in
    # LightGBM 4.0, so the equivalent callbacks are used instead.
    # `log_evaluation` was named `print_evaluation` before LightGBM 3.3.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    models = []
    # Out-of-fold predictions: each row is filled by the model that did not train on it
    oof_predictions = np.zeros(x.shape[0])
    # Create a KFold object
    kfold = KFold(n_splits = n_splits, random_state = 1111, shuffle = True)
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Weighting each squared error by 1 / y^2 turns the 'rmse' objective
        # into a squared *percentage* error, matching the RMSPE metric.
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        # Fresh callbacks per fold so no early-stopping state carries over.
        fold_callbacks = [
            lgb.early_stopping(stopping_rounds = 25),
            log_evaluation(100),
        ]
        model = lgb.train(params = params,
                          train_set = train_dataset,
                          valid_sets = [train_dataset, val_dataset],
                          num_boost_round = 3000,
                          feval = feval_rmspe,
                          callbacks = fold_callbacks)
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)

    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return the per-fold models (the out-of-fold score is only printed)
    return models

In [9]:
# Keep the per-fold boosters in `models` so later cells can save or reuse them
# instead of only displaying their repr.
models = train_models(train_df)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473181	training's RMSPE: 0.219006	valid_1's rmse: 0.000486846	valid_1's RMSPE: 0.225212
[200]	training's rmse: 0.000457561	training's RMSPE: 0.211776	valid_1's rmse: 0.000475783	valid_1's RMSPE: 0.220095
[300]	training's rmse: 0.000446237	training's RMSPE: 0.206535	valid_1's rmse: 0.000467194	valid_1's RMSPE: 0.216122
[400]	training's rmse: 0.000437508	training's RMSPE: 0.202495	valid_1's rmse: 0.000460669	valid_1's RMSPE: 0.213103
[500]	training's rmse: 0.00043029	training's RMSPE: 0.199154	valid_1's rmse: 0.000456661	valid_1's RMSPE: 0.211249
[600]	training's rmse: 0.000423433	training's RMSPE: 0.19598	valid_1's rmse: 0.0004523	valid_1's RMSPE: 0.209232
[700]	training's rmse: 0.000417333	training's RMSPE: 0.193157	valid_1's rmse: 0.000447933	valid_1's RMSPE: 0.207212
[800]	training's rmse: 0.00041227	training's RMSPE: 0.190814	valid_1's rmse: 0.000445541	valid_1's RMSPE: 0.206105
[900]	training's



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000471707	training's RMSPE: 0.218371	valid_1's rmse: 0.00047796	valid_1's RMSPE: 0.220672
[200]	training's rmse: 0.000455531	training's RMSPE: 0.210882	valid_1's rmse: 0.000464718	valid_1's RMSPE: 0.214558
[300]	training's rmse: 0.000444508	training's RMSPE: 0.205779	valid_1's rmse: 0.00045592	valid_1's RMSPE: 0.210496
[400]	training's rmse: 0.00043594	training's RMSPE: 0.201813	valid_1's rmse: 0.000449861	valid_1's RMSPE: 0.207699
[500]	training's rmse: 0.000428713	training's RMSPE: 0.198467	valid_1's rmse: 0.000445157	valid_1's RMSPE: 0.205527
[600]	training's rmse: 0.000422261	training's RMSPE: 0.19548	valid_1's rmse: 0.000440585	valid_1's RMSPE: 0.203416
[700]	training's rmse: 0.000416667	training's RMSPE: 0.192891	valid_1's rmse: 0.00043728	valid_1's RMSPE: 0.20189
[800]	training's rmse: 0.000411275	training's RMSPE: 0.190394	valid_1's rmse: 0.000433713	valid_1's RMSPE: 0.200243
[900]	training's 



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.00047342	training's RMSPE: 0.21882	valid_1's rmse: 0.000501723	valid_1's RMSPE: 0.234905
[200]	training's rmse: 0.000457799	training's RMSPE: 0.2116	valid_1's rmse: 0.000493499	valid_1's RMSPE: 0.231054
[300]	training's rmse: 0.000446382	training's RMSPE: 0.206323	valid_1's rmse: 0.000483827	valid_1's RMSPE: 0.226525
[400]	training's rmse: 0.000437694	training's RMSPE: 0.202307	valid_1's rmse: 0.00047766	valid_1's RMSPE: 0.223638
[500]	training's rmse: 0.000429622	training's RMSPE: 0.198576	valid_1's rmse: 0.000469612	valid_1's RMSPE: 0.21987
[600]	training's rmse: 0.000423447	training's RMSPE: 0.195722	valid_1's rmse: 0.00046505	valid_1's RMSPE: 0.217734
[700]	training's rmse: 0.000417594	training's RMSPE: 0.193016	valid_1's rmse: 0.000460762	valid_1's RMSPE: 0.215727
[800]	training's rmse: 0.000412897	training's RMSPE: 0.190845	valid_1's rmse: 0.000457902	valid_1's RMSPE: 0.214388
[900]	training's r



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472288	training's RMSPE: 0.218733	valid_1's rmse: 0.000477439	valid_1's RMSPE: 0.219578
[200]	training's rmse: 0.00045697	training's RMSPE: 0.211639	valid_1's rmse: 0.000465658	valid_1's RMSPE: 0.21416
[300]	training's rmse: 0.000444941	training's RMSPE: 0.206067	valid_1's rmse: 0.000456569	valid_1's RMSPE: 0.20998
[400]	training's rmse: 0.000436051	training's RMSPE: 0.20195	valid_1's rmse: 0.000450006	valid_1's RMSPE: 0.206962
[500]	training's rmse: 0.000428866	training's RMSPE: 0.198622	valid_1's rmse: 0.000444953	valid_1's RMSPE: 0.204637
[600]	training's rmse: 0.000422264	training's RMSPE: 0.195565	valid_1's rmse: 0.000441088	valid_1's RMSPE: 0.20286
[700]	training's rmse: 0.000416537	training's RMSPE: 0.192912	valid_1's rmse: 0.00043769	valid_1's RMSPE: 0.201297
[800]	training's rmse: 0.000411676	training's RMSPE: 0.190661	valid_1's rmse: 0.000434903	valid_1's RMSPE: 0.200015
[900]	training's 



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472412	training's RMSPE: 0.218772	valid_1's rmse: 0.000479161	valid_1's RMSPE: 0.220537
[200]	training's rmse: 0.000456474	training's RMSPE: 0.211391	valid_1's rmse: 0.000466351	valid_1's RMSPE: 0.214641
[300]	training's rmse: 0.000445154	training's RMSPE: 0.206149	valid_1's rmse: 0.000457729	valid_1's RMSPE: 0.210672
[400]	training's rmse: 0.000437009	training's RMSPE: 0.202377	valid_1's rmse: 0.000451974	valid_1's RMSPE: 0.208024
[500]	training's rmse: 0.000428781	training's RMSPE: 0.198567	valid_1's rmse: 0.000445569	valid_1's RMSPE: 0.205076
[600]	training's rmse: 0.000422661	training's RMSPE: 0.195733	valid_1's rmse: 0.000441552	valid_1's RMSPE: 0.203227
[700]	training's rmse: 0.000416902	training's RMSPE: 0.193066	valid_1's rmse: 0.000437614	valid_1's RMSPE: 0.201414
[800]	training's rmse: 0.000412124	training's RMSPE: 0.190853	valid_1's rmse: 0.000434963	valid_1's RMSPE: 0.200194
[900]	train



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000474152	training's RMSPE: 0.219398	valid_1's rmse: 0.000479756	valid_1's RMSPE: 0.222452
[200]	training's rmse: 0.000457296	training's RMSPE: 0.211598	valid_1's rmse: 0.000465317	valid_1's RMSPE: 0.215756
[300]	training's rmse: 0.000446563	training's RMSPE: 0.206632	valid_1's rmse: 0.000457194	valid_1's RMSPE: 0.21199
[400]	training's rmse: 0.000438023	training's RMSPE: 0.202681	valid_1's rmse: 0.000450689	valid_1's RMSPE: 0.208974
[500]	training's rmse: 0.00043052	training's RMSPE: 0.199209	valid_1's rmse: 0.000445167	valid_1's RMSPE: 0.206413
[600]	training's rmse: 0.000424101	training's RMSPE: 0.196239	valid_1's rmse: 0.000440328	valid_1's RMSPE: 0.20417
[700]	training's rmse: 0.000418955	training's RMSPE: 0.193858	valid_1's rmse: 0.000437037	valid_1's RMSPE: 0.202644
[800]	training's rmse: 0.000413881	training's RMSPE: 0.191509	valid_1's rmse: 0.000433666	valid_1's RMSPE: 0.201081
[900]	training



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472454	training's RMSPE: 0.218558	valid_1's rmse: 0.0004791	valid_1's RMSPE: 0.222639
[200]	training's rmse: 0.000457138	training's RMSPE: 0.211473	valid_1's rmse: 0.000468351	valid_1's RMSPE: 0.217644
[300]	training's rmse: 0.000446223	training's RMSPE: 0.206424	valid_1's rmse: 0.000461491	valid_1's RMSPE: 0.214457
[400]	training's rmse: 0.000437307	training's RMSPE: 0.202299	valid_1's rmse: 0.000456257	valid_1's RMSPE: 0.212024
[500]	training's rmse: 0.00043019	training's RMSPE: 0.199007	valid_1's rmse: 0.000451753	valid_1's RMSPE: 0.209931
[600]	training's rmse: 0.00042418	training's RMSPE: 0.196227	valid_1's rmse: 0.000448374	valid_1's RMSPE: 0.208361
[700]	training's rmse: 0.000417664	training's RMSPE: 0.193212	valid_1's rmse: 0.000443874	valid_1's RMSPE: 0.20627
[800]	training's rmse: 0.000412729	training's RMSPE: 0.190929	valid_1's rmse: 0.000441621	valid_1's RMSPE: 0.205223
[900]	training's



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472865	training's RMSPE: 0.218703	valid_1's rmse: 0.000479656	valid_1's RMSPE: 0.223306
[200]	training's rmse: 0.000457075	training's RMSPE: 0.211401	valid_1's rmse: 0.000468943	valid_1's RMSPE: 0.218318
[300]	training's rmse: 0.00044664	training's RMSPE: 0.206574	valid_1's rmse: 0.000460825	valid_1's RMSPE: 0.214539
[400]	training's rmse: 0.000437076	training's RMSPE: 0.202151	valid_1's rmse: 0.000453853	valid_1's RMSPE: 0.211294
[500]	training's rmse: 0.000429817	training's RMSPE: 0.198793	valid_1's rmse: 0.000449541	valid_1's RMSPE: 0.209286
[600]	training's rmse: 0.00042287	training's RMSPE: 0.19558	valid_1's rmse: 0.000444703	valid_1's RMSPE: 0.207034
[700]	training's rmse: 0.000417221	training's RMSPE: 0.192968	valid_1's rmse: 0.00044096	valid_1's RMSPE: 0.205291
[800]	training's rmse: 0.000412405	training's RMSPE: 0.19074	valid_1's rmse: 0.000438909	valid_1's RMSPE: 0.204336
[900]	training's



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000472532	training's RMSPE: 0.218808	valid_1's rmse: 0.000488707	valid_1's RMSPE: 0.22511
[200]	training's rmse: 0.000456851	training's RMSPE: 0.211547	valid_1's rmse: 0.000473923	valid_1's RMSPE: 0.2183
[300]	training's rmse: 0.000445591	training's RMSPE: 0.206333	valid_1's rmse: 0.000464816	valid_1's RMSPE: 0.214106
[400]	training's rmse: 0.000436208	training's RMSPE: 0.201988	valid_1's rmse: 0.000456801	valid_1's RMSPE: 0.210414
[500]	training's rmse: 0.000428726	training's RMSPE: 0.198524	valid_1's rmse: 0.000451477	valid_1's RMSPE: 0.207961
[600]	training's rmse: 0.000422481	training's RMSPE: 0.195632	valid_1's rmse: 0.000446431	valid_1's RMSPE: 0.205637
[700]	training's rmse: 0.000417237	training's RMSPE: 0.193204	valid_1's rmse: 0.000443086	valid_1's RMSPE: 0.204097
[800]	training's rmse: 0.000412582	training's RMSPE: 0.191048	valid_1's rmse: 0.000439504	valid_1's RMSPE: 0.202446
[900]	training



Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000474078	training's RMSPE: 0.219513	valid_1's rmse: 0.00048586	valid_1's RMSPE: 0.223907
[200]	training's rmse: 0.000457388	training's RMSPE: 0.211785	valid_1's rmse: 0.000473024	valid_1's RMSPE: 0.217992
[300]	training's rmse: 0.000445665	training's RMSPE: 0.206356	valid_1's rmse: 0.000464257	valid_1's RMSPE: 0.213951
[400]	training's rmse: 0.000436671	training's RMSPE: 0.202192	valid_1's rmse: 0.000458102	valid_1's RMSPE: 0.211115
[500]	training's rmse: 0.000429472	training's RMSPE: 0.198859	valid_1's rmse: 0.000453695	valid_1's RMSPE: 0.209084
Early stopping, best iteration is:
[562]	training's rmse: 0.000425444	training's RMSPE: 0.196994	valid_1's rmse: 0.00045201	valid_1's RMSPE: 0.208307
Our out of folds RMSPE is 0.20022585980078092


[<lightgbm.basic.Booster at 0x7f98f72e08e0>,
 <lightgbm.basic.Booster at 0x7f98f72e0a60>,
 <lightgbm.basic.Booster at 0x7f98f72e0760>,
 <lightgbm.basic.Booster at 0x7f98f72e0880>,
 <lightgbm.basic.Booster at 0x7f98fcab3220>,
 <lightgbm.basic.Booster at 0x7f98fcab30a0>,
 <lightgbm.basic.Booster at 0x7f98fcab33a0>,
 <lightgbm.basic.Booster at 0x7f98fcab30d0>,
 <lightgbm.basic.Booster at 0x7f98fcab3160>,
 <lightgbm.basic.Booster at 0x7f98fcab36d0>]

In [6]:
# Feature matrix for the scoring loop: the training frame minus identifier and
# target columns, with stock_id cast to int. Note it is built from train_df.
x_test = (
    train_df
    .drop(columns = ['row_id', 'time_id', 'target'])
    .astype({'stock_id': int})
)

In [7]:
# One-off persistence step, left disabled: writes each fold's booster to
# models/lgb_fold{idx}.pickle, the same paths the loading loop in this notebook reads.
# Expects `models` to hold the list returned by train_models(...).
# for idx, model in enumerate(models):
#     filename = f'models/lgb_fold{idx}.pickle'
#     pickle.dump(model, open(filename, 'wb'))
    

In [8]:
# Load each pickled fold model and print its RMSPE on x_test.
# x_test is derived from train_df, so these are scores on training rows rather
# than held-out data (presumably each model was fit on most of them - confirm).
# NOTE: pickle.load executes arbitrary code; only load files you created yourself.
for idx in range(10):
    filename = f'models/lgb_fold{idx}.pickle'
    # Context manager closes the file handle even if unpickling fails.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)
    preds = model.predict(x_test)
    print(rmspe(train_df.target, preds))

0.18470989403729027
0.1737223782065638
0.19073378463864546
0.1880587200715555
0.17799874022536552
0.1725530401092998
0.1898247989760112
0.18514039121932407
0.18203909027786244
0.1981542175208046
