In [5]:
from fastai.tabular.all import *

from sklearn.model_selection import KFold
import lightgbm as lgb

In [6]:
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))
def feval_rmspe(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'RMSPE', rmspe(y_true, y_pred), False

In [8]:
train_df = pd.read_csv('train_with_features.csv')

In [28]:
def train_models(train):
    # Hyperparammeters (optimized)
    seed = 29
    params = {
        'learning_rate': 0.1,        
        'lambda_l1': 2,
        'lambda_l2': 7,
        'num_leaves': 800,
        'min_sum_hessian_in_leaf': 20,
        'feature_fraction': 0.8,
        'feature_fraction_bynode': 0.8,
        'bagging_fraction': 0.9,
        'bagging_freq': 42,
        'min_data_in_leaf': 700,
        'max_depth': 4,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'objective': 'rmse',
        'boosting': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
    }   
    
    # Split features and target
    x = train.drop(['row_id', 'target', 'time_id'], axis = 1)
    y = train['target']
    # Transform stock id to a numeric value
    x['stock_id'] = x['stock_id'].astype(int)
    models =[]
    # Create out of folds array
    oof_predictions = np.zeros(x.shape[0])
    # Create a KFold object
    kfold = KFold(n_splits = 10, random_state = 1111, shuffle = True)
    # Iterate through each fold
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(x)):
        print(f'Training fold {fold + 1}')
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        # Root mean squared percentage error weights
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)
        train_dataset = lgb.Dataset(x_train, y_train, weight = train_weights, categorical_feature = ['stock_id'])
        val_dataset = lgb.Dataset(x_val, y_val, weight = val_weights, categorical_feature = ['stock_id'])
        model = lgb.train(params = params, 
                          train_set = train_dataset, 
                          valid_sets = [train_dataset, val_dataset], 
                          num_boost_round = 3000, 
                          early_stopping_rounds = 25, 
                          verbose_eval = 100,
                          feval = feval_rmspe)
        models.append(model)
        # Add predictions to the out of folds array
        oof_predictions[val_ind] = model.predict(x_val)
        # Predict the test set
        test_predictions += model.predict(x_test) / 10
        
    rmspe_score = rmspe(y, oof_predictions)
    print(f'Our out of folds RMSPE is {rmspe_score}')
    # Return test predictions
    return models

In [None]:
models = train_models(train_df)

Training fold 1




Training until validation scores don't improve for 25 rounds
[100]	training's rmse: 0.000473181	training's RMSPE: 0.219006	valid_1's rmse: 0.000486846	valid_1's RMSPE: 0.225212
[200]	training's rmse: 0.000457561	training's RMSPE: 0.211776	valid_1's rmse: 0.000475783	valid_1's RMSPE: 0.220095
[300]	training's rmse: 0.000446237	training's RMSPE: 0.206535	valid_1's rmse: 0.000467194	valid_1's RMSPE: 0.216122
[400]	training's rmse: 0.000437508	training's RMSPE: 0.202495	valid_1's rmse: 0.000460669	valid_1's RMSPE: 0.213103
[500]	training's rmse: 0.00043029	training's RMSPE: 0.199154	valid_1's rmse: 0.000456661	valid_1's RMSPE: 0.211249
[600]	training's rmse: 0.000423433	training's RMSPE: 0.19598	valid_1's rmse: 0.0004523	valid_1's RMSPE: 0.209232
[700]	training's rmse: 0.000417333	training's RMSPE: 0.193157	valid_1's rmse: 0.000447933	valid_1's RMSPE: 0.207212
[800]	training's rmse: 0.00041227	training's RMSPE: 0.190814	valid_1's rmse: 0.000445541	valid_1's RMSPE: 0.206105
[900]	training's

In [13]:
models

[<lightgbm.basic.Booster at 0x7f23b37ed340>]

In [19]:
x_test = train_df.drop(['row_id', 'time_id', 'target'], axis = 1)


In [21]:
preds = models[0].predict(x_test)

In [22]:
preds.shape

(428932,)

In [26]:
.shape

(428932,)

In [27]:
rmspe(preds, train_df.target.to_numpy())

0.2525921853990848