Credit: 
https://www.kaggle.com/nicohrubec/lgb-baseline-with-groupkfold
https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline

In [None]:
import numpy as np
import pandas as pd
import os, gc
from sklearn.model_selection import GroupKFold
from scipy.stats import pearsonr as p
import lightgbm as lgb
import joblib

import warnings
warnings.simplefilter('ignore')

In [None]:
#https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline
# Hyperparameters forwarded verbatim to lgb.LGBMRegressor(**params).
# Key order is kept exactly as in the referenced baseline notebook.
params = dict(
    # Core boosting setup.
    learning_rate=0.1,
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    verbosity=-1,
    n_jobs=-1,
    seed=21,
    # Regularisation and tree shape.
    lambda_l1=1.1895057699067542,
    lambda_l2=1.9079686837880768e-08,
    num_leaves=112,
    subsample=None,
    # Column / row subsampling.
    feature_fraction=0.6259927292757151,
    bagging_fraction=0.9782210574588895,
    bagging_freq=1,
    n_estimators=306,
    max_depth=12,
    max_bin=494,
    min_data_in_leaf=366,
    # NOTE(review): these look like the sklearn-style aliases of the native
    # LightGBM names set above; they are explicitly None, presumably so the
    # wrapper defaults do not compete with the native values - confirm.
    colsample_bytree=None,
    subsample_freq=None,
    min_child_samples=None,
    reg_lambda=None,
    reg_alpha=None,
)
gc.collect()

In [None]:
def run(train):
    """Train one LightGBM regressor per GroupKFold split and report Pearson scores.

    Uses the module-level ``kfold``, ``feats`` and ``params``. For every fold
    the fitted model is dumped to ``lgbm_fold_{fold}.pkl``, appended to the
    module-level ``models`` list, and its validation Pearson correlation is
    appended to the module-level ``fold_scores`` list. The mean of
    ``fold_scores`` is printed at the end.

    Args:
        train: DataFrame holding the ``feats`` columns plus ``target`` and
            ``time_id`` (``time_id`` is the grouping key for the splitter).
    """
    # The splitter only needs the sample count from X, so pass the frame
    # itself instead of materialising a copy of every feature column.
    splits = kfold.split(train, train.target, groups=train.time_id)

    for fold, (trn_idx, val_idx) in enumerate(splits):
        X_train, y_train = train[feats].iloc[trn_idx], train.target.iloc[trn_idx]
        X_val, y_val = train[feats].iloc[val_idx], train.target.iloc[val_idx]

        model = lgb.LGBMRegressor(**params)

        # Early stopping and periodic logging go through callbacks: the
        # `early_stopping_rounds` / `verbose` fit kwargs were deprecated in
        # LightGBM 3.3 and removed in 4.0. Callbacks are created fresh for
        # every fold so no early-stopping state leaks between fits.
        fit_callbacks = [
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ]
        model.fit(
            X_train,
            y_train,
            eval_metric='rmse',
            eval_set=[(X_val, y_val)],
            callbacks=fit_callbacks,
        )
        joblib.dump(model, f'lgbm_fold_{fold}.pkl')

        y_pred = model.predict(X_val)

        # pearsonr returns (correlation, p-value); only the correlation is kept.
        score = p(y_pred, y_val)[0]
        print(f"Fold {fold + 1}: {score}")

        fold_scores.append(score)
        models.append(model)

        # Drop the per-fold copies before the next split is materialised.
        del model, y_pred, score, X_train, y_train, X_val, y_val
        gc.collect()

    gc.collect()

    print(f"Overall score: {np.mean(fold_scores, axis=0)}")

In [None]:
# Load the pickled training frame, then keep only its last 1,000,000 rows.
train = pd.read_pickle('../input/ump195gb/train.pkl')
train = train[-1000000:]
gc.collect()

In [None]:
#https://www.kaggle.com/lucamassaron/basic-eda-and-model-to-start
def feature_engineering(df, features):
    """Append row-wise summary statistics of ``features`` to ``df``.

    The frame is modified in place and also returned. New columns are added
    in a fixed order (location, quantiles, extrema, spread, shape, then
    differences); the order is kept stable so that column positions stay
    identical between separate calls on frames with the same input columns.

    Args:
        df: DataFrame containing at least the columns listed in ``features``.
        features: list of column names the statistics are computed over.

    Returns:
        The same ``df`` object with the engineered columns added.
    """
    # Select the feature columns once; every statistic below reads from this
    # single selection instead of re-slicing the frame per statistic.
    values = df[features]

    df['mean'] = values.mean(axis=1)
    df['median'] = values.median(axis=1)

    # All quantiles are computed in one call; row `pos` of the result holds
    # the quantile for `levels[pos]`, indexed by the original row labels.
    quantile_columns = {
        'q01': 0.01,
        'q05': 0.05,
        'q10': 0.10,
        'q25': 0.25,
        'q75': 0.75,
        'q90': 0.90,
        'q95': 0.95,
        'q99': 0.99,
    }
    levels = list(quantile_columns.values())
    quantiles = values.quantile(q=levels, axis=1)
    for pos, column in enumerate(quantile_columns):
        df[column] = quantiles.iloc[pos]
    del quantiles

    df['max'] = values.max(axis=1)
    df['min'] = values.min(axis=1)

    df['std'] = values.std(axis=1)
    df['range'] = df['max'] - df['min']
    df['iqr'] = df['q75'] - df['q25']
    # The ratios below are left unguarded: a zero denominator yields inf/NaN.
    df['tails'] = df['range'] / df['iqr']
    df['dispersion'] = df['std'] / df['mean']
    df['dispersion_2'] = df['iqr'] / df['median']
    df['skew'] = values.skew(axis=1)
    df['kurt'] = values.kurt(axis=1)

    df['median-max'] = df['median'] - df['max']
    df['median-min'] = df['median'] - df['min']
    df['q99-q95'] = df['q99'] - df['q95']
    df['q99-q90'] = df['q99'] - df['q90']
    df['q01-q05'] = df['q01'] - df['q05']
    df['q01-q10'] = df['q01'] - df['q10']

    del values
    gc.collect()

    return df



In [None]:
# Shared state for the training cell: run() appends to `models` and
# `fold_scores`; `model_weights` is initialised here but not used in the
# visible code.
n_splits = 10
models, fold_scores, model_weights = [], [], []

# Raw input features: every column except the identifier / label columns.
feats_origin = [col for col in train.columns if col not in ('time_id', 'row_id', 'target')]

# Add the row-wise statistics, then re-derive the full feature list so it
# also contains the engineered columns.
train = feature_engineering(train, feats_origin)
feats = [col for col in train.columns if col not in ('time_id', 'row_id', 'target')]

kfold = GroupKFold(n_splits=n_splits)
gc.collect()

In [None]:
# Fit one model per fold; run() fills the module-level `models` and
# `fold_scores` lists and writes each model to lgbm_fold_{fold}.pkl.
run(train)
gc.collect()

In [None]:
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test()

# Score every test batch with the average prediction of all fold models.
for test_df, sample_prediction_df in iter_test:
    # Keep only the raw feature columns, then add the same engineered
    # statistics that were added to the training frame.
    test_df = feature_engineering(test_df[feats_origin], feats_origin)

    # Select columns explicitly in training order so the predictions do not
    # depend on the column order feature_engineering happens to produce.
    X_test = test_df[feats]

    fold_preds = [model.predict(X_test) for model in models]

    sample_prediction_df['target'] = np.mean(fold_preds, axis=0)
    gc.collect()
    env.predict(sample_prediction_df)