In [None]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from argparse import Namespace
from collections import defaultdict

import optuna
import scipy as sc
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, GroupKFold, train_test_split, KFold

import lightgbm as lgb

import warnings
# Silence every warning for the rest of the notebook (this also hides
# deprecation notices from pandas / LightGBM / Optuna).
warnings.filterwarnings('ignore')
# Use the fully qualified option key: the abbreviated 'max_columns' pattern can
# match more than one registered option, which makes pandas raise OptionError.
pd.set_option('display.max_columns', 64)

def seed_everything(seed: int = 42) -> None:
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
def reduce_mem_usage(df):
    """Downcast the columns of `df` to the smallest dtype that holds their range.

    The frame is modified in place and also returned.

    * object columns become ``category``;
    * signed integer columns become the narrowest of int8/16/32/64 that fits;
    * unsigned integer columns become the narrowest of uint8/16/32/64 that fits
      (they are never routed through a float dtype, which would silently
      round large values);
    * float columns become float16 if their min/max lie strictly inside the
      float16 range, else float32 by the same test, else float64.  The float
      test only looks at the value range, so casting to float16 can lose
      precision;
    * every other dtype (bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    Before/after memory usage is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind in ('i', 'u'):
            candidates = signed_candidates if kind == 'i' else unsigned_candidates
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's limit still fits.
                # For an empty column min/max are NaN, every comparison is False
                # and the column keeps its dtype.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Experiment configuration: every tunable value used by the cells below lives on `args`.
args = Namespace(
    debug=False,            # True -> min_time_id is overridden below
    seed=21,                # passed to seed_everything and to LightGBM's 'seed'
    folds=5,
    workers=4,
    min_time_id=530,        # rows with a smaller time_id are dropped after loading
    holdout=False,          # True -> a stratified holdout is written to valid.parquet
    cv_method="stratified",
    num_bins=16,            # number of time_span bins used for stratification / histograms
    timeout=int(3600*8.5),  # an hour * x
    data_path=Path("../input/ubiquant-parquet/"),
)
seed_everything(args.seed)

if args.debug:
    args.min_time_id = 1100

# Fail fast on a misconfigured run.
assert args.cv_method in ("kfold", "group", "stratified", "time", "group_time"), "unknown cv method"
assert args.data_path.exists(), "data_path not exists"

In [None]:
%%time
# Load the training set and check the invariants the following cells rely on.
train = pd.read_parquet(args.data_path / "train_low_mem.parquet")
assert not train.isnull().any().any(), "null exists."

# row_id must be exactly "<time_id>_<investment_id>".
row_id_parts = (
    train.row_id.str
    .extract(r"(?P<time_id>\d+)_(?P<investment_id>\d+)")
    .astype(train.time_id.dtype)
)
assert row_id_parts.equals(train[["time_id", "investment_id"]]), "row_id!=time_id_investment_id"
del row_id_parts
assert train.time_id.is_monotonic_increasing, "time_id not monotonic increasing"

# Optionally keep only the rows from min_time_id onwards.
if args.min_time_id is not None:
    train = train[train.time_id >= args.min_time_id].reset_index(drop=True)
    gc.collect()

# Exclude the investments listed here.
too_few_time_investment = [1415, 2800]
keep_rows = ~train.investment_id.isin(too_few_time_investment)
train = train.loc[keep_rows].reset_index(drop=True)
del keep_rows
train.shape

# StratifiedKFold by time_span: [discussion](https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302429)

In [None]:
# First and last time_id seen for every investment, plus the distance between them.
time_id_df = (
    train[["investment_id", "time_id"]]
    .groupby("investment_id")
    .agg({"time_id": ["min", "max"]})
    .reset_index()
)
# Row-wise diff across the (min, max) columns; the "max" column then holds max - min.
time_id_df["time_span"] = time_id_df["time_id"].diff(axis=1)["max"]
time_id_df.head(6)

In [None]:
# Attach each investment's time_span to its rows, then plot the distribution.
span_by_investment = time_id_df.drop(columns="time_id").droplevel(level=1, axis=1)
train = train.merge(span_by_investment, how='left', on="investment_id")
train.time_span.hist(bins=args.num_bins, figsize=(16,8))
del time_id_df, span_by_investment
gc.collect()

In [None]:
# Optional holdout: split rows stratified on the binned time_span and write the
# held-out part to valid.parquet.
if args.holdout:
    _target = pd.cut(train.time_span, args.num_bins, labels=False)
    # Explicit random_state: the split no longer depends on how much of the
    # global NumPy RNG was consumed before this cell ran, so re-running the
    # cell reproduces the same holdout.
    _train, _valid = train_test_split(_target, stratify=_target, random_state=args.seed)
    print(f"train length: {len(_train)}", f"holdout length: {len(_valid)}")
    # _train/_valid carry index *labels* of `train`, so select with .loc.
    valid = train.loc[_valid.index].sort_values(by=["time_id", "investment_id"]).reset_index(drop=True)
    train = train.loc[_train.index].sort_values(by=["time_id", "investment_id"]).reset_index(drop=True)
    train.time_span.hist(bins=args.num_bins, figsize=(16,8), alpha=0.8)
    valid.time_span.hist(bins=args.num_bins, figsize=(16,8), alpha=0.8)
    valid.drop(columns="time_span").to_parquet("valid.parquet")
    del valid, _train, _valid, _target
    gc.collect()
assert train.time_id.is_monotonic_increasing, "time_id not monotonic increasing"

In [None]:
# Assign a fold id to every row, stratifying on the binned time_span, and show
# the per-fold time_span histograms.
if args.cv_method=="stratified":
    train["fold"] = -1
    _target = pd.cut(train.time_span, args.num_bins, labels=False)
    splitter = StratifiedKFold(n_splits=args.folds)
    for fold, (train_index, valid_index) in enumerate(splitter.split(_target, _target)):
        train.loc[valid_index, 'fold'] = fold

    fig, axs = plt.subplots(
        nrows=args.folds,
        ncols=1,
        sharex=True,
        figsize=(16,8),
        tight_layout=True,
    )
    spans_per_fold = train[["fold", "time_span"]].groupby("fold")
    for ax, (fold, fold_df) in zip(axs, spans_per_fold):
        ax.hist(fold_df.time_span, bins=args.num_bins)
        ax.text(0, 40000, f"fold: {fold}, count: {len(fold_df)}", fontsize=16)
    plt.show()
    del _target, train_index, valid_index, spans_per_fold
    _=gc.collect()

In [None]:
# Feature lists: raw f_* columns plus a handful of pairwise sums, minus a drop list.
cat_features = []
num_features = list(train.filter(like="f_").columns)
features = num_features + cat_features

# Each "a-b" entry names a new column holding train[a] + train[b].
combination_features = ["f_231-f_250", "f_118-f_280", "f_155-f_297", "f_25-f_237", "f_179-f_265", "f_119-f_270", "f_71-f_197"]
for f in combination_features:
    f1, f2 = f.split("-")
    train[f] = train[f1] + train[f2]
features += combination_features

to_drop = ["f_148", "f_72", "f_49", "f_205", "f_228", "f_97", "f_262"]
features = sorted(set(features).difference(to_drop))

# Drop everything that is not needed *before* downcasting, so no time is spent
# converting columns (row_id, the to_drop features) that are discarded anyway.
train = train.drop(columns=["time_span", "row_id"] + to_drop)

# Keep the id columns as signed integers while downcasting so they stay on the
# integer path and are never cast to a float dtype, which would round large
# ids; they are stored as uint16 afterwards.
# NOTE(review): assumes both ids fit in uint16, as the original cast did.
id_cols = ["investment_id", "time_id"]
train[id_cols] = train[id_cols].astype(np.int64)
train = reduce_mem_usage(train)
train[id_cols] = train[id_cols].astype(np.uint16)

if args.cv_method=="stratified":
    train["fold"] = train["fold"].astype(np.uint8)
gc.collect()
#features += ["time_id"] # https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302429
len(features)

In [None]:
def rmse(y_true, y_pred):
    """Root of the mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# TODO: remove, or replace with feval_pearsonr (objective() already passes feval_pearsonr as its feval)
def feval_rmse(y_pred, lgb_train):
    """Custom eval returning (metric name, RMSE against the dataset labels, is_higher_better=False)."""
    labels = lgb_train.get_label()
    score = rmse(labels, y_pred)
    return 'rmse', score, False

# https://www.kaggle.com/c/ubiquant-market-prediction/discussion/302480
def feval_pearsonr(y_pred, lgb_train):
    """Custom eval returning (metric name, Pearson r against the dataset labels, is_higher_better=True)."""
    labels = lgb_train.get_label()
    correlation = pearsonr(labels, y_pred)[0]
    return 'pearsonr', correlation, True

def objective(trial):
    """Optuna objective: cross-validate one LightGBM configuration on `train`.

    Hyper-parameters are sampled from `trial`; one model per fold is trained
    according to ``args.cv_method`` and its predictions on the validation rows
    are written to ``train['preds']``.

    Parameters
    ----------
    trial : optuna trial
        Used to sample hyper-parameters and handed to the pruning callback.

    Returns
    -------
    float
        Mean over ``time_id`` of the Pearson correlation between ``target``
        and the out-of-fold predictions (rows never predicted are excluded).

    Raises
    ------
    ValueError
        If ``args.cv_method`` has no split strategy implemented here.

    Side effects
    ------------
    (Re)creates the ``preds`` column of the global ``train`` frame.
    """
    params = {
        'learning_rate':0.05,
        "objective": "regression",
        "metric": "rmse",
        'boosting_type': "gbdt", #trial.suggest_categorical("boosting_type", ["dart", "gbdt"]),
        'verbosity': -1,
        'n_jobs': -1,
        'seed': args.seed,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 18, 128, step=4),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        'n_estimators': 1000, #trial.suggest_int('n_estimators',200,1000),
        'max_depth': trial.suggest_int('max_depth', -1, 32, step=4),
        'max_bin':trial.suggest_int('max_bin', 32, 512),
        'min_data_in_leaf':trial.suggest_int('min_data_in_leaf',8, 512),
    }

    y = train['target']
    # Float sentinel marking "no out-of-fold prediction yet".  Creating the
    # column as float avoids writing float predictions into an int64 column.
    train['preds'] = -1000.0
    # NOTE(review): one callback instance is shared by every fold, so the same
    # iteration numbers are reported once per fold - confirm this is the
    # intended pruning behaviour.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "pearsonr", valid_name='valid_1')
    scores = []  # per-fold RMSE, kept for inspection; not part of the return value

    def run_single_fold(fold, trn_ind, val_ind):
        """Train on `trn_ind`, predict `val_ind`, store predictions and the fold RMSE."""
        train_dataset = lgb.Dataset(train.loc[trn_ind, features], y.loc[trn_ind], categorical_feature=cat_features)
        valid_dataset = lgb.Dataset(train.loc[val_ind, features], y.loc[val_ind], categorical_feature=cat_features)
        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases - confirm against the installed
        # version before upgrading.
        model = lgb.train(
            params,
            train_set = train_dataset,
            valid_sets = [train_dataset, valid_dataset],
            verbose_eval=0,
            early_stopping_rounds=50,
            callbacks=[pruning_callback],
            feval = feval_pearsonr
        )
        preds = model.predict(train.loc[val_ind, features])
        train.loc[val_ind, "preds"] = preds
        scores.append(rmse(y.loc[val_ind], preds))
        del train_dataset, valid_dataset, model
        gc.collect()

    # Build the (train rows, validation rows) pairs for the chosen strategy.
    # The sklearn splitters only need the number of samples from their first
    # argument, so `y` is passed instead of copying the whole feature frame.
    if args.cv_method=="stratified":
        # Folds were pre-assigned in train['fold']; use boolean masks.
        splits = ((train.fold!=fold, train.fold==fold) for fold in range(args.folds))
    elif args.cv_method=="time":
        splits = TimeSeriesSplit(args.folds).split(y)
    elif args.cv_method=="group":
        # https://www.kaggle.com/lucamassaron/eda-target-analysis/notebook
        splits = GroupKFold(args.folds).split(y, y, train.time_id)
    elif args.cv_method=="kfold":
        splits = KFold(args.folds).split(y, train.investment_id)
    else:
        # Without this, an unsupported method would train nothing and score an
        # empty frame.
        raise ValueError(f"cv_method {args.cv_method!r} is not implemented in objective()")

    for fold, (trn_ind, val_ind) in enumerate(splits):
        run_single_fold(fold, trn_ind, val_ind)

    gc.collect()
    #return np.mean(scores)
    # Score only rows that received a prediction (e.g. a time-series split
    # never validates on its first chunk).
    df = train[["target", "preds", "time_id"]].query("preds!=-1000")
    return df.groupby("time_id").apply(lambda x: pearsonr(x.target, x.preds)[0]).mean()

In [None]:
# Hyper-parameter search.  The sampler is seeded explicitly so that the sequence
# of suggested configurations is reproducible; TPESampler is passed explicitly
# only so that the seed can be set.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=args.seed),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=25),
)
study.optimize(objective, timeout=args.timeout)

print('Number of finished trials:', len(study.trials))
# study.best_trial raises when no trial has completed (e.g. all pruned or the
# timeout hit during the first trial), so check before reporting.
if any(t.state == optuna.trial.TrialState.COMPLETE for t in study.trials):
    print('Best trial:', study.best_trial.params)
else:
    print('Best trial:', None)