#### Imports

In [1]:
import optuna
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
import gc
import pickle
from datetime import datetime
from os import makedirs
from os.path import dirname, join
from pathlib import Path

import numpy as np
import scipy
import yaml
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold

#### Utils

In [2]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules.

    It is assumed that the predictions are not constant.

    Returns the average of each sample's Pearson correlation coefficient

    Source: https://www.kaggle.com/code/xiafire/lb-t15-msci-multiome-catboostregressor#Predicting
    """
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes are different.")
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

In [3]:
config = {
    "output_dir": "/scratch/st-jiaruid-1/shenoy/projects/scRNA-competition/output/krr-rbf-exp",
    "paths": {
      "x": "/scratch/st-jiaruid-1/shenoy/svd-comp/train_input_cite_svd128.pkl",
      "y": "/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_cite_targets_values.sparse.npz",
      "x_test": "/scratch/st-jiaruid-1/shenoy/svd-comp/test_input_cite_svd128.pkl"
    },
    "seed": 42
}

In [4]:
# Load Data
%time
x_train_transformed = pickle.load(open(config["paths"]["x"], "rb"))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 8.11 µs


In [6]:
%time
y = scipy.sparse.load_npz(config["paths"]["y"])

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 4.53 µs


In [7]:
pca_y = TruncatedSVD(
    n_components=1,
    random_state=config['seed'],
)
y_transformed = pca_y.fit_transform(y)

In [51]:
def objective(
    trial,
    x,
    y,
    y_orig,
    pca_y,
    random_state=22,
    n_jobs=1,
    early_stopping_rounds=50,
):
    # XGBoost parameters
    params = {
        "verbosity": 1,  # 0 (silent) - 3 (debug)
        "objective": 'reg:squarederror',
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, 50),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 0.6, log=True),
        "subsample": trial.suggest_float("subsample", 0.4, 0.8, log=True),
        "alpha": trial.suggest_float("alpha", 0.01, 10.0, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "gamma": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", 10, 1000, log=True),
        "seed": random_state,
        "n_jobs": n_jobs,
    }
    
    n = int(0.8 * x.shape[0]) # let's do random for now
    x_train, y_train, x_val, y_val, y_val_orig = (
        x[:n, :], y[:n, :], x[n:, :], y[n:, :], y_orig[n:, :]
    )
    model = MultiOutputRegressor(XGBRegressor(**params))
    model.fit(x_train, y_train, verbose=1)
    
    
    return correlation_score(
        y_val_orig, 
        model.predict(x_val) @ pca_y.components_
    )

In [53]:
study = optuna.create_study(
    study_name='test-hpo-run', direction='maximize'
)

[32m[I 2022-10-09 11:44:37,633][0m A new study created in memory with name: test-hpo-run[0m


In [55]:
study.optimize(
    lambda trial: objective(trial, x_train_transformed, y_transformed, y.toarray(), pca_y, random_state=42),
    n_trials=2,
)

[32m[I 2022-10-09 11:47:42,456][0m Trial 0 finished with value: 0.7769128347109914 and parameters: {'n_estimators': 450, 'max_depth': 12, 'learning_rate': 0.014025892758434168, 'colsample_bytree': 0.2384114442215289, 'subsample': 0.4115162088050853, 'alpha': 0.3017706029269441, 'lambda': 0.16488478889979685, 'min_child_weight': 10.973974988642297}. Best is trial 0 with value: 0.7769128347109914.[0m
[32m[I 2022-10-09 11:49:38,825][0m Trial 1 finished with value: 0.7769128347195178 and parameters: {'n_estimators': 450, 'max_depth': 4, 'learning_rate': 0.010100483967230173, 'colsample_bytree': 0.44039978628798326, 'subsample': 0.6661130594385517, 'alpha': 5.939059882360051, 'lambda': 0.0003860942467210062, 'min_child_weight': 278.8769758910916}. Best is trial 1 with value: 0.7769128347195178.[0m


In [56]:
study.best_trial

FrozenTrial(number=1, values=[0.7769128347195178], datetime_start=datetime.datetime(2022, 10, 9, 11, 47, 42, 459055), datetime_complete=datetime.datetime(2022, 10, 9, 11, 49, 38, 824975), params={'n_estimators': 450, 'max_depth': 4, 'learning_rate': 0.010100483967230173, 'colsample_bytree': 0.44039978628798326, 'subsample': 0.6661130594385517, 'alpha': 5.939059882360051, 'lambda': 0.0003860942467210062, 'min_child_weight': 278.8769758910916}, distributions={'n_estimators': IntDistribution(high=1000, log=False, low=100, step=50), 'max_depth': IntDistribution(high=12, log=False, low=4, step=1), 'learning_rate': FloatDistribution(high=0.05, log=True, low=0.005, step=None), 'colsample_bytree': FloatDistribution(high=0.6, log=True, low=0.2, step=None), 'subsample': FloatDistribution(high=0.8, log=True, low=0.4, step=None), 'alpha': FloatDistribution(high=10.0, log=True, low=0.01, step=None), 'lambda': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'min_child_weight': FloatDis