In [8]:
import xgboost as xgb

In [9]:
from catboost import CatBoostRegressor, metrics
import numpy as np

#### Imports

In [10]:
import gc
import pickle
from datetime import datetime
from os import makedirs
from os.path import dirname, join
from pathlib import Path

import numpy as np
import scipy
import yaml
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold

#### Utils

In [None]:
def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    """Return the first-order gradient of the squared-error loss.

    The labels held by ``dtrain`` are reshaped to the shape of ``predt``,
    the element-wise residual ``predt - label`` is taken, and the result is
    flattened to a one-dimensional array.
    """
    labels = dtrain.get_label().reshape(predt.shape)
    residual = predt - labels
    return residual.reshape(-1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    """Return the second-order term of the squared-error loss.

    The result is a flat vector of ones with one entry per element of
    ``predt``; ``dtrain`` is accepted for signature symmetry but not read.
    """
    return np.ones(predt.size)

def squared_log(predt, dtrain):
    """Custom objective returning the ``(grad, hess)`` pair.

    Note: despite the name, the helpers called here compute ``predt - y``
    and a constant ``1``, i.e. plain squared error, not squared *log* error.
    """
    return gradient(predt, dtrain), hessian(predt, dtrain)

def correlation_score(y_true, y_pred):
    """Mean row-wise Pearson correlation between targets and predictions.

    Each row of ``y_true`` is correlated with the matching row of ``y_pred``
    and the coefficients are averaged over all rows. Rows are assumed to be
    non-constant.

    Raises ``ValueError`` when the two inputs do not have the same shape.

    Source: https://www.kaggle.com/code/xiafire/lb-t15-msci-multiome-catboostregressor#Predicting
    """
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes are different.")

    n_rows = len(y_true)
    running_total = 0
    for row in range(n_rows):
        # corrcoef yields the 2x2 correlation matrix of the two rows;
        # the off-diagonal entry is their Pearson coefficient.
        pair_matrix = np.corrcoef(y_true[row], y_pred[row])
        running_total += pair_matrix[1, 0]
    return running_total / n_rows

In [11]:
# Experiment settings.
#   paths.x / paths.x_test : pickled input matrices (loaded with pickle below)
#   paths.y                : sparse .npz target matrix (loaded with scipy.sparse)
#   seed                   : random_state passed to TruncatedSVD
config = dict(
    output_dir="/scratch/st-jiaruid-1/shenoy/projects/scRNA-competition/output/krr-rbf-exp",
    paths=dict(
        x="/scratch/st-jiaruid-1/shenoy/svd-comp/train_input_multiome_svd128.pkl",
        y="/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_multi_targets_values.sparse.npz",
        x_test="/scratch/st-jiaruid-1/shenoy/svd-comp/test_input_multiome_svd128.pkl",
    ),
    seed=42,
)

In [12]:
# Load Data
%time
x_train_transformed = pickle.load(open(config["paths"]["x"], "rb"))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.63 µs


In [13]:
%time
x_test_transformed = pickle.load(open(config["paths"]["x_test"], "rb"))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


In [14]:
%time
y = scipy.sparse.load_npz(config["paths"]["y"])

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [37]:
# Candidate numbers of SVD components for the targets: 50, 100, ..., 300.
components = list(range(50, 301, 50))

### What is the best number of SVD components for `y`?

In [40]:
from tqdm import tqdm 

In [43]:
# Score obtained for each number of target SVD components, kept so the sweep
# can be compared after the loop instead of only being read from the log.
component_scores = {}

for component in tqdm(components):
    print(f'Working on components: {component}')

    # Compress the targets down to `component` dimensions.
    pca_y = TruncatedSVD(
        n_components=component,
        random_state=config['seed'],
    )
    y_transformed = pca_y.fit_transform(y)

    Xy = xgb.DMatrix(x_train_transformed, y_transformed)

    # Multi-target boosting on the compressed targets with the custom objective.
    booster = xgb.train(
        {
            "tree_method": "hist",
            "num_target": y_transformed.shape[1],
        },
        dtrain=Xy,
        num_boost_round=100,
        obj=squared_log,
    )

    # Project the predictions back to the original target space with the SVD
    # components before scoring against the uncompressed targets.
    # NOTE(review): this scores the same rows the booster was fitted on, so it
    # is a training-fit score, not a held-out estimate -- confirm that is intended.
    score = correlation_score(
        y.toarray(),
        booster.inplace_predict(x_train_transformed) @ pca_y.components_,
    )
    component_scores[component] = score
    # Report the current component count (not the whole candidate list).
    print(f'Score for components: {component} is {score}')

  0%|          | 0/6 [00:00<?, ?it/s]

Working on components: 50


 17%|█▋        | 1/6 [12:15<1:01:15, 735.08s/it]

Score for components: [50, 100, 150, 200, 250, 300] is 0.6715198414847497
Working on components: 100


 33%|███▎      | 2/6 [37:54<1:20:32, 1208.14s/it]

Score for components: [50, 100, 150, 200, 250, 300] is 0.6726009848447322
Working on components: 150


 50%|█████     | 3/6 [1:16:17<1:25:25, 1708.35s/it]

Score for components: [50, 100, 150, 200, 250, 300] is 0.6735511391011494
Working on components: 200


 67%|██████▋   | 4/6 [2:09:37<1:16:33, 2297.00s/it]

Score for components: [50, 100, 150, 200, 250, 300] is 0.6744700897079666
Working on components: 250


 83%|████████▎ | 5/6 [3:02:03<43:23, 2603.19s/it]  

Score for components: [50, 100, 150, 200, 250, 300] is 0.6753742106656241
Working on components: 300


100%|██████████| 6/6 [4:05:07<00:00, 2451.17s/it]

Score for components: [50, 100, 150, 200, 250, 300] is 0.6762454049504814





#### Objectives for XGBoost