#### Imports

In [1]:
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
import gc
import pickle
from datetime import datetime
from os import makedirs
from os.path import dirname, join
from pathlib import Path

import numpy as np
import scipy
import yaml
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold

#### Utils

In [2]:
def correlation_score(y_true, y_pred):
    """Mean row-wise Pearson correlation between targets and predictions.

    Each row of ``y_true`` is correlated with the row of ``y_pred`` at the
    same index, and the per-row coefficients are averaged.

    The predictions are assumed to be non-constant within each row; a
    constant row has zero variance and its coefficient is undefined.

    Args:
        y_true: Array-like exposing ``.shape``, ``len()`` and integer row
            indexing; holds the ground-truth values.
        y_pred: Array-like with exactly the same shape as ``y_true``.

    Returns:
        The sum of the per-row coefficients divided by the number of rows.

    Raises:
        ValueError: If the two inputs do not have identical shapes.

    Source: https://www.kaggle.com/code/xiafire/lb-t15-msci-multiome-catboostregressor#Predicting
    """
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes are different.")
    n_rows = len(y_true)
    # np.corrcoef returns the 2x2 correlation matrix of the two rows; the
    # off-diagonal entry [1, 0] is the coefficient between them.
    total = sum(
        np.corrcoef(y_true[row], y_pred[row])[1, 0] for row in range(n_rows)
    )
    return total / n_rows

In [3]:
# Experiment settings read by the cells below.
#   output_dir - directory intended for run artefacts
#   paths      - input files: "x" / "x_test" are pickles, "y" is a sparse .npz
#                (file names suggest 128-component SVD features - TODO confirm)
#   seed       - random_state shared by the target SVD and the regressor
# NOTE(review): these are absolute, cluster-specific paths; the notebook will
# not run elsewhere without editing them.
config = dict(
    output_dir="/scratch/st-jiaruid-1/shenoy/projects/scRNA-competition/output/krr-rbf-exp",
    paths=dict(
        x="/scratch/st-jiaruid-1/shenoy/svd-comp/train_input_multiome_svd128.pkl",
        y="/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_multi_targets_values.sparse.npz",
        x_test="/scratch/st-jiaruid-1/shenoy/svd-comp/test_input_multiome_svd128.pkl",
    ),
    seed=42,
)

In [4]:
# Load Data
%time
x_train_transformed = pickle.load(open(config["paths"]["x"], "rb"))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


In [5]:
%time
x_test_transformed = pickle.load(open(config["paths"]["x_test"], "rb"))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.25 µs


In [6]:
%time
y = scipy.sparse.load_npz(config["paths"]["y"])

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.53 µs


In [7]:
# Compress the sparse targets with a truncated SVD before regression.
# With n_components=1 every target row is reduced to a single coefficient, so
# any reconstruction through pca_y.components_ is a scalar multiple of one
# shared component vector.
pca_y = TruncatedSVD(n_components=1, random_state=config["seed"])
y_transformed = pca_y.fit_transform(y)

In [13]:
# One LightGBM regressor is fitted per target column by MultiOutputRegressor.
# n_jobs sets how many of those per-column fits may run in parallel.
model = MultiOutputRegressor(
    n_jobs=2,
    estimator=LGBMRegressor(
        objective="mae",
        n_estimators=200,
        random_state=config["seed"],
    ),
)

In [14]:
%time
model.fit(x_train_transformed, y_transformed)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


Traceback (most recent call last):
  File "/arc/project/st-jiaruid-1/yinian/tensorflow-gpu/lib/python3.9/site-packages/psutil/_common.py", line 398, in wrapper
    return cache[key]
KeyError: (('/proc',), frozenset())

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/arc/project/st-jiaruid-1/yinian/tensorflow-gpu/lib/python3.9/site-packages/psutil/_pslinux.py", line 285, in <module>
    set_scputimes_ntuple("/proc")
  File "/arc/project/st-jiaruid-1/yinian/tensorflow-gpu/lib/python3.9/site-packages/psutil/_common.py", line 400, in wrapper
    ret = cache[key] = fun(*args, **kwargs)
  File "/arc/project/st-jiaruid-1/yinian/tensorflow-gpu/lib/python3.9/site-packages/psutil/_pslinux.py", line 268, in set_scputimes_ntuple
    with open_binary('%s/stat' % procfs_path) as f:
  File "/arc/project/st-jiaruid-1/yinian/tensorflow-gpu/lib/python3.9/site-packages/psutil/_common.py", line 727, in open_binary
    return open(fname, "rb"

In [10]:
# Training-set score: predict the compressed targets for the training inputs,
# project them back to the original target space through the SVD components,
# and compare against the densified ground truth.
# NOTE(review): this is evaluated on the data the model was fitted on, so it
# is not a validation score. The execution counts also show this cell ran
# before the model-definition and fit cells above; re-run top to bottom
# before trusting the number.
score = correlation_score(
    y_true=y.toarray(),
    y_pred=np.matmul(model.predict(x_train_transformed), pca_y.components_),
)

In [11]:
# Report the mean per-row Pearson correlation computed above.
print(f"Score is {score}")

Score is -0.6421299203175792
