#### Imports

In [1]:
import gc
import pickle
from datetime import datetime
from os import makedirs
from os.path import dirname, join
from pathlib import Path

import numpy as np
import scipy
import yaml
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold

from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric
from torch.nn import functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


#### Utils

In [2]:
def correlation_score(y_true, y_pred):
    """Competition metric: mean row-wise Pearson correlation.

    Each row of ``y_true`` is correlated with the matching row of ``y_pred``
    and the per-row coefficients are averaged.

    Predictions are assumed to be non-constant within each row (a constant
    row has zero variance, so its correlation is undefined).

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` do not have the same shape.

    Source: https://www.kaggle.com/code/xiafire/lb-t15-msci-multiome-catboostregressor#Predicting
    """
    # Guard clause: the row-by-row pairing below only makes sense for equal shapes.
    if not (y_true.shape == y_pred.shape):
        raise ValueError("Shapes are different.")

    n_rows = len(y_true)
    # np.corrcoef returns a 2x2 matrix; the off-diagonal entry is the
    # correlation between the two rows.
    total = sum(
        np.corrcoef(y_true[row], y_pred[row])[1, 0] for row in range(n_rows)
    )
    return total / n_rows

In [3]:
# Experiment configuration: input/target file locations and the random seed
# shared by the SVD and the TabNet model.
# NOTE(review): "output_dir" is not read by any visible cell, and its name
# refers to "krr-rbf-exp" rather than this TabNet run - confirm it is intended.
config = dict(
    output_dir="/scratch/st-jiaruid-1/shenoy/projects/scRNA-competition/output/krr-rbf-exp",
    paths=dict(
        x="/scratch/st-jiaruid-1/shenoy/svd-comp/train_input_multiome_svd128.pkl",
        y="/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_multi_targets_values.sparse.npz",
        x_test="/scratch/st-jiaruid-1/shenoy/svd-comp/test_input_multiome_svd128.pkl",
    ),
    seed=42,
)

In [4]:
# Load Data
%time
x_train_transformed = pickle.load(open(config["paths"]["x"], "rb"))

CPU times: user 1 µs, sys: 1 µs, total: 2 µs
Wall time: 4.77 µs


In [5]:
%time
x_test_transformed = pickle.load(open(config["paths"]["x_test"], "rb"))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


In [6]:
%time
y = scipy.sparse.load_npz(config["paths"]["y"])

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [7]:
# Compress the sparse target matrix to a single SVD component; the model is
# trained on this reduced target and predictions are later projected back
# through `pca_y.components_`.
pca_y = TruncatedSVD(n_components=1, random_state=config["seed"])
y_transformed = pca_y.fit_transform(y)

#### TabNet

In [9]:
# Keyword arguments forwarded to TabNetRegressor.
tabnet_params = {
    # Network architecture.
    "n_d": 16,
    "n_a": 16,
    "n_steps": 8,
    "gamma": 1.3,
    "lambda_sparse": 0,
    # Optimizer.
    "optimizer_fn": optim.Adam,
    "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
    "mask_type": "entmax",
    # Learning-rate schedule.
    "scheduler_params": {
        "mode": "min",
        "patience": 5,
        "min_lr": 1e-5,
        "factor": 0.9,
    },
    "scheduler_fn": ReduceLROnPlateau,
    # Reproducibility and logging.
    "seed": config["seed"],
    "verbose": 10,
}

In [10]:
# Build the TabNet regressor from the hyperparameters in `tabnet_params`.
model = TabNetRegressor(**tabnet_params)



In [None]:
# Train on the full training set against the SVD-reduced targets.
# No eval_set is passed, so no validation metric is tracked during training.
# NOTE(review): the ReduceLROnPlateau scheduler configured in `tabnet_params`
# presumably needs a monitored validation metric to step - confirm it has any
# effect when fit() is called without an eval_set.
model.fit(x_train_transformed, y_transformed)



epoch 0  | loss: 18661.50942|  0:00:16s
epoch 10 | loss: 538.2967|  0:02:56s
epoch 20 | loss: 452.81091|  0:05:46s


In [None]:
# Project the reduced-space predictions back to the original target space via
# the SVD components, then score them against the densified targets.
# Note: this is computed on the training data, so it is a training-set score.
score = correlation_score(
    y_true=y.toarray(),
    y_pred=np.matmul(model.predict(x_train_transformed), pca_y.components_),
)

In [None]:
# Report the training-set correlation score computed above.
print(f"Score is {score}")