# CatBoost solution

## Imports

In [None]:
import gc
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from colorama import Fore, Back, Style
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.multioutput import MultiOutputRegressor

# Install a global 'ignore' filter so library warnings do not clutter the output.
warnings.filterwarnings('ignore')

## Config

In [None]:
# --- Run switches -----------------------------------------------------------
# MAKE_PCA: rebuild the SVD features from the raw inputs instead of loading
#           the cached .npy feature matrices.
# SAVE:     flag intended to control whether predictions are written to disk.
MAKE_PCA = False
SAVE = True

# --- Locations --------------------------------------------------------------
# Data directory and the directory holding the column-list text files.
PATH = Path('/content/drive/MyDrive/cite')
COLUMNS_PATH = Path('/content/drive/MyDrive/cite')

# --- Reproducibility / dimensionality ---------------------------------------
RANDOM_STATE = 1
# NOTE(review): this value is passed to TruncatedSVD(n_components=...) when
# MAKE_PCA is True; None does not look like a valid setting there, so set an
# integer before enabling MAKE_PCA. The cached file names ("..._84_128.npy")
# hint at 128 -- TODO confirm.
N_COMPONENTS = None

# --- Keyword arguments forwarded to CatBoostRegressor -----------------------
PARAMS = dict(
    depth=7,
    loss_function='RMSE',
    task_type='GPU',
    iterations=800,
    allow_const_label=True,
    random_state=RANDOM_STATE,
    verbose=0,
)

## Competition metric

In [None]:
def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation between two 2-D arrays.

    Each row of ``y_true`` is correlated with the matching row of ``y_pred``
    and the per-row coefficients are averaged.

    Args:
        y_true: Ground-truth values; a 2-D ``numpy`` array or a
            ``pandas.DataFrame`` (including subclasses).
        y_pred: Predicted values with the same shape as ``y_true``.

    Returns:
        The average of the per-row correlation coefficients. A row with zero
        variance yields ``nan`` from ``np.corrcoef`` and therefore makes the
        overall result ``nan``.

    Raises:
        ValueError: If the two inputs have different shapes.
    """
    # isinstance (rather than an exact type comparison) so DataFrame
    # subclasses are converted too; otherwise ``frame[i]`` below would select
    # a column by label instead of a row.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes are different.")

    row_corrs = [
        np.corrcoef(true_row, pred_row)[1, 0]
        for true_row, pred_row in zip(y_true, y_pred)
    ]
    return sum(row_corrs) / len(y_true)

## Prepare data

In [None]:
# Load the cell metadata and keep only the CITE-seq rows.
metadata_df = pd.read_csv(PATH / 'metadata.csv', index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology == "citeseq"]

# Load the training targets. The index is used to build `meta` before the
# frame is reduced to a bare array: the training loop groups folds by
# `meta.donor`, and without this `meta` would only exist when MAKE_PCA is
# True.
# NOTE(review): assumes the cached feature matrices are stored in the same
# row order as the targets file (the training loop already relies on X and Y
# being row-aligned) -- confirm.
targets_df = pd.read_hdf(PATH / 'train_cite_targets.h5')
meta = metadata_df.reindex(targets_df.index)
Y = targets_df.values
del targets_df

# Standardise every row in place: subtract the row mean, divide by the row
# standard deviation.
Y -= Y.mean(axis=1).reshape(-1, 1)
Y /= Y.std(axis=1).reshape(-1, 1)

In [None]:
if MAKE_PCA:
    # Build the feature matrices from the raw inputs: a truncated SVD fitted
    # on train and test stacked together, concatenated with the raw values
    # of the hand-picked "important" columns.
    important_cols = np.loadtxt(COLUMNS_PATH / 'important_cols_v2.txt',
                                dtype='str')
    constant_cols = np.loadtxt(COLUMNS_PATH / 'constant_cols.txt', dtype='str')

    X = pd.read_hdf(PATH / 'train_cite_inputs.h5').drop(columns=constant_cols)
    meta = metadata_df.reindex(X.index)
    X0 = X[important_cols].values
    X = sparse.csr_matrix(X.values)
    # Remember the train row count so the stacked SVD output can be split
    # back correctly whatever the size of the training set.
    n_train = X.shape[0]
    gc.collect()

    X_test = pd.read_hdf(PATH /
                         'test_cite_inputs.h5').drop(columns=constant_cols)
    meta_test = metadata_df.reindex(X_test.index)
    X0_test = X_test[important_cols].values
    X_test = sparse.csr_matrix(X_test.values)

    # Fit the decomposition on train + test together, then split by row.
    both = sparse.vstack([X, X_test])
    svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=RANDOM_STATE)
    both = svd.fit_transform(both)

    X = both[:n_train]
    X_test = both[n_train:]
    del both
    gc.collect()

    # Append the raw important columns to the SVD components.
    X = np.hstack([X, X0])
    X_test = np.hstack([X_test, X0_test])
    del X0, X0_test
    gc.collect()
else:
    # Load previously computed feature matrices.
    X = np.load(PATH / 'train_84_128.npy')
    X_test = np.load(PATH / 'test_84_128.npy')

## Training

In [None]:
# Cross-validated training: one multi-output CatBoost model per fold, with
# folds grouped by donor so no donor appears in both train and validation.
losses = []
corrscores = []
# Out-of-fold predictions for the training rows.
pred_train = np.zeros((Y.shape[0], Y.shape[1]))
# Test predictions accumulated over folds; sized from Y rather than a
# hard-coded target count so it stays consistent with pred_train.
test_pred = np.zeros((len(X_test), Y.shape[1]))

model = MultiOutputRegressor(CatBoostRegressor(**PARAMS))
kf = GroupKFold(n_splits=3)
for fold, (idx_train, idx_val) in enumerate(kf.split(X, groups=meta.donor)):
    X_train = X[idx_train]
    y_train = Y[idx_train]
    X_val = X[idx_val]
    y_val = Y[idx_val]

    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    pred_train[idx_val] = y_val_pred
    # NOTE: fold predictions are summed, not averaged. A constant positive
    # scale does not change a Pearson correlation, so correlation_score is
    # unaffected, but the saved values are n_splits times an average.
    test_pred += model.predict(X_test)

    # Score this fold on its held-out rows.
    mse = mean_squared_error(y_val, y_val_pred)
    corrscore = correlation_score(y_val, y_val_pred)

    print(f"Fold {fold}: mse = {mse:.5f}, corr =  {corrscore:.5f}")
    losses.append(mse)
    corrscores.append(corrscore)

    del X_train, y_train, X_val, y_val
    gc.collect()

print(
    f"Average  mse = {np.mean(losses):.5f};corr = {np.mean(corrscores):.5f}+-{np.std(corrscores):.5f}"
)

Fold 0: mse = 0.21382, corr =  0.88656
Fold 1: mse = 0.20546, corr =  0.89122
Fold 2: mse = 0.19081, corr =  0.89933
Average  mse = 0.20336;corr = 0.89237+-0.00646


## Save

In [None]:
# Write the out-of-fold and test predictions to disk when saving is enabled.
# Uses the SAVE flag from the config cell; no `CFG` object is defined in this
# notebook.
if SAVE:
    np.save('oof.npy', pred_train)
    np.save('test_preds.npy', test_pred)