In [27]:
import os
import gc
import pickle
import numpy as np
import scipy
from tqdm import tqdm
from datetime import datetime
from utils import correlation_score

In [28]:
base_path = 'output/tabnet-multiome-imp-kfold/13_10_2022-15_19/'
model_paths = [
    'model_0th_fold.pkl',
    'model_1th_fold.pkl',
    'model_2th_fold.pkl',
    'model_3th_fold.pkl',
    'model_4th_fold.pkl',    
]

pca_path = 'output/tabnet-multiome-imp-kfold/13_10_2022-15_19/pca_y.pkl'
x_path = '/scratch/st-jiaruid-1/shenoy/svd-comp/train_input_multiome_svd128.pkl'
y_path = '/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_multi_targets_values.sparse.npz'
x_cols_path = '/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_multi_targets_idxcol.npz'
val_set_indices_path = '/arc/project/st-jiaruid-1/yinian/multiome/multi_val_set_10000_0.75_day_7.npy'

#### Load PCA

In [3]:
# load pca
with open(pca_path, 'rb') as f:
    pca_y = pickle.load(f)

In [4]:
# load x features
with open(x_path, 'rb') as f:
    x = pickle.load(f)
    
y = scipy.sparse.load_npz(y_path).toarray()
    
x_cols = np.load(x_cols_path, allow_pickle=True)["index"].tolist()

In [5]:
val_indices = set(np.load(val_set_indices_path).tolist())

In [6]:
indices_to_use = [i for i, x in enumerate(x_cols) if x in val_indices]

In [7]:
x_pred = x[indices_to_use, :]
y_true = y[indices_to_use, :]

In [8]:
del x, y, x_cols, val_indices, indices_to_use

In [9]:
predictions = []
for path in tqdm(model_paths):
    # load model
    with open(os.path.join(base_path, path), 'rb') as f:
        model = pickle.load(f)
    
    # store predictions
    predictions.append(model.predict(x_pred) @ pca_y.components_)

  0%|          | 0/5 [00:00<?, ?it/s]Error in cpuinfo: failed to parse processor information from /proc/cpuinfo
100%|██████████| 5/5 [00:18<00:00,  3.61s/it]


In [10]:
del model

In [11]:
gc.collect()

23321

In [12]:
predictions = np.stack(predictions)

#### Prepare Stacking Model

In [15]:
from functools import partial
from scipy.optimize import fmin
from sklearn.model_selection import KFold

# using https://www.youtube.com/watch?v=TuIgtitqJho
class OptimizePCC:
    def __init__(self):
        self.coef_ = 0
        
    def _pcc(self, coef, X, y):
        coef = np.reshape(coef, (coef.shape[0], 1, 1))
        x_coef = X * coef
        predictions = np.sum(x_coef, axis=0)
        pcc = correlation_score(y, predictions)
        return -1.0 * pcc
    
    def fit(self, X, y):
        partial_loss = partial(self._pcc, X=X, y=y)
        init_coef = np.random.dirichlet(np.ones(X.shape[0]))
        self.coef_ = fmin(partial_loss, init_coef, disp=True)
    
    def predict(self, X):
        coef = self.coef_
        coef = np.reshape(coef, (coef.shape[0], 1, 1))
        x_coef = X * coef
        return np.sum(x_coef, axis=0)

In [16]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
coeffs = []
for i, (tr_indices, val_indices) in enumerate(kf.split(list(range(y_true.shape[0])))):
    # get x_train and y_train
    x_train, y_train = predictions[:, tr_indices, :], y_true[tr_indices, :]
    x_val, y_val = predictions[:, val_indices, :], y_true[val_indices, :]
    
    # opt
    opt = OptimizePCC()
    opt.fit(x_train, y_train)
    
    # check score
    score = correlation_score(y_val, opt.predict(x_val))
    print (f'For fold: {i}, score: {score}')
    
    # store coeff
    coeffs.append(opt.coef_)

Optimization terminated successfully.
         Current function value: -0.629264
         Iterations: 157
         Function evaluations: 253
For fold: 0, score: 0.6292158462063026
Optimization terminated successfully.
         Current function value: -0.629184
         Iterations: 120
         Function evaluations: 197
For fold: 1, score: 0.6295361785417961
Optimization terminated successfully.
         Current function value: -0.629517
         Iterations: 155
         Function evaluations: 262
For fold: 2, score: 0.628204269324357
Optimization terminated successfully.
         Current function value: -0.629181
         Iterations: 130
         Function evaluations: 224
For fold: 3, score: 0.6295469234727231


In [23]:
del x_train, y_train, x_val, y_val, opt, tr_indices, val_indices
gc.collect()

NameError: name 'x_train' is not defined

##### x_test prediction

In [24]:
final_coeffs = np.mean(coeffs, 0)
print (final_coeffs)

[0.13140964 0.18514668 0.11103691 0.00675475 0.12466054]


In [25]:
x_test_path = '/scratch/st-jiaruid-1/shenoy/svd-comp/test_input_multiome_svd128.pkl'
with open(x_test_path, 'rb') as f:
    x_test = pickle.load(f)

In [30]:
test_predictions = []
for path in tqdm(model_paths):
    # load model
    with open(os.path.join(base_path, path), 'rb') as f:
        model = pickle.load(f)
    
    # store predictions
    test_predictions.append(model.predict(x_test) @ pca_y.components_)

100%|██████████| 5/5 [01:39<00:00, 19.81s/it]


In [31]:
test_predictions = np.stack(test_predictions)
final_coeffs = np.reshape(final_coeffs, (final_coeffs.shape[0], 1, 1))
test_predictions = final_coeffs * test_predictions
test_predictions = np.sum(test_predictions, 0)

In [32]:
pkl_filename = os.path.join(base_path, 'ensembled_test_pred.pkl')
with open(pkl_filename, "wb") as file:
    pickle.dump(test_predictions, file)