In [1]:
import os
import pickle
import numpy as np
import scipy
from tqdm import tqdm
from datetime import datetime
from utils import correlation_score

  from .autonotebook import tqdm as notebook_tqdm


In [76]:
# Directory of the training run whose fold models are ensembled below.
base_path = 'output/tabnet-cite-imp-kfold/13_10_2022-15_19'

# One pickled model per CV fold. The paths are derived from base_path so that
# pointing the notebook at another run means editing a single line instead of six.
model_paths = [f'{base_path}/model_{fold}th_fold.pkl' for fold in range(5)]

# Pickled decomposition whose components_ are applied to the model outputs.
pca_path = f'{base_path}/pca_y.pkl'

# Training inputs (pickle), training targets (sparse .npz) and the archive whose
# "index" entry is matched against the validation ids further down.
x_path = '/scratch/st-jiaruid-1/shenoy/svd-comp/train_input_cite_200_mod.pkl'
y_path = '/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_cite_targets_values.sparse.npz'
x_cols_path = '/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_cite_targets_idxcol.npz'

# Identifiers of the rows that make up the validation set.
val_set_indices_path = '/arc/project/st-jiaruid-1/yinian/multiome/cite_val_set_5000_0.75_day_7.npy'

#### Load PCA

In [3]:
# Unpickle the decomposition object; its components_ are used when predicting.
with open(pca_path, mode='rb') as pca_file:
    pca_y = pickle.load(pca_file)

In [4]:
# Input features (pickled object; indexed below with [rows, :]).
with open(x_path, 'rb') as f:
    x = pickle.load(f)

# Targets are stored as a sparse matrix; densify them for row indexing and scoring.
y = scipy.sparse.load_npz(y_path).toarray()

# "index" entry of the idxcol archive: one identifier per row, matched against the
# validation ids in a later cell. The archive is opened in a `with` block so the
# underlying file handle is closed as soon as the list has been extracted.
with np.load(x_cols_path, allow_pickle=True) as idxcol_npz:
    x_cols = idxcol_npz["index"].tolist()

In [5]:
# Validation-set identifiers, kept as a set for fast membership tests.
val_ids_array = np.load(val_set_indices_path)
val_indices = set(val_ids_array.tolist())

In [6]:
# Row positions of the x_cols entries that belong to the validation set.
# The loop variables avoid the name `x`, which is the feature matrix.
indices_to_use = [
    row
    for row, row_id in enumerate(x_cols)
    if row_id in val_indices
]

In [7]:
# Pick the validation rows (all columns) out of both the features and the targets.
val_rows = (indices_to_use, slice(None))
x_pred = x[val_rows]
y_true = y[val_rows]

In [8]:
# Gather one prediction matrix per fold model on the validation rows.
predictions = []
for path in tqdm(model_paths):
    # Each fold model is unpickled on demand rather than kept in memory.
    with open(path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Model output times components_ gives one column per components_ column.
    # NOTE(review): no mean_ offset is added back here (unlike a PCA
    # inverse_transform) - confirm this matches how pca_y was fitted.
    reduced = model.predict(x_pred)
    predictions.append(reduced @ pca_y.components_)

  0%|          | 0/5 [00:00<?, ?it/s]Error in cpuinfo: failed to parse processor information from /proc/cpuinfo
100%|██████████| 5/5 [00:06<00:00,  1.36s/it]


In [22]:
# Turn the list of per-model matrices into one array with the model axis first.
predictions = np.stack(predictions, axis=0)

#### Prepare Stacking Model

In [54]:
from functools import partial
from scipy.optimize import fmin

# Weight-optimisation approach taken from https://www.youtube.com/watch?v=TuIgtitqJho
class OptimizePCC:
    """Blend stacked model predictions with weights tuned on ``correlation_score``.

    ``X`` is a 3-D stack whose first axis enumerates the base models
    (``(n_models, n_rows, n_cols)``); a blend is the weighted sum over that
    axis. ``fit`` searches the weights with ``scipy.optimize.fmin``
    (Nelder-Mead), minimising the negated score.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for the random Dirichlet starting point of the search. ``None``
        keeps the previous behaviour of drawing from NumPy's global random
        state, so results then depend on ``np.random.seed``.

    Attributes
    ----------
    coef_ : numpy.ndarray of shape (n_models,) or None
        Weights found by ``fit``; ``None`` until ``fit`` has been called.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        self.coef_ = None

    @staticmethod
    def _blend(coef, X):
        """Return the weighted sum of ``X`` over its first (model) axis."""
        # (n_models,) -> (n_models, 1, 1) so each weight scales a whole matrix.
        weights = np.reshape(coef, (-1, 1, 1))
        return np.sum(X * weights, axis=0)

    def _pcc(self, coef, X, y):
        """Objective for the optimiser: negated score of the blend against ``y``."""
        return -1.0 * correlation_score(y, self._blend(coef, X))

    def fit(self, X, y):
        """Search blending weights for ``X`` that maximise the score against ``y``.

        Parameters
        ----------
        X : array of shape (n_models, n_rows, n_cols)
            Stacked predictions of the base models.
        y : array of shape (n_rows, n_cols)
            Ground truth passed to ``correlation_score``.

        Returns
        -------
        OptimizePCC
            ``self``, with ``coef_`` set to the optimised weights.
        """
        if self.random_state is None:
            rng = np.random  # global state, as before
        else:
            rng = np.random.RandomState(self.random_state)
        # Random point on the simplex: non-negative weights that sum to one.
        init_coef = rng.dirichlet(np.ones(X.shape[0]))
        partial_loss = partial(self._pcc, X=X, y=y)
        self.coef_ = fmin(partial_loss, init_coef, disp=True)
        return self

    def predict(self, X):
        """Blend ``X`` (``(n_models, n_rows, n_cols)``) with the fitted weights.

        Raises
        ------
        RuntimeError
            If called before ``fit`` (no weights available yet).
        """
        if self.coef_ is None:
            raise RuntimeError(
                "OptimizePCC.predict() called before fit(); no blending weights available."
            )
        return self._blend(self.coef_, X)

In [55]:
from sklearn.model_selection import KFold

In [56]:
# Shuffled 5-way splitter with a fixed seed, so the same folds come out on every run.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Fit blending weights on each CV split of the validation predictions and
# score them on the held-out part of that split.
# The optimiser starts from a random Dirichlet draw; seed it so the weights are
# reproducible on a re-run (42 matches the seed given to the KFold splitter).
np.random.seed(42)

coeffs = []
n_rows = y_true.shape[0]
# The loop variables are NOT called `val_indices`: that name holds the set of
# validation ids loaded earlier, and overwriting it would break a re-run of the
# cell that builds `indices_to_use`.
for i, (fold_tr_idx, fold_val_idx) in enumerate(kf.split(np.arange(n_rows))):
    # predictions is (n_models, n_rows, n_cols): slice rows on axis 1
    x_train, y_train = predictions[:, fold_tr_idx, :], y_true[fold_tr_idx, :]
    x_val, y_val = predictions[:, fold_val_idx, :], y_true[fold_val_idx, :]

    # optimise the blending weights on the training part of the split
    opt = OptimizePCC()
    opt.fit(x_train, y_train)

    # score the blend on the held-out part
    score = correlation_score(y_val, opt.predict(x_val))
    print(f'For fold: {i}, score: {score}')

    # keep this split's weights for averaging later
    coeffs.append(opt.coef_)

Optimization terminated successfully.
         Current function value: -0.892352
         Iterations: 147
         Function evaluations: 244
For fold: 0, score: 0.8890774085574681
Optimization terminated successfully.
         Current function value: -0.891585
         Iterations: 104
         Function evaluations: 178
For fold: 1, score: 0.892133064684909
Optimization terminated successfully.
         Current function value: -0.891590
         Iterations: 178
         Function evaluations: 286
For fold: 2, score: 0.8921251203805505
Optimization terminated successfully.
         Current function value: -0.891849
         Iterations: 169
         Function evaluations: 273
For fold: 3, score: 0.8910652699582885


##### x_test prediction

In [66]:
# Element-wise average of the per-fold weight vectors: one weight per base model.
# NOTE(review): only folds that actually finished are in `coeffs` (the saved output
# above shows scores for folds 0-3 only) - confirm all splits are included.
final_coeffs = np.mean(coeffs, axis=0)
print(final_coeffs)

[0.16880088 0.07555048 0.11631216 0.12215153 0.12251181]


In [67]:
# Pickled test inputs fed to the fold models below.
# NOTE(review): the file name pattern differs from the training input
# (..._cite_200_mod vs ..._cite_svd200_mod) - confirm both come from the same transform.
x_test_path = '/scratch/st-jiaruid-1/shenoy/svd-comp/test_input_cite_svd200_mod.pkl'
with open(x_test_path, mode='rb') as test_file:
    x_test = pickle.load(test_file)

In [68]:
# Gather one prediction matrix per fold model on the test inputs.
test_predictions = []
for path in tqdm(model_paths):
    # Each fold model is unpickled on demand rather than kept in memory.
    with open(path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Same projection as for the validation rows: model output times components_.
    reduced = model.predict(x_test)
    test_predictions.append(reduced @ pca_y.components_)

100%|██████████| 5/5 [01:01<00:00, 12.32s/it]


In [69]:
# Turn the list of per-model matrices into one array with the model axis first.
test_predictions = np.stack(test_predictions, axis=0)

In [70]:
# Shape the weights as (n_models, 1, 1) so they broadcast over each model's matrix.
final_coeffs_ = final_coeffs.reshape(-1, 1, 1)

In [71]:
# Weighted blend of the per-model test predictions: scale each model's matrix by
# its weight and sum over the model axis.
# Both steps are done in one statement, behind a shape check, because the result
# is stored back into `test_predictions`: with separate multiply / sum cells a
# re-run silently applies the weights twice or sums over the wrong axis.
if np.ndim(test_predictions) != 3:
    raise ValueError(
        "test_predictions is not a (n_models, n_rows, n_cols) stack; it was probably "
        "blended already - re-run the cells that build and stack the test predictions."
    )
test_predictions = np.sum(final_coeffs_ * test_predictions, axis=0)

In [78]:
# Write the blended test predictions into the run directory as a pickle.
pkl_filename = os.path.join(base_path, 'ensembled_test_pred.pkl')
with open(pkl_filename, mode="wb") as out_file:
    pickle.dump(test_predictions, out_file)