In [1]:
import gc
import pandas as pd
import pickle
from os.path import join
import numpy as np
from datetime import datetime

## Enter your prediction paths in the next cell

In [2]:
# Locations of the pickled test-set predictions, one file per technology.
BASE='/scratch/st-jiaruid-1/shenoy/projects/scRNA-competition/output/'
prediction_paths = {
    'multiome': join(BASE, 'catboost-exp-multiome-i32-o10/03_10_2022-12_09/test_pred.pkl'),
    'cite': join(BASE, 'krr-rbf-exp/30_09_2022-23_23/test_pred.pkl')
}

# Load every prediction file into a dict keyed by technology name.
# NOTE(review): pickle.load can execute arbitrary code -- only point these
# paths at files produced by our own pipeline.
predictions = {}
for tech, path in prediction_paths.items():
    # The context manager closes the handle as soon as the object is loaded
    # instead of leaving it open until garbage collection.
    with open(path, 'rb') as pred_file:
        predictions[tech] = pickle.load(pred_file)

In [3]:
# Fail fast when a loaded prediction matrix does not have the expected shape.
expected_shapes = {
    'multiome': (55935, 23418),
    'cite': (48663, 140),
}
for tech_name, expected_shape in expected_shapes.items():
    assert predictions[tech_name].shape == expected_shape

In [4]:
%%time
# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet("/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/evaluation.parquet")
# Convert the string columns to more efficient categorical types
#eval_ids.cell_id = eval_ids.cell_id.apply(lambda s: int(s, base=16))
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

CPU times: user 17.4 s, sys: 2.27 s, total: 19.7 s
Wall time: 19.8 s


In [5]:
# Build the (initially all-NaN) float32 series that will receive the
# predictions; every column of eval_ids becomes one level of its MultiIndex.
submission = pd.Series(
    index=pd.MultiIndex.from_frame(eval_ids),
    dtype=np.float32,
    name='target',
)
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [6]:
%%time
y_columns = np.load("/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

test_index = np.load("/arc/project/st-jiaruid-1/yinian/multiome/sparse-data/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]

CPU times: user 19.1 ms, sys: 2.03 ms, total: 21.1 ms
Wall time: 42 ms


In [7]:
# Map each test-set label to its position in test_index; the length check
# fails if any label occurs more than once.
cell_dict = {label: position for position, label in enumerate(test_index)}
assert len(cell_dict) == len(test_index)

# Same lookup for the target column labels.
gene_dict = {label: position for position, label in enumerate(y_columns)}
assert len(gene_dict) == len(y_columns)

### Adding Multiome predictions

In [8]:
# Translate every evaluation row's cell / gene label into its position in the
# lookup tables; labels missing from a table are marked with -1.
eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))

# Rows whose cell AND gene were both found in the lookup tables.
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)

# .iloc is purely positional, so hand it a plain NumPy boolean mask rather
# than a labelled Series (whose index differs from submission's MultiIndex).
valid_mask = valid_multi_rows.to_numpy()

# Pick one prediction per valid row via paired (row, column) integer indices.
submission.iloc[valid_mask] = predictions['multiome'][
    eval_ids_cell_num.to_numpy()[valid_mask],
    eval_ids_gene_num.to_numpy()[valid_mask],
]
gc.collect()

0

### Adding CITE-seq predictions

In [9]:
# Flatten the CITE prediction matrix once and write it into the leading rows
# of the submission.
# NOTE(review): assumes the first rows of the evaluation table are the CITE
# rows, in the same order as the flattened matrix -- TODO confirm.
cite_flat = predictions['cite'].ravel()
submission.iloc[:len(cite_flat)] = cite_flat

In [10]:
# Replace the MultiIndex with a plain 0..n-1 integer index named 'row_id'.
submission.index = pd.RangeIndex(len(submission), name='row_id')

In [12]:
# Every row of the submission must have received a prediction.
assert submission.notna().all()

In [None]:
# Tag the file name with the current time (minute resolution) so that an
# earlier submission file is not replaced.
x = datetime.now().strftime("%d_%m_%Y-%H_%M")
BASE='/scratch/st-jiaruid-1/shenoy/projects/scRNA-competition/output/final_predictions'
submission_path = join(BASE, f"submission_{x}.csv")
submission.to_csv(submission_path)