Because the datasets are SO large (especially the Multiome dataset), running both parts of the project in one notebook risks Kaggle running out of storage space and then resetting all progress. It is therefore more convenient to separate the Multiome and CITEseq parts of the project, and later merge the predicted outputs from the two parts together.

This notebook concerns itself with the multiome portion.

# First, all the basic imports and file names (which may or may not be used) are loaded, essentially as a header

In [1]:
# Install the `tables` package (PyTables). Presumably needed to read the competition's
# .h5 files with pandas — TODO confirm; the cells below only load .npz and .parquet files.
!pip install tables

[0m

In [2]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse

In [3]:
# Root of the official competition data.
DATA_DIR = "/kaggle/input/open-problems-multimodal/"
# Prefix of the companion dataset holding the same matrices in sparse .npz form
# (a relative path, so it is resolved against the notebook's working directory).
_SPARSE_PREFIX = "../input/multimodal-single-cell-as-sparse-matrix/"


def _competition_path(file_name):
    """Return the path of `file_name` inside the competition data directory."""
    return os.path.join(DATA_DIR, file_name)


FP_CELL_METADATA = _competition_path("metadata.csv")

# CITEseq files (not used by this notebook's multiome cells).
FP_CITE_TRAIN_INPUTS = _competition_path("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = _competition_path("train_cite_targets.h5")
FP_CITE_TEST_INPUTS = _competition_path("test_cite_inputs.h5")

# Multiome files in their original HDF5 form.
FP_MULT_TRAIN_INPUTS = _competition_path("train_multi_inputs.h5")
FP_MULT_TRAIN_TARGETS = _competition_path("train_multi_targets.h5")
FP_MULT_TEST_INPUTS = _competition_path("test_multi_inputs.h5")

# Multiome files in sparse form: "*_idxcol.npz" carries the row/column labels,
# "*_values.sparse.npz" carries the matrix values.
FP_MULT_TRAIN_TARGETS_idx = _SPARSE_PREFIX + "train_multi_targets_idxcol.npz"
FP_MULT_TRAIN_TARGETS_sparse = _SPARSE_PREFIX + "train_multi_targets_values.sparse.npz"
FP_MULT_TRAIN_INPUTS_idx = _SPARSE_PREFIX + "train_multi_inputs_idxcol.npz"
FP_MULT_TRAIN_INPUTS_sparse = _SPARSE_PREFIX + "train_multi_inputs_values.sparse.npz"
FP_MULT_TEST_INPUTS_idx = _SPARSE_PREFIX + "test_multi_inputs_idxcol.npz"
FP_MULT_TEST_INPUTS_sparse = _SPARSE_PREFIX + "test_multi_inputs_values.sparse.npz"

# Submission scaffolding.
FP_SUBMISSION = _competition_path("sample_submission.csv")
FP_EVALUATION_IDS = _competition_path("evaluation_ids.csv")

FP_EVALUATION_IDS_parquet = _SPARSE_PREFIX + "evaluation.parquet"


# Multiome

Now the multiome portion begins

Model from: https://www.kaggle.com/code/xiafire/msci-multiome-5-steps-x-5-folds-25-models

## Load in the data

In [4]:
%%time
# 1 min 54 s

train_inputs = scipy.sparse.load_npz(FP_MULT_TRAIN_INPUTS_sparse)
train_inputs = train_inputs.astype('float16', copy=False)

train_targets = scipy.sparse.load_npz(FP_MULT_TRAIN_TARGETS_sparse)

test_inputs = scipy.sparse.load_npz(FP_MULT_TEST_INPUTS_sparse)


train_target_cols = np.load(FP_MULT_TRAIN_TARGETS_idx,
                    allow_pickle=True)["columns"]

test_input_rows = np.load(FP_MULT_TEST_INPUTS_idx,
                  allow_pickle=True)["index"]

CPU times: user 50.8 s, sys: 3.88 s, total: 54.7 s
Wall time: 1min 51s


## Perform SVD

In [5]:
num_components = 32 # n_components for both TruncatedSVD reductions (inputs and targets)
random_state_num = 4060 # seed shared by the TruncatedSVD fits, np.random.seed and KFold

In [6]:
%%time
# 34 minutes 24 seconds

# reduce the dimensionality of the training inputs
# NOTE: despite the `pca_` prefix, this estimator is a TruncatedSVD, not a PCA.
pca_inputs = TruncatedSVD(n_components=num_components, random_state=random_state_num)
# `train_inputs` is overwritten by its reduced form, so re-running this cell
# would fit the SVD on the already-reduced array instead of the loaded data.
train_inputs = pca_inputs.fit_transform(train_inputs)

# Sum of the per-component explained variance ratios.
print(pca_inputs.explained_variance_ratio_.sum())

0.008240384
CPU times: user 7min 3s, sys: 8.66 s, total: 7min 11s
Wall time: 7min 5s


In [7]:
%%time
# 5 minutes 58 seconds

# Reduce the dimensionality of the training targets
# NOTE: despite the `pca_` prefix, this estimator is a TruncatedSVD, not a PCA.
pca_targets = TruncatedSVD(n_components=num_components, random_state=random_state_num)
# The reduced targets go into `train_target` (singular); the original sparse
# `train_targets` (plural) is kept because validation scoring compares against it.
train_target = pca_targets.fit_transform(train_targets)
# Sum of the per-component explained variance ratios.
print(pca_targets.explained_variance_ratio_.sum())

0.10154436
CPU times: user 1min 22s, sys: 3.89 s, total: 1min 26s
Wall time: 1min 21s


## Multiome learning model

From: https://www.kaggle.com/code/xiafire/msci-multiome-5-steps-x-5-folds-25-models

In [8]:
# Use kernel ridge regression
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
# RBF kernel object handed to KernelRidge as its kernel.
kernel = RBF(length_scale = 10)
# A single estimator instance: the training loop refits this same object for
# every model and pickles it after each fit.
krr = KernelRidge(alpha=0.2, kernel=kernel)

In [9]:
n = 5 # number of row chunks the train and test rows are split into (KFold's n_splits=5 is set separately)

The correlation score below is taken from the "MSCI CITEseq Keras Quickstart" notebook by AMBROSM.

In [10]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules.

    The Pearson correlation coefficient is computed between each row of
    ``y_true`` and the matching row of ``y_pred``; the mean over rows is returned.

    It is assumed that the predictions are not constant: a constant row has
    zero variance, for which ``np.corrcoef`` yields NaN.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_targets)
        DataFrames (subclasses included), ndarrays, ``np.matrix`` objects or
        nested lists. Both are converted to plain 2-D ndarrays first.

    Returns
    -------
    float
        The average of each sample's Pearson correlation coefficient.

    Raises
    ------
    ValueError
        If the shapes differ or there are no samples to score.
    """
    # np.asarray normalises every supported container to a base ndarray, so
    # row indexing below always yields a 1-D row (a DataFrame subclass would
    # otherwise be indexed by column label, an np.matrix row would stay 2-D).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    if len(y_true) == 0: raise ValueError("No samples to score.")
    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        # Off-diagonal entry of the 2x2 correlation matrix of the two rows.
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

In [11]:
%%time
# 47 min 59 s

# Perform training
from sklearn.model_selection import KFold
import os, gc, pickle

np.random.seed(random_state_num)
all_row_indices = np.arange(train_inputs.shape[0])
np.random.shuffle(all_row_indices)

kf = KFold(n_splits=5, shuffle=True, random_state=random_state_num)

index = 0
score = []

d = train_inputs.shape[0]//n
for i in range(0, n*d, d):
    print(f'start [{i}:{i+d}]')
    ind = all_row_indices[i:i+d]    
    for idx_tr, idx_va in kf.split(ind):
        X = train_inputs[ind]
        Y = train_target[ind] #.todense()
        Yva = train_targets[ind][idx_va]
        Xtr, Xva = X[idx_tr], X[idx_va]
        Ytr = Y[idx_tr]
        del X, Y
        gc.collect()
        
        print('Train...')
        model = krr #Ridge(copy_X=False)
        model.fit(Xtr, Ytr)
        del Xtr, Ytr
        gc.collect()
        
        s = correlation_score(Yva.todense(), model.predict(Xva)@pca_targets.components_)
        score.append(s)
        print(index, s)
        del Xva, Yva
        gc.collect()
        
        pkl_filename = f"model{index:02d}.pkl"
        index += 1
        with open(pkl_filename, 'wb') as file:
            pickle.dump(model, file)
    gc.collect()

start [0:21188]
Train...
0 0.6682822039680775
Train...
1 0.6685423789226382
Train...
2 0.6683247290583874
Train...
3 0.6674992456251114
Train...
4 0.6681502989691507
start [21188:42376]
Train...
5 0.6677000872544926
Train...
6 0.6669404886844817
Train...
7 0.6690231110314144
Train...
8 0.6677712568433687
Train...
9 0.6680094590291245
start [42376:63564]
Train...
10 0.6680071266413165
Train...
11 0.667344241699215
Train...
12 0.6669496109615377
Train...
13 0.6679260865085037
Train...
14 0.6678491788531686
start [63564:84752]
Train...
15 0.6678259170413687
Train...
16 0.6683554384880745
Train...
17 0.6660845812373106
Train...
18 0.6679400901607607
Train...
19 0.6686431731628043
start [84752:105940]
Train...
20 0.6683838962894157
Train...
21 0.6684722507866451
Train...
22 0.6676297959777407
Train...
23 0.6691010819157034
Train...
24 0.670624865461368
CPU times: user 37min 52s, sys: 2min 44s, total: 40min 37s
Wall time: 13min 48s


In [12]:
# Delete variables to free up space: the fitted models were pickled to disk,
# so the training arrays are not needed for prediction.
del train_inputs # Multiome training data, has SVD applied (inputs)
del train_targets # Multiome training data, no SVD applied (targets)
del train_target # Multiome training data after SVD (targets)
# As the last expression, gc.collect()'s return value is shown as the cell output.
gc.collect()

21

## Predictions for multiome

In [13]:
%%time
# 1 min 12 s

# For test inputs, match the pca transform of the training inputs
# (the TruncatedSVD fitted on the training inputs is reused; nothing is refit).
# `test_inputs` is overwritten by its reduced form, so this cell should not be
# re-run without reloading the data.
# NOTE(review): the training inputs were cast to float16 before the SVD was
# fitted, while test_inputs is transformed without that cast — confirm this
# asymmetry is intended.
test_inputs = pca_inputs.transform(test_inputs)

CPU times: user 9.72 s, sys: 52.5 ms, total: 9.77 s
Wall time: 9.77 s


In [14]:
%%time
# less than a second

test_len = test_inputs.shape[0]
d = test_len//n
x = []
for i in range(n):
    x.append(test_inputs[i*d:i*d+d])
    
del test_inputs
gc.collect()

CPU times: user 111 ms, sys: 952 µs, total: 112 ms
Wall time: 111 ms


42

In [15]:
%%time
# 1 h 34 min 47 s

preds = np.zeros((test_len, 23418), dtype='float16')
for i,xx in enumerate(x):
    for ind in range(index): # index gives the number of models (25)
        print(ind, end=' ')
        with open(f'model{ind:02}.pkl', 'rb') as file:
            model = pickle.load(file)
        preds[i*d:i*d+d,:] += (model.predict(xx)@pca_targets.components_)/index
        gc.collect()
    print('')
    del xx
gc.collect()

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
CPU times: user 18min 2s, sys: 2min 48s, total: 20min 50s
Wall time: 14min 56s


0

In [16]:
# The chunked test inputs are no longer needed once `preds` has been filled.
del x
gc.collect()

21

In [17]:
%%time
# 4.38 s
# Persist the full prediction matrix to preds.npy in the working directory.
np.save('preds.npy', preds) # save predictions

CPU times: user 3.05 ms, sys: 1.52 s, total: 1.52 s
Wall time: 1.52 s


In [18]:
%%time
# less than a minute
# `preds` is already created with dtype float16 in the prediction cell, so this
# cast leaves the values unchanged; the cell mainly displays the array.
preds = preds.astype('float16', copy=False)
preds

CPU times: user 25 µs, sys: 2 µs, total: 27 µs
Wall time: 31 µs


array([[0.7593, 0.3389, 0.2305, ..., 1.016 , 1.483 , 2.32  ],
       [0.6523, 0.385 , 0.4346, ..., 1.4   , 1.399 , 2.318 ],
       [0.6807, 0.2515, 0.4285, ..., 0.9033, 1.198 , 1.868 ],
       ...,
       [0.274 , 0.2947, 0.3638, ..., 1.141 , 0.62  , 2.166 ],
       [0.3513, 0.3254, 0.327 , ..., 1.295 , 1.12  , 2.47  ],
       [0.6416, 0.462 , 0.202 , ..., 1.312 , 4.203 , 3.256 ]],
      dtype=float16)

## Submission format and save

In [19]:
%%time
# 42 seconds

# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet(FP_EVALUATION_IDS_parquet)
# Convert the string columns to more efficient categorical types
#eval_ids.cell_id = eval_ids.cell_id.apply(lambda s: int(s, base=16))
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

CPU times: user 23.9 s, sys: 4.7 s, total: 28.6 s
Wall time: 28.4 s


In [20]:
# Prepare an empty series which will be filled with predictions.
# No data is passed, so every entry starts out as NaN. The MultiIndex is built
# from all columns of eval_ids (row_id, cell_id, gene_id in the displayed output).
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [21]:
%%time
# 34.5 ms

cell_dict = dict((k,v) for v,k in enumerate(test_input_rows)) 
assert len(cell_dict)  == len(test_input_rows)

gene_dict = dict((k,v) for v,k in enumerate(train_target_cols))
assert len(gene_dict) == len(train_target_cols)

CPU times: user 23.1 ms, sys: 1.01 ms, total: 24.1 ms
Wall time: 24 ms


In [22]:
%%time
# 2.92 s

# Translate each evaluation row's cell/gene id into its row/column position in
# `preds`; ids absent from the Multiome lookup tables get the sentinel -1.
eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))

# Keep only the rows whose cell AND gene are both known to the Multiome data.
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)

CPU times: user 1.22 s, sys: 361 ms, total: 1.58 s
Wall time: 1.4 s


In [23]:
%%time
# 4.78 s

submission.iloc[valid_multi_rows] = preds[eval_ids_cell_num[valid_multi_rows].to_numpy(),
eval_ids_gene_num[valid_multi_rows].to_numpy()]

CPU times: user 1.39 s, sys: 281 ms, total: 1.67 s
Wall time: 1.67 s


In [24]:
# The id mappings and masks are no longer needed now that `submission` is filled.
del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_input_rows, train_target_cols
gc.collect()

55

In [25]:
# Inspect the filled series: rows that were not matched to a Multiome
# prediction were never assigned and therefore remain NaN.
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86                    NaN
1         c2150f55becb  CD274                   NaN
2         c2150f55becb  CD270                   NaN
3         c2150f55becb  CD155                   NaN
4         c2150f55becb  CD112                   NaN
                                             ...   
65744175  2c53aa67933d  ENSG00000134419    6.183594
65744176  2c53aa67933d  ENSG00000186862    0.044525
65744177  2c53aa67933d  ENSG00000170959    0.039185
65744178  2c53aa67933d  ENSG00000107874    1.047852
65744179  2c53aa67933d  ENSG00000166012    5.597656
Name: target, Length: 65744180, dtype: float32

In [26]:
# Preview only: the result is displayed but not stored, so `submission` is unchanged here.
submission.reset_index()

Unnamed: 0,row_id,cell_id,gene_id,target
0,0,c2150f55becb,CD86,
1,1,c2150f55becb,CD274,
2,2,c2150f55becb,CD270,
3,3,c2150f55becb,CD155,
4,4,c2150f55becb,CD112,
...,...,...,...,...
65744175,65744175,2c53aa67933d,ENSG00000134419,6.183594
65744176,65744176,2c53aa67933d,ENSG00000186862,0.044525
65744177,65744177,2c53aa67933d,ENSG00000170959,0.039185
65744178,65744178,2c53aa67933d,ENSG00000107874,1.047852


In [27]:
%%time

submission = submission.reset_index().drop(['cell_id', 'gene_id'], axis=1)
submission

CPU times: user 1.68 s, sys: 421 ms, total: 2.1 s
Wall time: 2.1 s


Unnamed: 0,row_id,target
0,0,
1,1,
2,2,
3,3,
4,4,
...,...,...
65744175,65744175,6.183594
65744176,65744176,0.044525
65744177,65744177,0.039185
65744178,65744178,1.047852


In [28]:
%%time

submission.set_index('row_id', inplace=True)
submission


CPU times: user 675 µs, sys: 0 ns, total: 675 µs
Wall time: 687 µs


Unnamed: 0_level_0,target
row_id,Unnamed: 1_level_1
0,
1,
2,
3,
4,
...,...
65744175,6.183594
65744176,0.044525
65744177,0.039185
65744178,1.047852


In [29]:
%%time
# Approximately 2 minutes

# save as csv
# Writes the row_id index and the target column. Rows never filled with a
# Multiome prediction are still NaN at this point.
# The "32" in the file name presumably refers to num_components — TODO confirm.
submission.to_csv('multiome_only_32.csv')

CPU times: user 1min 31s, sys: 2.22 s, total: 1min 33s
Wall time: 1min 33s
