Because the datasets are SO large (especially the Multiome dataset), instead of running both parts of the project in one notebook (and risk Kaggle running out of storage space then resetting all progress), it is more convenient to separate the multiome and citeseq parts of the project, then later merge the predicted outputs from the two parts together.

This notebook concerns itself with the multiome portion.

# First, all the basic imports and file names which may or may not be used is loaded in essentially as a header

In [4]:
!pip install tables

[0m

In [5]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse

In [31]:
# Directory of the data
DATA_DIR = "/kaggle/input/open-problems-multimodal/"
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")

FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,"test_cite_inputs.h5")

FP_MULT_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_multi_inputs.h5")
FP_MULT_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_multi_targets.h5")
FP_MULT_TEST_INPUTS = os.path.join(DATA_DIR,"test_multi_inputs.h5")

FP_MULT_TRAIN_TARGETS_idx = "../input/multimodal-single-cell-as-sparse-matrix/train_multi_targets_idxcol.npz"
FP_MULT_TRAIN_TARGETS_sparse = "../input/multimodal-single-cell-as-sparse-matrix/train_multi_targets_values.sparse.npz"
FP_MULT_TRAIN_INPUTS_idx = "../input/multimodal-single-cell-as-sparse-matrix/train_multi_inputs_idxcol.npz"
FP_MULT_TRAIN_INPUTS_sparse = "../input/multimodal-single-cell-as-sparse-matrix/train_multi_inputs_values.sparse.npz"
FP_MULT_TEST_INPUTS_idx = "../input/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_idxcol.npz"
FP_MULT_TEST_INPUTS_sparse = "../input/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_values.sparse.npz"

FP_SUBMISSION = os.path.join(DATA_DIR,"sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR,"evaluation_ids.csv")

FP_EVALUATION_IDS_parquet = "../input/multimodal-single-cell-as-sparse-matrix/evaluation.parquet"


# Multiome

Now the multiome portion begins

Model from: https://www.kaggle.com/code/xiafire/msci-multiome-5-steps-x-5-folds-25-models

## Load in the data

In [15]:
%%time
# 1 min 54 s

train_inputs = scipy.sparse.load_npz(FP_MULT_TRAIN_INPUTS_sparse)
train_inputs = train_inputs.astype('float16', copy=False)

train_targets = scipy.sparse.load_npz(FP_MULT_TRAIN_TARGETS_sparse)

test_inputs = scipy.sparse.load_npz(FP_MULT_TEST_INPUTS_sparse)


train_target_cols = np.load(FP_MULT_TRAIN_TARGETS_idx,
                    allow_pickle=True)["columns"]

test_input_rows = np.load(FP_MULT_TEST_INPUTS_idx,
                  allow_pickle=True)["index"]

CPU times: user 1min 16s, sys: 9.72 s, total: 1min 26s
Wall time: 1min 54s


## Perform SVD

In [16]:
num_components = 128 # this will be the setting for n_components
random_state_num = 4060 # random state number

In [17]:
%%time
# 34 minutes 24 seconds

# reduce the dimensionality of the training inputs
pca_inputs = TruncatedSVD(n_components=num_components, random_state=random_state_num)
train_inputs = pca_inputs.fit_transform(train_inputs)

print(pca_inputs.explained_variance_ratio_.sum())

0.011000143
CPU times: user 34min 3s, sys: 28.7 s, total: 34min 32s
Wall time: 34min 4s


In [18]:
%%time
# 5 minutes 58 seconds

# Reduce the dimensionality of the training targets
pca_targets = TruncatedSVD(n_components=num_components, random_state=random_state_num)
train_target = pca_targets.fit_transform(train_targets)
print(pca_targets.explained_variance_ratio_.sum())

0.117797665
CPU times: user 6min 2s, sys: 6.24 s, total: 6min 9s
Wall time: 5min 58s


## Multiome learning model

From: https://www.kaggle.com/code/xiafire/msci-multiome-5-steps-x-5-folds-25-models

In [19]:
# Use kernel ridge regression
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
kernel = RBF(length_scale = 10)
krr = KernelRidge(alpha=0.2, kernel=kernel)

In [20]:
n = 5 # 5 folds

In [21]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules. 
    
    It is assumed that the predictions are not constant.
    
    Returns the average of each sample's Pearson correlation coefficient"""
    if type(y_true) == pd.DataFrame: y_true = y_true.values
    if type(y_pred) == pd.DataFrame: y_pred = y_pred.values
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

In [22]:
%%time
# 47 min 59 s

# Perform training
from sklearn.model_selection import KFold
import os, gc, pickle

np.random.seed(random_state_num)
all_row_indices = np.arange(train_inputs.shape[0])
np.random.shuffle(all_row_indices)

kf = KFold(n_splits=5, shuffle=True, random_state=random_state_num)

index = 0
score = []

d = train_inputs.shape[0]//n
for i in range(0, n*d, d):
    print(f'start [{i}:{i+d}]')
    ind = all_row_indices[i:i+d]    
    for idx_tr, idx_va in kf.split(ind):
        X = train_inputs[ind]
        Y = train_target[ind] #.todense()
        Yva = train_targets[ind][idx_va]
        Xtr, Xva = X[idx_tr], X[idx_va]
        Ytr = Y[idx_tr]
        del X, Y
        gc.collect()
        
        print('Train...')
        model = krr #Ridge(copy_X=False)
        model.fit(Xtr, Ytr)
        del Xtr, Ytr
        gc.collect()
        
        s = correlation_score(Yva.todense(), model.predict(Xva)@pca_targets.components_)
        score.append(s)
        print(index, s)
        del Xva, Yva
        gc.collect()
        
        pkl_filename = f"model{index:02d}.pkl"
        index += 1
        with open(pkl_filename, 'wb') as file:
            pickle.dump(model, file)
    gc.collect()

start [0:21188]
Train...
0 0.6676727468925469
Train...
1 0.6678320190259386
Train...
2 0.6676293676935088
Train...
3 0.6668018392232067
Train...
4 0.6675136946502929
start [21188:42376]
Train...
5 0.6670241644104904
Train...
6 0.6663319766976489
Train...
7 0.6683778637199231
Train...
8 0.6670932086526736
Train...
9 0.6673672756506956
start [42376:63564]
Train...
10 0.667226487170204
Train...
11 0.666734964689098
Train...
12 0.6661706872690037
Train...
13 0.6672669159425553
Train...
14 0.6672535444828195
start [63564:84752]
Train...
15 0.6671731090709904
Train...
16 0.6676842566176582
Train...
17 0.6654245963939633
Train...
18 0.6672898039846825
Train...
19 0.6679875543340067
start [84752:105940]
Train...
20 0.6677907016223873
Train...
21 0.6677506240283801
Train...
22 0.6669268101500415
Train...
23 0.6685467253060617
Train...
24 0.6699420190388119
CPU times: user 1h 21min 27s, sys: 5min 20s, total: 1h 26min 48s
Wall time: 47min 59s


In [23]:
# Delete variables to free up space
del train_inputs # Multiome training data, has SVD applied (inputs)
del train_targets # Multiome training data, no SVD applied (targets)
del train_target # Multiome training data after SVD (targets)
gc.collect()

23

## Predictions for multiome

In [24]:
%%time
# 1 min 12 s

# For test inputs, match the pca transform of the training inputs
test_inputs = pca_inputs.transform(test_inputs)

CPU times: user 1min 12s, sys: 71 ms, total: 1min 12s
Wall time: 1min 12s


In [25]:
%%time
# less than a second

test_len = test_inputs.shape[0]
d = test_len//n
x = []
for i in range(n):
    x.append(test_inputs[i*d:i*d+d])
    
del test_inputs
gc.collect()

CPU times: user 190 ms, sys: 1.06 ms, total: 191 ms
Wall time: 190 ms


46

In [26]:
%%time
# 1 h 34 min 47 s

preds = np.zeros((test_len, 23418), dtype='float16')
for i,xx in enumerate(x):
    for ind in range(index): # index gives the number of models (25)
        print(ind, end=' ')
        with open(f'model{ind:02}.pkl', 'rb') as file:
            model = pickle.load(file)
        preds[i*d:i*d+d,:] += (model.predict(xx)@pca_targets.components_)/index
        gc.collect()
    print('')
    del xx
gc.collect()

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
CPU times: user 1h 44min 32s, sys: 7min 19s, total: 1h 51min 51s
Wall time: 1h 34min 47s


0

In [27]:
del x
gc.collect()

23

In [28]:
%%time
# 4.38 s
np.save('preds.npy', preds) # save predictions

CPU times: user 2.74 ms, sys: 4.37 s, total: 4.37 s
Wall time: 4.38 s


In [29]:
%%time
# less than a minute
preds = preds.astype('float16', copy=False)
preds

CPU times: user 45 µs, sys: 0 ns, total: 45 µs
Wall time: 57.9 µs


array([[0.87  , 0.376 , 0.2837, ..., 0.9976, 1.639 , 2.848 ],
       [0.854 , 0.4656, 0.6665, ..., 1.777 , 1.911 , 3.057 ],
       [0.7783, 0.3435, 0.5366, ..., 1.157 , 1.488 , 2.514 ],
       ...,
       [0.331 , 0.3623, 0.4385, ..., 1.342 , 0.8057, 2.613 ],
       [0.3745, 0.3857, 0.3699, ..., 1.508 , 1.228 , 2.865 ],
       [0.8774, 0.6597, 0.2416, ..., 1.897 , 5.94  , 4.633 ]],
      dtype=float16)

## Submission format and save

In [32]:
%%time
# 42 seconds

# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet(FP_EVALUATION_IDS_parquet)
# Convert the string columns to more efficient categorical types
#eval_ids.cell_id = eval_ids.cell_id.apply(lambda s: int(s, base=16))
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

CPU times: user 35 s, sys: 14.2 s, total: 49.1 s
Wall time: 42 s


In [33]:
# Prepare an empty series which will be filled with predictions
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [35]:
%%time
# 34.5 ms

cell_dict = dict((k,v) for v,k in enumerate(test_input_rows)) 
assert len(cell_dict)  == len(test_input_rows)

gene_dict = dict((k,v) for v,k in enumerate(train_target_cols))
assert len(gene_dict) == len(train_target_cols)

CPU times: user 30.6 ms, sys: 4.98 ms, total: 35.6 ms
Wall time: 34.5 ms


In [36]:
%%time
# 2.92 s

eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))

valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)

CPU times: user 1.9 s, sys: 1.32 s, total: 3.22 s
Wall time: 2.92 s


In [37]:
%%time
# 4.78 s

submission.iloc[valid_multi_rows] = preds[eval_ids_cell_num[valid_multi_rows].to_numpy(),
eval_ids_gene_num[valid_multi_rows].to_numpy()]

CPU times: user 2.86 s, sys: 1.92 s, total: 4.78 s
Wall time: 4.78 s


In [39]:
del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_input_rows, train_target_cols
gc.collect()

NameError: name 'eval_ids_cell_num' is not defined

In [40]:
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86                    NaN
1         c2150f55becb  CD274                   NaN
2         c2150f55becb  CD270                   NaN
3         c2150f55becb  CD155                   NaN
4         c2150f55becb  CD112                   NaN
                                             ...   
65744175  2c53aa67933d  ENSG00000134419    9.453125
65744176  2c53aa67933d  ENSG00000186862    0.062561
65744177  2c53aa67933d  ENSG00000170959    0.077026
65744178  2c53aa67933d  ENSG00000107874    1.852539
65744179  2c53aa67933d  ENSG00000166012    8.085938
Name: target, Length: 65744180, dtype: float32

In [42]:
submission.reset_index()

Unnamed: 0,row_id,cell_id,gene_id,target
0,0,c2150f55becb,CD86,
1,1,c2150f55becb,CD274,
2,2,c2150f55becb,CD270,
3,3,c2150f55becb,CD155,
4,4,c2150f55becb,CD112,
...,...,...,...,...
65744175,65744175,2c53aa67933d,ENSG00000134419,9.453125
65744176,65744176,2c53aa67933d,ENSG00000186862,0.062561
65744177,65744177,2c53aa67933d,ENSG00000170959,0.077026
65744178,65744178,2c53aa67933d,ENSG00000107874,1.852539


In [43]:
%%time

submission = submission.reset_index().drop(['cell_id', 'gene_id'], axis=1)
submission

CPU times: user 2.85 s, sys: 1.69 s, total: 4.54 s
Wall time: 4.54 s


Unnamed: 0,row_id,target
0,0,
1,1,
2,2,
3,3,
4,4,
...,...,...
65744175,65744175,9.453125
65744176,65744176,0.062561
65744177,65744177,0.077026
65744178,65744178,1.852539


In [44]:
%%time

submission.set_index('row_id', inplace=True)
submission


CPU times: user 1.47 ms, sys: 1.97 ms, total: 3.44 ms
Wall time: 3.6 ms


Unnamed: 0_level_0,target
row_id,Unnamed: 1_level_1
0,
1,
2,
3,
4,
...,...
65744175,9.453125
65744176,0.062561
65744177,0.077026
65744178,1.852539


In [45]:
%%time
# Approximately 2 minutes

# save as csv
submission.to_csv('multiome_only.csv')

CPU times: user 2min 9s, sys: 3.84 s, total: 2min 12s
Wall time: 2min 12s
