In notebook 1, several sparse matrices from this dataset by Fabien (https://www.kaggle.com/datasets/fabiencrom/multimodal-single-cell-as-sparse-matrix) were used

Fabien's notebook (https://www.kaggle.com/code/fabiencrom/multimodal-single-cell-creating-sparse-data/notebook) explains how that dataset was obtained.

To document the full process from start to finish, this notebook goes over the creation of the sparse files that are used in the later notebooks.

Note: The files generated by these methods are used in the multiome method from Xiafire (https://www.kaggle.com/code/xiafire/msci-multiome-5-steps-x-5-folds-25-models)

# First, all the basic imports and file names (which may or may not be used) are loaded, essentially as a header

In [6]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse

In [7]:
# ---------------------------------------------------------------------------
# Raw competition data (Kaggle input directory)
# ---------------------------------------------------------------------------
DATA_DIR = "/kaggle/input/open-problems-multimodal/"
FP_CELL_METADATA = os.path.join(DATA_DIR, "metadata.csv")

# CITE-seq files
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR, "test_cite_inputs.h5")

# Multiome files
FP_MULT_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_multi_inputs.h5")
FP_MULT_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_multi_targets.h5")
FP_MULT_TEST_INPUTS = os.path.join(DATA_DIR, "test_multi_inputs.h5")

# ---------------------------------------------------------------------------
# Sparse versions of the Multiome files
# ---------------------------------------------------------------------------
# This notebook goes over how to create these files. Created locally, they
# would be found at:
#   FP_MULT_TRAIN_TARGETS_idx    = "./train_multi_targets_idxcol.npz"
#   FP_MULT_TRAIN_TARGETS_sparse = "./train_multi_targets_values.sparse.npz"
#   FP_MULT_TRAIN_INPUTS_idx     = "./train_multi_inputs_idxcol.npz"
#   FP_MULT_TRAIN_INPUTS_sparse  = "./train_multi_inputs_values.sparse.npz"
#   FP_MULT_TEST_INPUTS_idx      = "./test_multi_inputs_idxcol.npz"
#   FP_MULT_TEST_INPUTS_sparse   = "./test_multi_inputs_values.sparse.npz"
# The same files already exist in
# https://www.kaggle.com/datasets/fabiencrom/multimodal-single-cell-as-sparse-matrix
# so, to avoid wasting space on newly created copies, the constants below
# point at that dataset instead.
_SPARSE_DIR = "../input/multimodal-single-cell-as-sparse-matrix/"

FP_MULT_TRAIN_TARGETS_idx = os.path.join(_SPARSE_DIR, "train_multi_targets_idxcol.npz")
FP_MULT_TRAIN_TARGETS_sparse = os.path.join(_SPARSE_DIR, "train_multi_targets_values.sparse.npz")
FP_MULT_TRAIN_INPUTS_idx = os.path.join(_SPARSE_DIR, "train_multi_inputs_idxcol.npz")
FP_MULT_TRAIN_INPUTS_sparse = os.path.join(_SPARSE_DIR, "train_multi_inputs_values.sparse.npz")
FP_MULT_TEST_INPUTS_idx = os.path.join(_SPARSE_DIR, "test_multi_inputs_idxcol.npz")
FP_MULT_TEST_INPUTS_sparse = os.path.join(_SPARSE_DIR, "test_multi_inputs_values.sparse.npz")

# ---------------------------------------------------------------------------
# Submission / evaluation files
# ---------------------------------------------------------------------------
FP_SUBMISSION = os.path.join(DATA_DIR, "sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR, "evaluation_ids.csv")

# The parquet version is created in this notebook as "./evaluation.parquet"
# to show the method, but the already-saved copy in the dataset is the one used.
FP_EVALUATION_IDS_parquet = os.path.join(_SPARSE_DIR, "evaluation.parquet")

# Name under which the multiome submission was originally written.
multiome_submission_filename = 'submission_multi_only.csv'

# The session was terminated afterwards, so the submission has to be
# loaded back from this saved location.
multiome_submission_filename_saved = '../input/submission-based-on-xiafire/submission_multi_only.csv'

# New file derived from the one above (some columns needed to be dropped).
multiome_submission_filename_cleaned_ver = 'submission_multi_only_cleaned.csv'

# Saved location of the cleaned file, in case the kernel resets again.
multiome_submission_filename_cleaned_ver_saved = '../input/cleaned-submission-based-on-xiafire/submission_multi_only_cleaned.csv'

# Creating sparse data files for Multiome (only necessary for Multiome)

First, pytables is installed so that pandas can read the large HDF5 (.h5) data files

In [8]:
%%time
# Install the conda package "pytables"; the -y flag answers conda's
# confirmation prompt automatically so the cell can run unattended.
# NOTE(review): presumably required by the pd.read_hdf calls later in this
# notebook — confirm, and consider pinning a version for reproducibility.
!conda install pytables -y

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Retrieving notices: ...working... done
CPU times: user 962 ms, sys: 166 ms, total: 1.13 s
Wall time: 36.2 s


In [9]:
#necessary imports
import pandas as pd
import numpy as np
import scipy.sparse
import scipy

# Sparse file creation function

In [10]:
# Function from Fabien's notebook

# the multiome data is in h5 format
# we want to convert this to sparse files

# Inputs:
# filename - original file name
# out_filename - new file in sparse format
# chunksize - how much is read at a time

# Output:
# No return value; as a side effect, two files
# are written: one called out_filename_values.sparse.npz
# and another called out_filename_idxcol.npz

# The idea is to read in chunks (since it's not possible to read the full
# file at once) and slowly save the file as a sparse matrix (which will
# be able to be read all at once)
def convert_h5_to_sparse_csr(filename, out_filename, chunksize=2500):
    """Convert a dense HDF5 table into a CSR sparse matrix stored on disk.

    The table is read ``chunksize`` rows at a time with ``pd.read_hdf`` so
    that the dense data never has to be held in memory all at once. Each
    chunk is converted to a ``scipy.sparse.csr_matrix``; the chunks are then
    stacked vertically and written out together with the row index and the
    column names.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file to read.
    out_filename : str
        Prefix used for the two files that are written.
    chunksize : int, default 2500
        Number of rows requested per ``pd.read_hdf`` call. Must be >= 1.

    Returns
    -------
    None
        Two files are written as a side effect:

        * ``out_filename + "_values.sparse.npz"`` - the stacked sparse matrix
          (``scipy.sparse.save_npz`` appends the ``.npz`` extension).
        * ``out_filename + "_idxcol.npz"`` - arrays ``index`` (row labels of
          all chunks, concatenated) and ``columns`` (column labels).

    Raises
    ------
    ValueError
        If ``chunksize`` is smaller than 1, or if no rows could be read.
    AssertionError
        If a chunk's columns differ from those of the first chunk.
    """
    if chunksize < 1:
        # A zero chunk size reads nothing, and a negative one makes the
        # start/stop slicing walk backwards and silently drop rows.
        raise ValueError(f"chunksize must be a positive integer, got {chunksize!r}")

    start = 0
    total_rows = 0

    sparse_chunks_data_list = []
    chunks_index_list = []
    columns_name = None
    while True:
        df_chunk = pd.read_hdf(filename, start=start, stop=start + chunksize)
        if len(df_chunk) == 0:
            # Reached when the row count is an exact multiple of chunksize
            # (or when the file holds no rows at all).
            break
        sparse_chunks_data_list.append(scipy.sparse.csr_matrix(df_chunk.to_numpy()))
        chunks_index_list.append(df_chunk.index.to_numpy())

        chunk_columns = df_chunk.columns.to_numpy()
        if columns_name is None:
            columns_name = chunk_columns
        else:
            # np.array_equal returns False (instead of raising or warning)
            # when the two arrays differ in length, unlike elementwise `==`.
            assert np.array_equal(columns_name, chunk_columns), (
                f"columns of the chunk starting at row {start} differ from the first chunk"
            )

        total_rows += len(df_chunk)
        print(total_rows)  # progress: number of rows converted so far
        is_last_chunk = len(df_chunk) < chunksize
        del df_chunk  # release the dense chunk before reading the next one
        if is_last_chunk:
            break
        start += chunksize

    if not sparse_chunks_data_list:
        # Fail with a clear message instead of an obscure error from vstack([]).
        raise ValueError(f"no rows could be read from {filename!r}; nothing to convert")

    all_data_sparse = scipy.sparse.vstack(sparse_chunks_data_list)
    del sparse_chunks_data_list

    all_indices = np.hstack(chunks_index_list)

    scipy.sparse.save_npz(out_filename + "_values.sparse", all_data_sparse)
    np.savez(out_filename + "_idxcol.npz", index=all_indices, columns=columns_name)
    

# Using the function

In [11]:
%%time
# Approximately 7 minutes

convert_h5_to_sparse_csr(FP_MULT_TRAIN_TARGETS, 
                         "train_multi_targets")

# This gives:
# FP_MULT_TRAIN_TARGETS_idx = "./train_multi_targets_idxcol.npz"
# FP_MULT_TRAIN_TARGETS_sparse = "./train_multi_targets_values.sparse.npz"

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
72500
75000
77500
80000
82500
85000
87500
90000
92500
95000
97500
100000
102500
105000
105942
CPU times: user 5min 44s, sys: 15 s, total: 5min 59s
Wall time: 6min 25s


In [12]:
%%time
# Approximately 22 minutes

convert_h5_to_sparse_csr(FP_MULT_TRAIN_INPUTS, 
                         "train_multi_inputs")

# This gives:
# FP_MULT_TRAIN_INPUTS_idx = "./train_multi_inputs_idxcol.npz"
# FP_MULT_TRAIN_INPUTS_sparse = "./train_multi_inputs_values.sparse.npz"

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
72500
75000
77500
80000
82500
85000
87500
90000
92500
95000
97500
100000
102500
105000
105942
CPU times: user 23min 7s, sys: 1min 17s, total: 24min 24s
Wall time: 26min 12s


In [13]:
%%time
# Approximately 13 minutes
convert_h5_to_sparse_csr(FP_MULT_TEST_INPUTS, 
                         "test_multi_inputs")

# This gives:
# FP_MULT_TEST_INPUTS_idx = "./test_multi_inputs_idxcol.npz"
# FP_MULT_TEST_INPUTS_sparse = "./test_multi_inputs_values.sparse.npz"

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
55935
CPU times: user 12min 38s, sys: 41.3 s, total: 13min 19s
Wall time: 15min 2s


# Some other files were also converted to parquet format for efficiency

In [14]:
def convert_to_parquet(filename, out_filename):
    """Read a CSV file and write its contents out in parquet format.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read with ``pd.read_csv``.
    out_filename : str
        Output path without extension; ``".parquet"`` is appended to it.

    Returns
    -------
    None
        The parquet file is written as a side effect.
    """
    parquet_path = out_filename + ".parquet"
    csv_frame = pd.read_csv(filename)
    csv_frame.to_parquet(parquet_path)

In [15]:
# Convert the evaluation ids CSV; the result is written to the working
# directory as "./evaluation.parquet" (the local counterpart of
# FP_EVALUATION_IDS_parquet).
convert_to_parquet(filename=FP_EVALUATION_IDS, out_filename="evaluation")