# Multi-modal Single Cell Integration


The task is to:
1. For the Multiome samples: given chromatin accessibility, predict gene expression.2. 
For the CITEseq samples: given gene expression, predict protein levels.

In [1]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
## READING METADATA ##

metadata = pd.read_csv('metadata.csv')
print(metadata)

             cell_id  day  donor cell_type technology
0       c2150f55becb    2  27678       HSC    citeseq
1       65b7edf8a4da    2  27678       HSC    citeseq
2       c1b26cb1057b    2  27678      EryP    citeseq
3       917168fa6f83    2  27678      NeuP    citeseq
4       2b29feeca86d    2  27678      EryP    citeseq
...              ...  ...    ...       ...        ...
281523  96a60b026659   10  31800    hidden   multiome
281524  d493e546991e   10  31800    hidden   multiome
281525  05666c99aa48   10  31800    hidden   multiome
281526  121f946642b5   10  31800    hidden   multiome
281527  b847ba21f59f   10  31800    hidden   multiome

[281528 rows x 5 columns]


In [3]:
## READING EVALUATION IDs DATA ##

evaluation_ids = pd.read_csv('evaluation_ids.csv')
print(evaluation_ids)

            row_id       cell_id          gene_id
0                0  c2150f55becb             CD86
1                1  c2150f55becb            CD274
2                2  c2150f55becb            CD270
3                3  c2150f55becb            CD155
4                4  c2150f55becb            CD112
...            ...           ...              ...
65744175  65744175  2c53aa67933d  ENSG00000134419
65744176  65744176  2c53aa67933d  ENSG00000186862
65744177  65744177  2c53aa67933d  ENSG00000170959
65744178  65744178  2c53aa67933d  ENSG00000107874
65744179  65744179  2c53aa67933d  ENSG00000166012

[65744180 rows x 3 columns]


In [4]:
## READING SUBMITTED SAMPLES DATA ## 

sample_submission = pd.read_csv('sample_submission.csv')
print(sample_submission)

            row_id  target
0                0     0.0
1                1     0.0
2                2     0.0
3                3     0.0
4                4     0.0
...            ...     ...
65744175  65744175     0.0
65744176  65744176     0.0
65744177  65744177     0.0
65744178  65744178     0.0
65744179  65744179     0.0

[65744180 rows x 2 columns]


### Loading Multiome and CITEseq datasets to verify the content

In [5]:
## MULTIOME DATA ##

multiome_train_input = h5py.File('train_multi_inputs.h5', 'r')
print(list(multiome_train_input.keys()))
train_multi_inputs = np.array(multiome_train_input['train_multi_inputs'])
print(train_multi_inputs)

multiome_train_target = h5py.File('train_multi_targets.h5', 'r')
print(list(multiome_train_target.keys()))
train_multi_targets = np.array(multiome_train_target['train_multi_targets'])
print(train_multi_targets)

multiome_test_input = h5py.File('test_multi_inputs.h5', 'r')
print(list(multiome_test_input.keys()))
test_multi_inputs = np.array(multiome_test_input['test_multi_inputs'])
print(test_multi_inputs)

['train_multi_inputs']
['axis0' 'axis1' 'block0_items' 'block0_values']
['train_multi_targets']
['axis0' 'axis1' 'block0_items' 'block0_values']
['test_multi_inputs']
['axis0' 'axis1' 'block0_items' 'block0_values']


In [6]:
multiome_train_df = pd.DataFrame(train_multi_inputs)
multiome_train_df.head()

Unnamed: 0,0
0,axis0
1,axis1
2,block0_items
3,block0_values


In [7]:
## CITESEQ DATA ##

cite_train_input = h5py.File('train_cite_inputs.h5', 'r')
print(list(cite_train_input.keys()))
train_cite_input = np.array(cite_train_input['train_cite_inputs'])
print(train_cite_input)

cite_train_target = h5py.File('train_cite_targets.h5', 'r')
print(list(cite_train_target.keys()))
train_cite_targets = np.array(cite_train_target['train_cite_targets'])
print(train_cite_targets)

cite_test_input = h5py.File('test_cite_inputs.h5', 'r')
print(list(cite_test_input.keys()))
test_cite_inputs = np.array(cite_test_input['test_cite_inputs'])
print(test_cite_inputs)

['train_cite_inputs']
['axis0' 'axis1' 'block0_items' 'block0_values']
['train_cite_targets']
['axis0' 'axis1' 'block0_items' 'block0_values']
['test_cite_inputs']
['axis0' 'axis1' 'block0_items' 'block0_values']


In [8]:
!pip install hdf5plugin



In [9]:
import hdf5plugin

In [10]:
## READING ATTRIBUTES OF THE MULTIOME TRAIN DATA ##

train_multi_input = h5py.File('train_multi_inputs.h5', 'r')
multiome_input_train = train_multi_input['train_multi_inputs']
multiome_input_train.keys()

print("axis0: {}".format(multiome_input_train['axis0']))
print("block0_values attributes: {}".format(multiome_input_train['axis0'].attrs))
print("axis0 attributes: {}".format(list(multiome_input_train['axis0'].attrs)))

print("axis1: {}".format(multiome_input_train['axis1']))
print("block0_values attributes: {}".format(multiome_input_train['axis1'].attrs))
print("axis1 attributes: {}".format(list(multiome_input_train['axis1'].attrs)))

print("block0_items: {}".format(multiome_input_train['block0_items']))
print("block0_values attributes: {}".format(multiome_input_train['block0_items'].attrs))
print("block0_items attributes: {}".format(list(multiome_input_train['block0_items'].attrs)))

print("block0_values: {}".format(multiome_input_train['block0_values']))
print("block0_values attributes: {}".format(multiome_input_train['block0_values'].attrs))
print("block0_values attributes: {}".format(list(multiome_input_train['block0_values'].attrs)))

axis0: <HDF5 dataset "axis0": shape (228942,), type "|S26">
block0_values attributes: <Attributes of HDF5 object at 1952043551136>
axis0 attributes: ['CLASS', 'TITLE', 'VERSION', 'kind', 'name', 'transposed']
axis1: <HDF5 dataset "axis1": shape (105942,), type "|S12">
block0_values attributes: <Attributes of HDF5 object at 1952043551136>
axis1 attributes: ['CLASS', 'TITLE', 'VERSION', 'kind', 'name', 'transposed']
block0_items: <HDF5 dataset "block0_items": shape (228942,), type "|S26">
block0_values attributes: <Attributes of HDF5 object at 1952043551136>
block0_items attributes: ['CLASS', 'TITLE', 'VERSION', 'kind', 'name', 'transposed']
block0_values: <HDF5 dataset "block0_values": shape (105942, 228942), type "<f4">
block0_values attributes: <Attributes of HDF5 object at 1950291910448>
block0_values attributes: ['CLASS', 'TITLE', 'VERSION', 'transposed']


In [11]:
## READING ATTRIBUTES OF THE CITESEQ TRAIN DATA ##

train_cite_input = h5py.File('train_cite_targets.h5', 'r')
citeseq_input_train = train_cite_input['train_cite_targets']
citeseq_input_train.keys()

print("axis0: {}".format(citeseq_input_train['axis0']))
print("block0_values attributes: {}".format(citeseq_input_train['axis0'].attrs))
print("axis0 attributes: {}".format(list(citeseq_input_train['axis0'].attrs)))

print("axis1: {}".format(citeseq_input_train['axis1']))
print("block0_values attributes: {}".format(citeseq_input_train['axis1'].attrs))
print("axis1 attributes: {}".format(list(citeseq_input_train['axis1'].attrs)))

print("block0_items: {}".format(citeseq_input_train['block0_items']))
print("block0_values attributes: {}".format(citeseq_input_train['block0_items'].attrs))
print("block0_items attributes: {}".format(list(citeseq_input_train['block0_items'].attrs)))

print("block0_values: {}".format(citeseq_input_train['block0_values']))
print("block0_values attributes: {}".format(citeseq_input_train['block0_values'].attrs))
print("block0_values attributes: {}".format(list(citeseq_input_train['block0_values'].attrs)))

axis0: <HDF5 dataset "axis0": shape (140,), type "|S11">
block0_values attributes: <Attributes of HDF5 object at 1952043551536>
axis0 attributes: ['CLASS', 'TITLE', 'VERSION', 'kind', 'name', 'transposed']
axis1: <HDF5 dataset "axis1": shape (70988,), type "|S12">
block0_values attributes: <Attributes of HDF5 object at 1951530782288>
axis1 attributes: ['CLASS', 'TITLE', 'VERSION', 'kind', 'name', 'transposed']
block0_items: <HDF5 dataset "block0_items": shape (140,), type "|S11">
block0_values attributes: <Attributes of HDF5 object at 1951530779728>
block0_items attributes: ['CLASS', 'TITLE', 'VERSION', 'kind', 'name', 'transposed']
block0_values: <HDF5 dataset "block0_values": shape (70988, 140), type "<f4">
block0_values attributes: <Attributes of HDF5 object at 1951530782288>
block0_values attributes: ['CLASS', 'TITLE', 'VERSION', 'transposed']


In [12]:
# Reading block0_values
file = 'train_multi_inputs.h5'
with h5py.File(file, 'r') as f:
    dataset = f['train_multi_inputs/block0_values']
    
    # Defining the slice
    rows = slice(0, 1000)     
    cols = slice(0, 1000)   
    
    # Reading the slice
    data_slice = dataset[rows, cols]
    
    # Slice converted to numpy array for easier manipulation
    data_array = np.array(data_slice)
    
    # Shape and preview of the data
    print(f"Shape of the sliced data: {data_array.shape}")
    print(data_array)

Shape of the sliced data: (1000, 1000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Working with Multiome Data

1. train/test_multi_inputs.h5 - ATAC-seq peak counts transformed with TF-IDF using the default log(TF) * log(IDF) output (chromatin accessibility), with rows corresponding to cells and columns corresponding to the location of the genome whose level of accessibility is measured, here identified by the genomic coordinates on reference genome GRCh38.

2. train_multi_targets.h5 - RNA gene expression levels as library-size normalized and log1p transformed counts for the same cells..

### Concatenating Multiome Train Data

In [17]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000 #

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtrain_chunk_axis0 = pd.DataFrame(data_slice, columns=['axis0'])
        print(f'Processed chunk {chunk_count} with shape: {mtrain_chunk_axis0.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtrain_chunk_axis0

# Data Processing in chunks
for mtrain_chunk_axis0 in process_chunks('train_multi_inputs.h5', 'train_multi_inputs/axis0', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [18]:
mtrain_chunk_axis0

Unnamed: 0,axis0
0,b'chr1:183306299-183307211'
1,b'chr1:183346148-183347064'
2,b'chr1:183363705-183364561'
3,b'chr1:183416556-183417410'
4,b'chr1:183418060-183418850'
...,...
9995,b'chr1:45474787-45475662'
9996,b'chr1:45483239-45484118'
9997,b'chr1:45490809-45491708'
9998,b'chr1:45492821-45493731'


In [21]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtrain_chunk_axis1 = pd.DataFrame(data_slice, columns=['axis1'])
        print(f'Processed chunk {chunk_count} with shape: {mtrain_chunk_axis1.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtrain_chunk_axis1

# Data Processing in chunks
for mtrain_chunk_axis1 in process_chunks('train_multi_inputs.h5', 'train_multi_inputs/axis1', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [22]:
mtrain_chunk_axis1

Unnamed: 0,axis1
0,b'23c0c2c56cf6'
1,b'd3ef7a0bc439'
2,b'324879a9b198'
3,b'8b75946ab47c'
4,b'9e79714af43f'
...,...
9995,b'd4801fd51ca0'
9996,b'fce1c9f21efa'
9997,b'74e461d8a702'
9998,b'cc7597381e61'


In [24]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtrain_chunk_block0_items = pd.DataFrame(data_slice, columns=['block0_items'])
        print(f'Processed chunk {chunk_count} with shape: {mtrain_chunk_block0_items.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtrain_chunk_block0_items

# Data Processing in chunks
for mtrain_chunk_block0_items in process_chunks('train_multi_inputs.h5', 'train_multi_inputs/block0_items', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [25]:
mtrain_chunk_block0_items

Unnamed: 0,block0_items
0,b'chr1:183306299-183307211'
1,b'chr1:183346148-183347064'
2,b'chr1:183363705-183364561'
3,b'chr1:183416556-183417410'
4,b'chr1:183418060-183418850'
...,...
9995,b'chr1:45474787-45475662'
9996,b'chr1:45483239-45484118'
9997,b'chr1:45490809-45491708'
9998,b'chr1:45492821-45493731'


In [27]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        total_cols = dataset.shape[1]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Iterate over column chunks
            for start_col in range(0, total_cols, col_chunk_size):
                end_col = min(start_col + col_chunk_size, total_cols)
                
                # Reading chunk of data
                data_slice = dataset[start_row:end_row, start_col:end_col]
                yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  
col_chunk_size = 1000   

def process_chunks(file_path, dataset_name, row_chunk_size, col_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
        mtrain_chunk_block0_values = pd.DataFrame(data_slice)
        print(f'Processed chunk {chunk_count} with shape: {mtrain_chunk_block0_values.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtrain_chunk_block0_values

# Data Processing in chunks
for mtrain_chunk_block0_values in process_chunks('train_multi_inputs.h5', 'train_multi_inputs/block0_values', row_chunk_size, col_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1000)
Stopping after processing 10 chunks for testing.


In [28]:
mtrain_chunk_block0_values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,5.853083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
multi_train_full_df =  pd.concat([mtrain_chunk_axis0, mtrain_chunk_axis1, mtrain_chunk_block0_items, mtrain_chunk_block0_values], axis=1)
multi_train_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,990,991,992,993,994,995,996,997,998,999
0,b'chr1:183306299-183307211',b'23c0c2c56cf6',b'chr1:183306299-183307211',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,b'chr1:183346148-183347064',b'd3ef7a0bc439',b'chr1:183346148-183347064',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,b'chr1:183363705-183364561',b'324879a9b198',b'chr1:183363705-183364561',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,b'chr1:183416556-183417410',b'8b75946ab47c',b'chr1:183416556-183417410',0.0,5.853083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,b'chr1:183418060-183418850',b'9e79714af43f',b'chr1:183418060-183418850',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,b'chr1:45474787-45475662',b'd4801fd51ca0',b'chr1:45474787-45475662',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,b'chr1:45483239-45484118',b'fce1c9f21efa',b'chr1:45483239-45484118',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,b'chr1:45490809-45491708',b'74e461d8a702',b'chr1:45490809-45491708',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,b'chr1:45492821-45493731',b'cc7597381e61',b'chr1:45492821-45493731',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Concatenating Multiome Target Data

In [49]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtarget_chunk_axis0 = pd.DataFrame(data_slice, columns=['axis0'])
        print(f'Processed chunk {chunk_count} with shape: {mtarget_chunk_axis0.shape}')
        chunk_count += 1
        if chunk_count > 1:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtarget_chunk_axis0

# Data Processing in chunks
for mtarget_chunk_axis0 in process_chunks('train_multi_targets.h5', 'train_multi_targets/axis0', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Processed chunk 1 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [50]:
mtarget_chunk_axis0

Unnamed: 0,axis0
0,b'ENSG00000121410'
1,b'ENSG00000268895'
2,b'ENSG00000175899'
3,b'ENSG00000245105'
4,b'ENSG00000166535'
...,...
9995,b'ENSG00000130165'
9996,b'ENSG00000066322'
9997,b'ENSG00000197977'
9998,b'ENSG00000119915'


In [36]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtarget_chunk_axis1 = pd.DataFrame(data_slice, columns=['axis1'])
        print(f'Processed chunk {chunk_count} with shape: {mtarget_chunk_axis1.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtarget_chunk_axis1

# Data Processing in chunks
for mtarget_chunk_axis1 in process_chunks('train_multi_targets.h5', 'train_multi_targets/axis1', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [37]:
mtarget_chunk_axis1

Unnamed: 0,axis1
0,b'23c0c2c56cf6'
1,b'd3ef7a0bc439'
2,b'324879a9b198'
3,b'8b75946ab47c'
4,b'9e79714af43f'
...,...
9995,b'd4801fd51ca0'
9996,b'fce1c9f21efa'
9997,b'74e461d8a702'
9998,b'cc7597381e61'


In [41]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtarget_chunk_block0_items = pd.DataFrame(data_slice, columns=['block0_items'])
        print(f'Processed chunk {chunk_count} with shape: {mtarget_chunk_block0_items.shape}')
        chunk_count += 1
        if chunk_count > 1:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtarget_chunk_block0_items

# Data Processing in chunks
for mtarget_chunk_block0_items in process_chunks('train_multi_targets.h5', 'train_multi_targets/block0_items', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Processed chunk 1 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [42]:
mtarget_chunk_block0_items

Unnamed: 0,block0_items
0,b'ENSG00000121410'
1,b'ENSG00000268895'
2,b'ENSG00000175899'
3,b'ENSG00000245105'
4,b'ENSG00000166535'
...,...
9995,b'ENSG00000130165'
9996,b'ENSG00000066322'
9997,b'ENSG00000197977'
9998,b'ENSG00000119915'


In [30]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        total_cols = dataset.shape[1]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Iterate over column chunks
            for start_col in range(0, total_cols, col_chunk_size):
                end_col = min(start_col + col_chunk_size, total_cols)
                
                # Reading chunk of data
                data_slice = dataset[start_row:end_row, start_col:end_col]
                yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  
col_chunk_size = 1000   

def process_chunks(file_path, dataset_name, row_chunk_size, col_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
        mtarget_chunk_block0_values = pd.DataFrame(data_slice)
        print(f'Processed chunk {chunk_count} with shape: {mtarget_chunk_block0_values.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtarget_chunk_block0_values

# Data Processing in chunks
for mtarget_chunk_block0_values in process_chunks('train_multi_targets.h5', 'train_multi_targets/block0_values', row_chunk_size, col_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1000)
Stopping after processing 10 chunks for testing.


In [31]:
mtarget_chunk_block0_values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,5.583255,0.000000,0.000000,0.0,0.0,0.0
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,5.788494,4.695988,4.695988,0.000000,0.0,0.0,0.0
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,5.107832,0.000000,5.797950,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,0.000000,4.507936,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,4.507936,0.000000,5.195558,0.000000,0.000000,4.507936,0.0,0.0,0.0
4,4.842377,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,5.935717,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,5.203262,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
9996,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,4.714328,0.0,...,0.0,4.714328,0.000000,5.402983,5.402983,0.000000,0.000000,0.0,0.0,0.0
9997,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,4.434201,0.000000,5.121398,0.000000,0.0,0.0,0.0
9998,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [51]:
multi_target_full_df =  pd.concat([mtarget_chunk_axis0, mtarget_chunk_axis1, mtarget_chunk_block0_items, mtarget_chunk_block0_values], axis=1)
multi_target_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,990,991,992,993,994,995,996,997,998,999
0,b'ENSG00000121410',b'23c0c2c56cf6',b'ENSG00000121410',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,5.583255,0.000000,0.000000,0.0,0.0,0.0
1,b'ENSG00000268895',b'd3ef7a0bc439',b'ENSG00000268895',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,5.788494,4.695988,4.695988,0.000000,0.0,0.0,0.0
2,b'ENSG00000175899',b'324879a9b198',b'ENSG00000175899',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,5.107832,0.000000,5.797950,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,b'ENSG00000245105',b'8b75946ab47c',b'ENSG00000245105',0.000000,4.507936,0.0,0.0,0.0,0.0,0.0,...,0.0,4.507936,0.000000,5.195558,0.000000,0.000000,4.507936,0.0,0.0,0.0
4,b'ENSG00000166535',b'9e79714af43f',b'ENSG00000166535',4.842377,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,5.935717,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,b'ENSG00000130165',b'd4801fd51ca0',b'ENSG00000130165',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,5.203262,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
9996,b'ENSG00000066322',b'fce1c9f21efa',b'ENSG00000066322',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,4.714328,0.000000,5.402983,5.402983,0.000000,0.000000,0.0,0.0,0.0
9997,b'ENSG00000197977',b'74e461d8a702',b'ENSG00000197977',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,4.434201,0.000000,5.121398,0.000000,0.0,0.0,0.0
9998,b'ENSG00000119915',b'cc7597381e61',b'ENSG00000119915',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [54]:
multi_train_full_df.columns = multi_train_full_df.columns.astype(str)
multi_train_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,990,991,992,993,994,995,996,997,998,999
0,b'chr1:183306299-183307211',b'23c0c2c56cf6',b'chr1:183306299-183307211',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,b'chr1:183346148-183347064',b'd3ef7a0bc439',b'chr1:183346148-183347064',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,b'chr1:183363705-183364561',b'324879a9b198',b'chr1:183363705-183364561',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,b'chr1:183416556-183417410',b'8b75946ab47c',b'chr1:183416556-183417410',0.0,5.853083,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,b'chr1:183418060-183418850',b'9e79714af43f',b'chr1:183418060-183418850',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,b'chr1:45474787-45475662',b'd4801fd51ca0',b'chr1:45474787-45475662',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,b'chr1:45483239-45484118',b'fce1c9f21efa',b'chr1:45483239-45484118',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,b'chr1:45490809-45491708',b'74e461d8a702',b'chr1:45490809-45491708',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,b'chr1:45492821-45493731',b'cc7597381e61',b'chr1:45492821-45493731',0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
multi_target_full_df.columns = multi_target_full_df.columns.astype(str)
multi_target_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,990,991,992,993,994,995,996,997,998,999
0,b'ENSG00000121410',b'23c0c2c56cf6',b'ENSG00000121410',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,5.583255,0.000000,0.000000,0.0,0.0,0.0
1,b'ENSG00000268895',b'd3ef7a0bc439',b'ENSG00000268895',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,5.788494,4.695988,4.695988,0.000000,0.0,0.0,0.0
2,b'ENSG00000175899',b'324879a9b198',b'ENSG00000175899',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,5.107832,0.000000,5.797950,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,b'ENSG00000245105',b'8b75946ab47c',b'ENSG00000245105',0.000000,4.507936,0.0,0.0,0.0,0.0,0.0,...,0.0,4.507936,0.000000,5.195558,0.000000,0.000000,4.507936,0.0,0.0,0.0
4,b'ENSG00000166535',b'9e79714af43f',b'ENSG00000166535',4.842377,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,5.935717,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,b'ENSG00000130165',b'd4801fd51ca0',b'ENSG00000130165',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,5.203262,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
9996,b'ENSG00000066322',b'fce1c9f21efa',b'ENSG00000066322',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,4.714328,0.000000,5.402983,5.402983,0.000000,0.000000,0.0,0.0,0.0
9997,b'ENSG00000197977',b'74e461d8a702',b'ENSG00000197977',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,4.434201,0.000000,5.121398,0.000000,0.0,0.0,0.0
9998,b'ENSG00000119915',b'cc7597381e61',b'ENSG00000119915',0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0


### Data Preprocessing and Model Fitting

In [56]:
assert all(multi_train_full_df.index == multi_target_full_df.index)

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [58]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(mtrain_chunk_block0_values, mtarget_chunk_block0_values, test_size=0.2, random_state=42)
# Train a model
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 3.2386255264282227


In [59]:
# Standardizing and applying PCA
X_all = mtrain_chunk_block0_values
y_all = mtarget_chunk_block0_values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_all)

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)
 
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X_pca, y_all, test_size=0.2, random_state=42)
    
# Training and evaluating the model
ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)
    
y_pred = ridge_model.predict(X_val)
    
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
    
print(f'R^2 Score: {r2}')
print(f'Mean Squared Error: {mse}')

R^2 Score: 0.06742116587490173
Mean Squared Error: 2.807215929031372


### Concatenating Multiome Test Data

In [220]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtest_chunk_axis0 = pd.DataFrame(data_slice, columns=['axis0'])
        print(f'Processed chunk {chunk_count} with shape: {mtest_chunk_axis0.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtest_chunk_axis0

# Data Processing in chunks
for mtest_chunk_axis0 in process_chunks('test_multi_inputs.h5', 'test_multi_inputs/axis0', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [221]:
mtest_chunk_axis0

Unnamed: 0,axis0
0,b'chr1:183306299-183307211'
1,b'chr1:183346148-183347064'
2,b'chr1:183363705-183364561'
3,b'chr1:183416556-183417410'
4,b'chr1:183418060-183418850'
...,...
9995,b'chr1:45474787-45475662'
9996,b'chr1:45483239-45484118'
9997,b'chr1:45490809-45491708'
9998,b'chr1:45492821-45493731'


In [226]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtest_chunk_axis1 = pd.DataFrame(data_slice, columns=['axis1'])
        print(f'Processed chunk {chunk_count} with shape: {mtest_chunk_axis1.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtest_chunk_axis1

# Data Processing in chunks
for mtest_chunk_axis1 in process_chunks('test_multi_inputs.h5', 'test_multi_inputs/axis1', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [227]:
mtest_chunk_axis1

Unnamed: 0,axis1
0,b'458c2ae2c9b1'
1,b'01a0659b0710'
2,b'028a8bc3f2ba'
3,b'7ec0ca8bb863'
4,b'caa0b0022cdc'
...,...
9995,b'799f04d25ab7'
9996,b'72925b1375c0'
9997,b'167cd98351c9'
9998,b'318d80df4c61'


In [228]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        mtest_chunk_block0_items = pd.DataFrame(data_slice, columns=['block0_items'])
        print(f'Processed chunk {chunk_count} with shape: {mtest_chunk_block0_items.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtest_chunk_block0_items

# Data Processing in chunks
for mtest_chunk_block0_items in process_chunks('test_multi_inputs.h5', 'test_multi_inputs/block0_items', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [229]:
mtest_chunk_block0_items

Unnamed: 0,block0_items
0,b'chr1:183306299-183307211'
1,b'chr1:183346148-183347064'
2,b'chr1:183363705-183364561'
3,b'chr1:183416556-183417410'
4,b'chr1:183418060-183418850'
...,...
9995,b'chr1:45474787-45475662'
9996,b'chr1:45483239-45484118'
9997,b'chr1:45490809-45491708'
9998,b'chr1:45492821-45493731'


In [60]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        total_cols = dataset.shape[1]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Iterate over column chunks
            for start_col in range(0, total_cols, col_chunk_size):
                end_col = min(start_col + col_chunk_size, total_cols)
                
                # Reading chunk of data
                data_slice = dataset[start_row:end_row, start_col:end_col]
                yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  
col_chunk_size = 1000   

def process_chunks(file_path, dataset_name, row_chunk_size, col_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
        mtest_chunk_block0_values = pd.DataFrame(data_slice)
        print(f'Processed chunk {chunk_count} with shape: {mtest_chunk_block0_values.shape}')
        chunk_count += 1
        if chunk_count > 10:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield mtest_chunk_block0_values

# Data Processing in chunks
for mtest_chunk_block0_values in process_chunks('test_multi_inputs.h5', 'test_multi_inputs/block0_values', row_chunk_size, col_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1000)
Processed chunk 1 with shape: (10000, 1000)
Processed chunk 2 with shape: (10000, 1000)
Processed chunk 3 with shape: (10000, 1000)
Processed chunk 4 with shape: (10000, 1000)
Processed chunk 5 with shape: (10000, 1000)
Processed chunk 6 with shape: (10000, 1000)
Processed chunk 7 with shape: (10000, 1000)
Processed chunk 8 with shape: (10000, 1000)
Processed chunk 9 with shape: (10000, 1000)
Processed chunk 10 with shape: (10000, 1000)
Stopping after processing 10 chunks for testing.


In [61]:
mtest_chunk_block0_values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,5.259344,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.347107,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,1.052914,0.0,0.000000,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


In [232]:
multi_test_full_df =  pd.concat([mtest_chunk_axis0, mtest_chunk_axis1, mtest_chunk_block0_items, mtest_chunk_block0_values], axis=1)
multi_test_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,990,991,992,993,994,995,996,997,998,999
0,b'chr1:183306299-183307211',b'458c2ae2c9b1',b'chr1:183306299-183307211',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,5.259344,0.0
1,b'chr1:183346148-183347064',b'01a0659b0710',b'chr1:183346148-183347064',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,b'chr1:183363705-183364561',b'028a8bc3f2ba',b'chr1:183363705-183364561',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.347107,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3,b'chr1:183416556-183417410',b'7ec0ca8bb863',b'chr1:183416556-183417410',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,b'chr1:183418060-183418850',b'caa0b0022cdc',b'chr1:183418060-183418850',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,b'chr1:45474787-45475662',b'799f04d25ab7',b'chr1:45474787-45475662',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
9996,b'chr1:45483239-45484118',b'72925b1375c0',b'chr1:45483239-45484118',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,1.052914,0.0,0.000000,0.0
9997,b'chr1:45490809-45491708',b'167cd98351c9',b'chr1:45490809-45491708',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
9998,b'chr1:45492821-45493731',b'318d80df4c61',b'chr1:45492821-45493731',0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


### Multiome Model Validation

In [62]:
# Making predictions on the multiome test set
test_predictions = model.predict(mtest_chunk_block0_values)

In [63]:
# Loading evaluation IDs
evaluation_ids = pd.read_csv('evaluation_ids.csv')

# Predictions to a DataFrame
test_predictions_df = pd.DataFrame(test_predictions, index=mtest_chunk_block0_values.index)

# Flattening the DataFrame to match the evaluation IDs
test_predictions_flat = test_predictions_df.unstack().reset_index()
test_predictions_flat.columns = ['gene_id', 'cell_id', 'predicted_value']

evaluation_ids['cell_id'] = evaluation_ids['cell_id'].astype(str)
test_predictions_flat['cell_id'] = test_predictions_flat['cell_id'].astype(str)

# 'gene_id' columns converted to string
evaluation_ids['gene_id'] = evaluation_ids['gene_id'].astype(str)
test_predictions_flat['gene_id'] = test_predictions_flat['gene_id'].astype(str)

# Merging with evaluation IDs to create the submission file
multi_submission_df = evaluation_ids.merge(test_predictions_flat, on=['cell_id', 'gene_id'], how='left')

# Filling missing values
multi_submission_df['predicted_value'].fillna(0, inplace=True)

# Final submission file
multi_submission_df[['row_id', 'predicted_value']].to_csv('submission.csv', index=False)

In [64]:
multi_submission_df

Unnamed: 0,row_id,cell_id,gene_id,predicted_value
0,0,c2150f55becb,CD86,0.0
1,1,c2150f55becb,CD274,0.0
2,2,c2150f55becb,CD270,0.0
3,3,c2150f55becb,CD155,0.0
4,4,c2150f55becb,CD112,0.0
...,...,...,...,...
65744175,65744175,2c53aa67933d,ENSG00000134419,0.0
65744176,65744176,2c53aa67933d,ENSG00000186862,0.0
65744177,65744177,2c53aa67933d,ENSG00000170959,0.0
65744178,65744178,2c53aa67933d,ENSG00000107874,0.0


## Working with Citeseq Data

1. train/test_cite_inputs.h5 - RNA library-size normalized and log1p transformed counts (gene expression levels), with rows corresponding to cells and columns corresponding to genes given by {gene_name}_{gene_ensemble-ids}.
2. 
train_cite_targets.h5 - Surface protein levels for the same cells that have been dsb normalized.

### Concatenating Citeseq Train Data

In [298]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctrain_chunk_axis0 = pd.DataFrame(data_slice, columns=['axis0'])
        print(f'Processed chunk {chunk_count} with shape: {ctrain_chunk_axis0.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctrain_chunk_axis0

# Data Processing in chunks
for ctrain_chunk_axis0 in process_chunks('train_cite_inputs.h5', 'train_cite_inputs/axis0', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [299]:
# This data consists of {gene_name}_{gene_ensemble-ids}

ctrain_chunk_axis0

Unnamed: 0,axis0
0,b'ENSG00000121410_A1BG'
1,b'ENSG00000268895_A1BG-AS1'
2,b'ENSG00000175899_A2M'
3,b'ENSG00000245105_A2M-AS1'
4,b'ENSG00000166535_A2ML1'
...,...
9995,b'ENSG00000184206_GOLGA6L4'
9996,b'ENSG00000230373_GOLGA6L5P'
9997,b'ENSG00000261649_GOLGA6L7'
9998,b'ENSG00000197978_GOLGA6L9'


In [300]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctrain_chunk_axis1 = pd.DataFrame(data_slice, columns=['axis1'])
        print(f'Processed chunk {chunk_count} with shape: {ctrain_chunk_axis1.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctrain_chunk_axis1

# Data Processing in chunks
for ctrain_chunk_axis1 in process_chunks('train_cite_inputs.h5', 'train_cite_inputs/axis1', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [301]:
ctrain_chunk_axis1

Unnamed: 0,axis1
0,b'45006fe3e4c8'
1,b'd02759a80ba2'
2,b'c016c6b0efa5'
3,b'ba7f733a4f75'
4,b'fbcf2443ffb2'
...,...
9995,b'88a2f9e1d85a'
9996,b'5ae58a7d2d95'
9997,b'9244230aa681'
9998,b'35d7180bafb0'


In [302]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctrain_chunk_block0_items = pd.DataFrame(data_slice, columns=['block0_items'])
        print(f'Processed chunk {chunk_count} with shape: {ctrain_chunk_block0_items.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctrain_chunk_block0_items

# Data Processing in chunks
for ctrain_chunk_block0_items in process_chunks('train_cite_inputs.h5', 'train_cite_inputs/block0_items', row_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 1)
Stopping after processing 10 chunks for testing.


In [303]:
ctrain_chunk_block0_items

Unnamed: 0,block0_items
0,b'ENSG00000121410_A1BG'
1,b'ENSG00000268895_A1BG-AS1'
2,b'ENSG00000175899_A2M'
3,b'ENSG00000245105_A2M-AS1'
4,b'ENSG00000166535_A2ML1'
...,...
9995,b'ENSG00000184206_GOLGA6L4'
9996,b'ENSG00000230373_GOLGA6L5P'
9997,b'ENSG00000261649_GOLGA6L7'
9998,b'ENSG00000197978_GOLGA6L9'


In [65]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        total_cols = dataset.shape[1]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Iterate over column chunks
            for start_col in range(0, total_cols, col_chunk_size):
                end_col = min(start_col + col_chunk_size, total_cols)
                
                # Reading chunk of data
                data_slice = dataset[start_row:end_row, start_col:end_col]
                yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  
col_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size, col_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
        ctrain_chunk_block0_values = pd.DataFrame(data_slice)
        print(f'Processed chunk {chunk_count} with shape: {ctrain_chunk_block0_values.shape}')
        chunk_count += 1
        if chunk_count > 10:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctrain_chunk_block0_values

# Data Processing in chunks
for ctrain_chunk_block0_values in process_chunks('train_cite_inputs.h5', 'train_cite_inputs/block0_values', row_chunk_size, col_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 100)
Processed chunk 1 with shape: (10000, 100)
Processed chunk 2 with shape: (10000, 100)
Processed chunk 3 with shape: (10000, 100)
Processed chunk 4 with shape: (10000, 100)
Processed chunk 5 with shape: (10000, 100)
Processed chunk 6 with shape: (10000, 100)
Processed chunk 7 with shape: (10000, 100)
Processed chunk 8 with shape: (10000, 100)
Processed chunk 9 with shape: (10000, 100)
Processed chunk 10 with shape: (10000, 100)
Stopping after processing 10 chunks for testing.


In [66]:
ctrain_chunk_block0_values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,4.090185,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,3.847321,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.436846,0.0,0.0,0.0,0.0,...,0.0,0.0,3.436846,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,3.518610,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,3.375613,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,4.088598,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,3.401583,0.0,0.0,0.0,0.0,...,0.0,0.0,3.401583,0.0,3.401583,0.0,0.0,0.0,0.0,0.0


In [313]:
cite_train_full_df =  pd.concat([ctrain_chunk_axis0, ctrain_chunk_axis1, ctrain_chunk_block0_items, ctrain_chunk_block0_values], axis=1)
cite_train_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,b'ENSG00000121410_A1BG',b'45006fe3e4c8',b'ENSG00000121410_A1BG',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,4.090185,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,b'ENSG00000268895_A1BG-AS1',b'd02759a80ba2',b'ENSG00000268895_A1BG-AS1',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,b'ENSG00000175899_A2M',b'c016c6b0efa5',b'ENSG00000175899_A2M',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,3.847321,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,b'ENSG00000245105_A2M-AS1',b'ba7f733a4f75',b'ENSG00000245105_A2M-AS1',0.0,0.0,0.0,0.0,0.0,3.436846,0.0,...,0.0,0.0,3.436846,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,b'ENSG00000166535_A2ML1',b'fbcf2443ffb2',b'ENSG00000166535_A2ML1',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,3.518610,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,b'ENSG00000184206_GOLGA6L4',b'88a2f9e1d85a',b'ENSG00000184206_GOLGA6L4',0.0,0.0,0.0,0.0,0.0,3.375613,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9996,b'ENSG00000230373_GOLGA6L5P',b'5ae58a7d2d95',b'ENSG00000230373_GOLGA6L5P',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,4.088598,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9997,b'ENSG00000261649_GOLGA6L7',b'9244230aa681',b'ENSG00000261649_GOLGA6L7',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9998,b'ENSG00000197978_GOLGA6L9',b'35d7180bafb0',b'ENSG00000197978_GOLGA6L9',0.0,0.0,0.0,0.0,0.0,3.401583,0.0,...,0.0,0.0,3.401583,0.0,3.401583,0.0,0.0,0.0,0.0,0.0


### Concatenating Citeseq Target Data

In [314]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctarget_chunk_axis0 = pd.DataFrame(data_slice, columns=['axis0'])
        print(f'Processed chunk {chunk_count} with shape: {ctarget_chunk_axis0.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctarget_chunk_axis0

# Data Processing in chunks
for ctarget_chunk_axis0 in process_chunks('train_cite_targets.h5', 'train_cite_targets/axis0', row_chunk_size):
    pass

Processed chunk 0 with shape: (100, 1)
Stopping after processing 10 chunks for testing.


In [315]:
ctarget_chunk_axis0

Unnamed: 0,axis0
0,b'CD86'
1,b'CD274'
2,b'CD270'
3,b'CD155'
4,b'CD112'
...,...
95,b'CD2'
96,b'CD226'
97,b'CD29'
98,b'CD303'


In [316]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctarget_chunk_axis1 = pd.DataFrame(data_slice, columns=['axis1'])
        print(f'Processed chunk {chunk_count} with shape: {ctarget_chunk_axis1.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctarget_chunk_axis1

# Data Processing in chunks
for ctarget_chunk_axis1 in process_chunks('train_cite_targets.h5', 'train_cite_targets/axis1', row_chunk_size):
    pass

Processed chunk 0 with shape: (100, 1)
Stopping after processing 10 chunks for testing.


In [317]:
ctarget_chunk_axis1

Unnamed: 0,axis1
0,b'45006fe3e4c8'
1,b'd02759a80ba2'
2,b'c016c6b0efa5'
3,b'ba7f733a4f75'
4,b'fbcf2443ffb2'
...,...
95,b'613f1d17569a'
96,b'997cb42b095c'
97,b'21e74451fc57'
98,b'a7de418fbaa3'


In [318]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctarget_chunk_block0_items = pd.DataFrame(data_slice, columns=['block0_items'])
        print(f'Processed chunk {chunk_count} with shape: {ctarget_chunk_block0_items.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctarget_chunk_block0_items

# Data Processing in chunks
for ctarget_chunk_block0_items in process_chunks('train_cite_targets.h5', 'train_cite_targets/block0_items', row_chunk_size):
    pass

Processed chunk 0 with shape: (100, 1)
Stopping after processing 10 chunks for testing.


In [319]:
ctarget_chunk_block0_items

Unnamed: 0,block0_items
0,b'CD86'
1,b'CD274'
2,b'CD270'
3,b'CD155'
4,b'CD112'
...,...
95,b'CD2'
96,b'CD226'
97,b'CD29'
98,b'CD303'


In [68]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        total_cols = dataset.shape[1]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Iterate over column chunks
            for start_col in range(0, total_cols, col_chunk_size):
                end_col = min(start_col + col_chunk_size, total_cols)
                
                # Reading chunk of data
                data_slice = dataset[start_row:end_row, start_col:end_col]
                yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000
col_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size, col_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
        ctarget_chunk_block0_values = pd.DataFrame(data_slice)
        print(f'Processed chunk {chunk_count} with shape: {ctarget_chunk_block0_values.shape}')
        chunk_count += 1
        if chunk_count > 1:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctarget_chunk_block0_values

# Data Processing in chunks
for ctarget_chunk_block0_values in process_chunks('train_cite_targets.h5', 'train_cite_targets/block0_values', row_chunk_size, col_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 100)
Processed chunk 1 with shape: (10000, 40)
Stopping after processing 10 chunks for testing.


In [69]:
ctarget_chunk_block0_values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.167804,0.622530,0.106959,0.324989,3.331674,6.426002,1.480766,-0.728392,-0.468851,-0.073285,...,-0.031258,-0.936979,-0.019728,0.698779,1.338748,0.107811,0.581864,3.531209,0.355473,3.451938
1,0.818970,0.506009,1.078682,6.848758,3.524885,5.279456,4.930438,2.069372,0.333652,-0.468088,...,0.172785,-0.243277,0.421166,1.476295,2.382426,-0.189925,0.119327,6.511853,0.796505,4.606451
2,-0.356703,-0.422261,-0.824493,1.137495,0.518924,7.221962,-0.375034,1.738071,0.142919,-0.971460,...,-0.364963,0.072048,-0.554623,0.369749,-0.537699,-0.107136,1.171039,7.867631,0.039053,0.604688
3,-1.201507,0.149115,2.022468,6.021595,7.258670,2.792436,21.708519,-0.137913,1.649969,-0.754680,...,0.282499,1.129072,1.145883,1.803966,3.713470,2.687241,0.464015,9.096437,0.592963,4.733758
4,-0.100404,0.697461,0.625836,-0.298404,1.369898,3.254521,-1.659380,0.643531,0.902710,1.291877,...,2.625586,0.365598,0.282687,1.414874,-0.905282,2.540805,2.796179,5.900300,0.598366,0.032215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.206402,0.775128,0.076378,5.074299,6.765769,7.363028,1.967928,0.950564,-0.405055,0.549575,...,1.152012,-0.535550,-0.122142,-0.388705,1.423468,0.104979,1.125456,6.498184,0.737427,4.999094
9996,1.050847,1.098720,1.092681,2.451385,1.806154,4.689914,0.864083,0.555972,1.595025,-1.527038,...,2.125439,1.092389,0.774509,0.823105,-0.177216,-0.962447,2.188918,8.301915,0.609284,2.553094
9997,-0.781171,0.390308,1.455070,6.611130,9.191357,13.543301,18.752163,1.160042,0.833745,-0.279060,...,1.756792,0.556424,0.612829,1.443926,5.080153,-0.481556,0.090370,14.396953,0.384295,2.799502
9998,-0.058351,0.414793,1.335613,7.505342,7.036863,12.238686,1.967698,0.952533,-0.194902,1.559120,...,0.866976,0.836212,-0.515535,0.568552,3.527648,0.106547,1.128222,9.320018,0.062544,5.327307


In [324]:
cite_target_full_df =  pd.concat([ctarget_chunk_axis0, ctarget_chunk_axis1, ctarget_chunk_block0_items, ctarget_chunk_block0_values], axis=1)
cite_target_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,30,31,32,33,34,35,36,37,38,39
0,b'CD86',b'45006fe3e4c8',b'CD86',4.352104,-0.729717,12.194817,-1.497277,2.531128,3.359310,8.105627,...,-0.292643,15.632978,-0.929662,0.599275,-0.559571,0.542892,10.036562,2.655112,5.164501,13.087812
1,b'CD274',b'd02759a80ba2',b'CD274',7.806087,-0.402004,5.945415,2.549871,3.931399,-0.742468,14.692758,...,0.987164,8.994514,0.224537,0.003676,-0.183820,1.928215,4.894279,-0.695263,3.710673,3.159948
2,b'CD270',b'c016c6b0efa5',b'CD270',6.836812,-0.212401,-0.237459,0.683954,2.236428,1.479423,4.604853,...,0.578546,5.118548,1.454528,0.892154,0.033575,1.829587,3.188252,1.206259,0.356242,4.453875
3,b'CD155',b'ba7f733a4f75',b'CD155',3.207314,-0.010209,6.157664,0.019909,2.633278,0.310955,9.200804,...,0.384719,6.854942,1.604430,0.949199,0.265405,1.695040,9.186251,1.696285,5.857133,4.520548
4,b'CD112',b'fbcf2443ffb2',b'CD112',4.013659,1.728208,2.139688,1.209587,0.972724,2.696377,12.970881,...,1.263576,7.129957,1.501827,1.309801,0.232003,1.622582,6.510516,0.703951,2.997573,8.582087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,,,2.572666,-0.274680,8.204807,2.093819,1.092000,0.312089,5.842934,...,-0.509277,4.174660,1.409939,-0.380895,-0.135269,1.156800,4.790791,-0.150676,1.614694,2.441091
9996,,,,6.587154,1.528400,2.258496,1.400390,1.724300,1.169350,7.381717,...,0.819537,5.553139,1.560005,0.915513,-0.076554,1.295204,4.636050,1.076275,3.631060,4.923013
9997,,,,3.581842,1.446842,2.161012,0.765146,-0.621705,1.010290,7.237905,...,-1.137071,2.002238,-0.743587,0.265340,-0.149190,1.123985,4.239871,0.568489,2.008964,1.791940
9998,,,,2.571857,-0.275141,2.179144,0.297879,2.005047,1.039875,6.339834,...,0.251174,4.796071,-0.709057,0.194636,-0.135680,-0.506589,3.514225,0.600337,2.278924,1.131902


In [289]:
cite_train_full_df.columns = cite_train_full_df.columns.astype(str)
cite_train_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,990,991,992,993,994,995,996,997,998,999
0,b'ENSG00000121410_A1BG',b'45006fe3e4c8',b'ENSG00000121410_A1BG',0.000000,0.000000,0.000000,0.000000,4.090185,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,4.090185
1,b'ENSG00000268895_A1BG-AS1',b'd02759a80ba2',b'ENSG00000268895_A1BG-AS1',0.000000,0.000000,0.000000,0.000000,4.039545,0.000000,0.000000,...,4.039545,0.000000,4.039545,0.0,0.000000,0.0,0.0,0.0,0.000000,4.723850
2,b'ENSG00000175899_A2M',b'c016c6b0efa5',b'ENSG00000175899_A2M',0.000000,3.847321,0.000000,0.000000,3.847321,0.000000,4.931607,...,0.000000,4.529743,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,3.847321
3,b'ENSG00000245105_A2M-AS1',b'ba7f733a4f75',b'ENSG00000245105_A2M-AS1',3.436846,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,4.113780,4.513782,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,3.436846
4,b'ENSG00000166535_A2ML1',b'fbcf2443ffb2',b'ENSG00000166535_A2ML1',0.000000,0.000000,0.000000,4.196826,4.597264,4.196826,3.518610,...,0.000000,5.438786,4.196826,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,b'ENSG00000184206_GOLGA6L4',b'88a2f9e1d85a',b'ENSG00000184206_GOLGA6L4',0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,4.451163,3.375613,0.0,0.000000,0.0,0.0,0.0,0.000000,3.375613
9996,b'ENSG00000230373_GOLGA6L5P',b'5ae58a7d2d95',b'ENSG00000230373_GOLGA6L5P',0.000000,0.000000,0.000000,3.412074,4.488460,3.412074,0.000000,...,0.000000,4.088598,0.000000,0.0,0.000000,0.0,0.0,0.0,3.412074,3.412074
9997,b'ENSG00000261649_GOLGA6L7',b'9244230aa681',b'ENSG00000261649_GOLGA6L7',0.000000,0.000000,0.000000,3.910304,0.000000,3.236993,0.000000,...,0.000000,4.814501,0.000000,0.0,3.236993,0.0,0.0,0.0,0.000000,4.593383
9998,b'ENSG00000197978_GOLGA6L9',b'35d7180bafb0',b'ENSG00000197978_GOLGA6L9',0.000000,0.000000,0.000000,3.401583,5.165183,0.000000,0.000000,...,4.477731,4.984003,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,4.077930


In [290]:
cite_target_full_df.columns = cite_target_full_df.columns.astype(str)
cite_target_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,130,131,132,133,134,135,136,137,138,139
0,b'CD86',b'45006fe3e4c8',b'CD86',-0.700679,-0.480588,0.758890,4.005430,4.488583,10.441545,3.422814,...,0.037198,8.450324,-1.336935,0.710298,-0.411739,1.521677,4.254705,0.018501,1.559338,-0.249746
1,b'CD274',b'd02759a80ba2',b'CD274',-0.247911,0.092436,0.451228,3.167296,7.088810,2.611744,2.703693,...,-0.102393,4.415429,0.408973,-0.212225,-0.103553,0.806964,4.870988,0.232639,0.448099,3.518433
2,b'CD270',b'c016c6b0efa5',b'CD270',0.837538,0.529510,1.667610,7.541543,6.337766,5.939754,1.063424,...,0.345147,7.014535,0.654636,0.345590,-0.230989,1.856015,6.055012,-0.005114,2.770377,2.763580
3,b'CD155',b'ba7f733a4f75',b'CD155',-0.319230,0.351232,0.600684,3.066540,3.450701,4.806094,1.209885,...,0.479556,4.302376,0.312743,-0.538247,-0.152097,1.003596,3.251727,0.502906,0.005339,2.918778
4,b'CD112',b'fbcf2443ffb2',b'CD112',-0.055494,-0.899805,0.170888,3.838875,2.361532,7.665431,5.433636,...,0.575630,8.111736,0.126940,0.354106,0.027420,0.068219,3.385519,0.476991,2.102832,0.170595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,b'CD2',b'613f1d17569a',b'CD2',-0.259112,0.427318,-0.092760,1.275369,4.356452,4.179922,-0.251711,...,-0.363319,2.186473,0.393861,0.820339,-0.111177,0.487945,1.508934,0.915179,1.498958,1.828224
96,b'CD226',b'997cb42b095c',b'CD226',-0.501343,0.749696,0.593603,1.451426,1.633180,7.009009,5.983242,...,-0.396293,7.404981,-0.474646,-0.421708,-0.276057,-0.143001,1.481658,0.921807,1.289560,1.167890
97,b'CD29',b'21e74451fc57',b'CD29',1.175080,-0.504161,0.645865,4.516240,4.918075,5.803754,-0.047604,...,-0.176008,3.966429,1.110082,0.684169,-0.001234,0.365347,2.317305,0.033802,1.014779,2.044507
98,b'CD303',b'a7de418fbaa3',b'CD303',-0.399984,-0.477037,1.533079,4.430243,4.852415,9.177228,9.052576,...,-0.790629,10.507541,0.203781,0.115619,-0.207064,1.171972,1.619624,0.400356,0.737931,2.255992


### Data Preprocessing and Model Fitting

In [70]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(ctrain_chunk_block0_values, ctarget_chunk_block0_values, test_size=0.2, random_state=42)
# Train a model
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 3.116652250289917


In [71]:
# Standardize and apply PCA
X_all = ctrain_chunk_block0_values
y_all = ctarget_chunk_block0_values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_all)

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)
 
# Split the data
X_train, X_val, y_train, y_val = train_test_split(X_pca, y_all, test_size=0.2, random_state=42)
    
# Train and evaluate the model
ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)
    
y_pred = ridge_model.predict(X_val)
    
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
    
print(f'R^2 Score: {r2}')
print(f'Mean Squared Error: {mse}')

R^2 Score: 0.002407210537758715
Mean Squared Error: 3.137584924697876


### Concatenating Citeseq Test Data

In [85]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctest_chunk_axis0 = pd.DataFrame(data_slice, columns=['axis0'])
        print(f'Processed chunk {chunk_count} with shape: {ctest_chunk_axis0.shape}')
        chunk_count += 1
        if chunk_count > 1:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctest_chunk_axis0

# Data Processing in chunks
for ctest_chunk_axis0 in process_chunks('test_cite_inputs.h5', 'test_cite_inputs/axis0', row_chunk_size):
    pass

Processed chunk 0 with shape: (100, 1)
Processed chunk 1 with shape: (100, 1)
Stopping after processing 10 chunks for testing.


In [86]:
ctest_chunk_axis0

Unnamed: 0,axis0
0,b'ENSG00000121410_A1BG'
1,b'ENSG00000268895_A1BG-AS1'
2,b'ENSG00000175899_A2M'
3,b'ENSG00000245105_A2M-AS1'
4,b'ENSG00000166535_A2ML1'
...,...
95,b'ENSG00000175164_ABO'
96,b'ENSG00000159842_ABR'
97,b'ENSG00000146386_ABRACL'
98,b'ENSG00000163322_ABRAXAS1'


In [89]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctest_chunk_axis1 = pd.DataFrame(data_slice, columns=['axis1'])
        print(f'Processed chunk {chunk_count} with shape: {ctest_chunk_axis1.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctest_chunk_axis1

# Data Processing in chunks
for ctest_chunk_axis1 in process_chunks('test_cite_inputs.h5', 'test_cite_inputs/axis1', row_chunk_size):
    pass

Processed chunk 0 with shape: (100, 1)
Stopping after processing 10 chunks for testing.


In [90]:
ctest_chunk_axis1

Unnamed: 0,axis1
0,b'932005eec770'
1,b'af731637ea60'
2,b'f7ff5b90ad5c'
3,b'1c51e25d23ae'
4,b'16faa9f6a72f'
...,...
95,b'4dd7df88bc4a'
96,b'3aa145d60f6c'
97,b'504dffe33c39'
98,b'db2c3d2793f2'


In [93]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Reading chunk of data
            data_slice = dataset[start_row:end_row]
            yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 100

def process_chunks(file_path, dataset_name, row_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size):
        ctest_chunk_block0_items = pd.DataFrame(data_slice, columns=['block0_items'])
        print(f'Processed chunk {chunk_count} with shape: {ctest_chunk_block0_items.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctest_chunk_block0_items

# Data Processing in chunks
for ctest_chunk_block0_items in process_chunks('test_cite_inputs.h5', 'test_cite_inputs/block0_items', row_chunk_size):
    pass

Processed chunk 0 with shape: (100, 1)
Stopping after processing 10 chunks for testing.


In [94]:
ctest_chunk_block0_items

Unnamed: 0,block0_items
0,b'ENSG00000121410_A1BG'
1,b'ENSG00000268895_A1BG-AS1'
2,b'ENSG00000175899_A2M'
3,b'ENSG00000245105_A2M-AS1'
4,b'ENSG00000166535_A2ML1'
...,...
95,b'ENSG00000175164_ABO'
96,b'ENSG00000159842_ABR'
97,b'ENSG00000146386_ABRACL'
98,b'ENSG00000163322_ABRAXAS1'


In [73]:
def load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
    with h5py.File(file_path, 'r') as f:
        dataset = f[dataset_name]
        total_rows = dataset.shape[0]
        total_cols = dataset.shape[1]
        
        # Iterate over row chunks
        for start_row in range(0, total_rows, row_chunk_size):
            end_row = min(start_row + row_chunk_size, total_rows)
            
            # Iterate over column chunks
            for start_col in range(0, total_cols, col_chunk_size):
                end_col = min(start_col + col_chunk_size, total_cols)
                
                # Reading chunk of data
                data_slice = dataset[start_row:end_row, start_col:end_col]
                yield data_slice

# Adjusting chunk size (based on memory constraints)
row_chunk_size = 10000  
col_chunk_size = 100 

def process_chunks(file_path, dataset_name, row_chunk_size, col_chunk_size):
    chunk_count = 0
    for data_slice in load_h5_data_chunked(file_path, dataset_name, row_chunk_size, col_chunk_size):
        ctest_chunk_block0_values = pd.DataFrame(data_slice)
        print(f'Processed chunk {chunk_count} with shape: {ctest_chunk_block0_values.shape}')
        chunk_count += 1
        if chunk_count > 0:
            print('Stopping after processing 10 chunks for testing.')
            break
        yield ctest_chunk_block0_values

# Data Processing in chunks
for ctest_chunk_block0_values in process_chunks('test_cite_inputs.h5', 'test_cite_inputs/block0_values', row_chunk_size, col_chunk_size):
    pass

Processed chunk 0 with shape: (10000, 100)
Stopping after processing 10 chunks for testing.


In [74]:
ctest_chunk_block0_values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,4.090185,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,3.847321,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.436846,0.0,0.0,0.0,0.0,...,0.0,0.0,3.436846,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,3.518610,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,3.651413,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
cite_test_full_df =  pd.concat([ctest_chunk_axis0, ctest_chunk_axis1, ctest_chunk_block0_items, ctest_chunk_block0_values], axis=1)
cite_test_full_df

Unnamed: 0,axis0,axis1,block0_items,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
0,b'ENSG00000121410_A1BG',b'932005eec770',b'ENSG00000121410_A1BG',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,4.090185,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,b'ENSG00000268895_A1BG-AS1',b'af731637ea60',b'ENSG00000268895_A1BG-AS1',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,b'ENSG00000175899_A2M',b'f7ff5b90ad5c',b'ENSG00000175899_A2M',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,3.847321,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,b'ENSG00000245105_A2M-AS1',b'1c51e25d23ae',b'ENSG00000245105_A2M-AS1',0.0,0.0,0.0,0.0,0.0,3.436846,0.0,...,0.0,0.0,3.436846,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,b'ENSG00000166535_A2ML1',b'16faa9f6a72f',b'ENSG00000166535_A2ML1',0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,3.518610,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,,,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,,,,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,,,,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,,,,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,3.651413,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### CITEseq Model Validation

In [76]:
# Making predictions on the test set
test_predictions = model.predict(ctest_chunk_block0_values)

In [77]:
# Loading evaluation IDs
evaluation_ids = pd.read_csv('evaluation_ids.csv')

# Predictions to a DataFrame
test_predictions_df = pd.DataFrame(test_predictions, index=ctest_chunk_block0_values.index)

# Flattening the DataFrame to match the evaluation IDs
test_predictions_flat = test_predictions_df.unstack().reset_index()
test_predictions_flat.columns = ['gene_id', 'cell_id', 'surface_protein_predicted_value']

evaluation_ids['cell_id'] = evaluation_ids['cell_id'].astype(str)
test_predictions_flat['cell_id'] = test_predictions_flat['cell_id'].astype(str)

# 'gene_id' columns converted to string
evaluation_ids['gene_id'] = evaluation_ids['gene_id'].astype(str)
test_predictions_flat['gene_id'] = test_predictions_flat['gene_id'].astype(str)

# Merging with evaluation IDs to create the submission file
cite_submission_df = evaluation_ids.merge(test_predictions_flat, on=['cell_id', 'gene_id'], how='left')

# Filling missing values
cite_submission_df['surface_protein_predicted_value'].fillna(0, inplace=True)

# Final submission file
cite_submission_df[['row_id', 'surface_protein_predicted_value']].to_csv('submission.csv', index=False)

In [78]:
cite_submission_df

Unnamed: 0,row_id,cell_id,gene_id,surface_protein_predicted_value
0,0,c2150f55becb,CD86,0.0
1,1,c2150f55becb,CD274,0.0
2,2,c2150f55becb,CD270,0.0
3,3,c2150f55becb,CD155,0.0
4,4,c2150f55becb,CD112,0.0
...,...,...,...,...
65744175,65744175,2c53aa67933d,ENSG00000134419,0.0
65744176,65744176,2c53aa67933d,ENSG00000186862,0.0
65744177,65744177,2c53aa67933d,ENSG00000170959,0.0
65744178,65744178,2c53aa67933d,ENSG00000107874,0.0
