In [1]:
!pip install tables

[0m

In [2]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse

In [3]:
# Directory holding the original competition files.
DATA_DIR = "/kaggle/input/open-problems-multimodal/"
# Directory holding the Multiome tables pre-converted to sparse matrices.
# Kept as a single constant so the dataset location is changed in one place.
SPARSE_DIR = "../input/multimodal-single-cell-as-sparse-matrix"

FP_CELL_METADATA = os.path.join(DATA_DIR, "metadata.csv")

# CITEseq tables (dense HDF5 files)
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR, "test_cite_inputs.h5")

# Multiome tables (dense HDF5 files)
FP_MULT_TRAIN_INPUTS = os.path.join(DATA_DIR, "train_multi_inputs.h5")
FP_MULT_TRAIN_TARGETS = os.path.join(DATA_DIR, "train_multi_targets.h5")
FP_MULT_TEST_INPUTS = os.path.join(DATA_DIR, "test_multi_inputs.h5")

# Multiome tables as sparse matrices: "*_idxcol.npz" stores the row/column
# labels and "*_values.sparse.npz" stores the matrix values.
FP_MULT_TRAIN_TARGETS_idx = SPARSE_DIR + "/train_multi_targets_idxcol.npz"
FP_MULT_TRAIN_TARGETS_sparse = SPARSE_DIR + "/train_multi_targets_values.sparse.npz"
FP_MULT_TRAIN_INPUTS_idx = SPARSE_DIR + "/train_multi_inputs_idxcol.npz"
FP_MULT_TRAIN_INPUTS_sparse = SPARSE_DIR + "/train_multi_inputs_values.sparse.npz"
FP_MULT_TEST_INPUTS_idx = SPARSE_DIR + "/test_multi_inputs_idxcol.npz"
FP_MULT_TEST_INPUTS_sparse = SPARSE_DIR + "/test_multi_inputs_values.sparse.npz"

FP_SUBMISSION = os.path.join(DATA_DIR, "sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR, "evaluation_ids.csv")

# Multiome, light exploration

NOTE: The Multiome data is too large to load directly, so it is first reduced to sparse matrices in a separate preprocessing step.

The process to do this is outlined in this notebook by Fabien Crom: https://www.kaggle.com/code/fabiencrom/multimodal-single-cell-creating-sparse-data/

The data output from that notebook is given here by Fabien Crom:
https://www.kaggle.com/datasets/fabiencrom/multimodal-single-cell-as-sparse-matrix/code

Essentially, the method is to read the data in chunks, convert each chunk to a sparse matrix, and combine the chunks into one sparse matrix that is saved to disk.

## Multiome Targets

First, the targets of the Multiome data can be lightly explored to see what has to be predicted.

In [4]:
%%time
# Load the sparse Multiome training targets and their column labels
# (about 25 s wall time).
Multi_targets = scipy.sparse.load_npz(FP_MULT_TRAIN_TARGETS_sparse)

# np.load returns an NpzFile that keeps the archive open, so it is read inside
# a context manager to make sure the file handle is closed again.
# NOTE: allow_pickle=True is needed because the labels are stored as object
# arrays; only enable it for files from a trusted source.
with np.load(FP_MULT_TRAIN_TARGETS_idx, allow_pickle=True) as targets_labels:
    Multi_targets_idx = targets_labels["columns"]

CPU times: user 16.3 s, sys: 2.45 s, total: 18.7 s
Wall time: 25.2 s


In [5]:
# In order to explore the rows and columns of the training targets, first
# print which keys (arrays) the label .npz file defines.
# The archive is opened in a context manager so its file handle is closed.
with np.load(FP_MULT_TRAIN_TARGETS_idx, allow_pickle=True) as targets_labels:
    print(targets_labels.files)

# The printed keys show that the file provides index (row) names and column
# names, which can be read while the archive is open, e.g.
# targets_labels['index']

['index', 'columns']


In [6]:
# Checking the shapes to make sure the labels match the matrix.
# There are 105942 rows, each row being a cell uniquely
# defined by a 12 character long cell_id.
# There are 23418 columns, each column being a gene
# uniquely defined by a gene_id.
# The archive is read in a context manager so its file handle is closed.
with np.load(FP_MULT_TRAIN_TARGETS_idx, allow_pickle=True) as targets_labels:
    n_target_cell_ids = len(targets_labels["index"])

Multi_targets.shape, n_target_cell_ids, Multi_targets_idx.shape

((105942, 23418), 105942, (23418,))

In [7]:
# The row labels of the training targets are previewed below.
# The label file is read once (instead of once per display call) and closed
# again as soon as the array has been extracted.
with np.load(FP_MULT_TRAIN_TARGETS_idx, allow_pickle=True) as targets_labels:
    target_cell_ids = targets_labels["index"]

display(len(target_cell_ids))
display(target_cell_ids)

# each row is a 12 character id uniquely defining a cell

105942

array(['56390cf1b95e', 'fc0c60183c33', '9b4a87e22ad0', ...,
       '00783f28b463', 'e7abb1a0f251', '193992d571a5'], dtype=object)

In [8]:
# The column labels of the training targets are previewed below.
# The label file is read once (instead of once per display call) and closed
# again as soon as the array has been extracted.
with np.load(FP_MULT_TRAIN_TARGETS_idx, allow_pickle=True) as targets_labels:
    target_gene_ids = targets_labels["columns"]

display(len(target_gene_ids))
display(target_gene_ids)

# each column is a gene, identified by its ENSG... gene id

23418

array(['ENSG00000121410', 'ENSG00000268895', 'ENSG00000175899', ...,
       'ENSG00000162378', 'ENSG00000159840', 'ENSG00000074755'],
      dtype=object)

In [9]:
# Print the stored entries of the first row of the target matrix: the
# expected output is one floating point number for each
# (cell_id, gene_id) pair.
first_target_row = Multi_targets[0, :]
print(first_target_row)

# These floating point values are RNA gene expression levels, defined as
# library-size normalized and log1p transformed counts.
# A larger value means more (weighted) reads, i.e. higher expression.

  (0, 12)	4.893861
  (0, 13)	5.583255
  (0, 14)	4.893861
  (0, 23)	4.893861
  (0, 34)	4.893861
  (0, 44)	5.583255
  (0, 58)	4.893861
  (0, 64)	4.893861
  (0, 65)	4.893861
  (0, 66)	5.987466
  (0, 67)	4.893861
  (0, 86)	4.893861
  (0, 87)	4.893861
  (0, 92)	4.893861
  (0, 94)	4.893861
  (0, 96)	4.893861
  (0, 101)	4.893861
  (0, 108)	5.583255
  (0, 111)	5.583255
  (0, 113)	4.893861
  (0, 240)	4.893861
  (0, 359)	4.893861
  (0, 407)	6.27452
  (0, 411)	4.893861
  (0, 419)	4.893861
  :	:
  (0, 23278)	4.893861
  (0, 23297)	5.583255
  (0, 23301)	4.893861
  (0, 23312)	4.893861
  (0, 23315)	5.583255
  (0, 23320)	4.893861
  (0, 23321)	4.893861
  (0, 23325)	5.987466
  (0, 23327)	4.893861
  (0, 23341)	5.987466
  (0, 23352)	4.893861
  (0, 23354)	5.987466
  (0, 23355)	4.893861
  (0, 23356)	5.583255
  (0, 23360)	4.893861
  (0, 23363)	4.893861
  (0, 23364)	4.893861
  (0, 23365)	4.893861
  (0, 23375)	6.679357
  (0, 23378)	5.583255
  (0, 23381)	4.893861
  (0, 23395)	4.893861
  (0, 23410)	4.893861
  (0,

In conclusion, for the Multiome targets the gene expression level (a floating point number) must be predicted for every gene of every cell.

## Multiome inputs

Next, the inputs for the multiome can be explored to see what parameters may be used for prediction

In [10]:
%%time
# Load the sparse Multiome training inputs and their column labels
# (about 1 min 4 s wall time).
Multi_inputs = scipy.sparse.load_npz(FP_MULT_TRAIN_INPUTS_sparse)

# np.load returns an NpzFile that keeps the archive open, so it is read inside
# a context manager to make sure the file handle is closed again.
# NOTE: allow_pickle=True is needed because the labels are stored as object
# arrays; only enable it for files from a trusted source.
with np.load(FP_MULT_TRAIN_INPUTS_idx, allow_pickle=True) as inputs_labels:
    Multi_inputs_idx = inputs_labels["columns"]

CPU times: user 36.3 s, sys: 5.16 s, total: 41.5 s
Wall time: 1min 4s


In [11]:
# Checking the shapes to make sure the labels match the matrix.
# There are 105942 rows, each row being a cell uniquely
# defined by a 12 character long cell_id. This is the same
# number of cell_ids as in the training targets.
# There are 228942 columns, each column being a genomic
# coordinate where chromatin accessibility was measured.
# The archive is read in a context manager so its file handle is closed.
with np.load(FP_MULT_TRAIN_INPUTS_idx, allow_pickle=True) as inputs_labels:
    n_input_cell_ids = len(inputs_labels["index"])

Multi_inputs.shape, n_input_cell_ids, Multi_inputs_idx.shape

((105942, 228942), 105942, (228942,))

In [12]:
# In order to explore the rows and columns of the training inputs, first
# print which keys (arrays) the label .npz file defines.
# The archive is opened in a context manager so its file handle is closed.
with np.load(FP_MULT_TRAIN_INPUTS_idx, allow_pickle=True) as inputs_labels:
    print(inputs_labels.files)

# The printed keys show that the file provides index (row) names and column
# names, which can be read while the archive is open, e.g.
# inputs_labels['index']

['index', 'columns']


In [13]:
# Checking that the cell_ids for the training inputs and targets
# match exactly (both in number and order).
with np.load(FP_MULT_TRAIN_INPUTS_idx, allow_pickle=True) as inputs_labels:
    train_input_cell_ids = inputs_labels["index"]
with np.load(FP_MULT_TRAIN_TARGETS_idx, allow_pickle=True) as targets_labels:
    train_target_cell_ids = targets_labels["index"]

# np.array_equal compares shape and contents in one vectorized call and simply
# returns False when the lengths differ, unlike summing an elementwise
# comparison with the builtin sum().
(len(train_input_cell_ids) == Multi_inputs.shape[0]
 and np.array_equal(train_input_cell_ids, train_target_cell_ids))

# An output of True means that the training input and training target
# data have their cell_ids (rows) ordered exactly the same

True

In [14]:
# The row labels of the training inputs are previewed below.
# The label file is read once (instead of once per display call) and closed
# again as soon as the array has been extracted.
with np.load(FP_MULT_TRAIN_INPUTS_idx, allow_pickle=True) as inputs_labels:
    train_input_cell_ids = inputs_labels["index"]

display(len(train_input_cell_ids))
display(train_input_cell_ids)

# each row is a 12 character id uniquely defining a cell

105942

array(['56390cf1b95e', 'fc0c60183c33', '9b4a87e22ad0', ...,
       '00783f28b463', 'e7abb1a0f251', '193992d571a5'], dtype=object)

In [15]:
# The column labels of the training inputs are previewed below.
# The label file is read once (instead of once per display call) and closed
# again as soon as the array has been extracted.
with np.load(FP_MULT_TRAIN_INPUTS_idx, allow_pickle=True) as inputs_labels:
    train_input_coordinates = inputs_labels["columns"]

display(len(train_input_coordinates))
display(train_input_coordinates)

# each column is a location on the genome where the chromatin accessibility
# was measured

228942

array(['GL000194.1:114519-115365', 'GL000194.1:55758-56597',
       'GL000194.1:58217-58957', ..., 'chrY:7836768-7837671',
       'chrY:7869454-7870371', 'chrY:7873814-7874709'], dtype=object)

In [16]:
# Print the stored entries of the first training-input row; the values can
# clearly be seen to be floating point numbers.
first_input_row = Multi_inputs[0, :]
print(first_input_row)

# Each value of the input is a floating point number indicating the
# chromatin accessibility

  (0, 121)	2.7798393
  (0, 132)	1.8324159
  (0, 178)	1.3441861
  (0, 206)	2.7048109
  (0, 281)	1.993638
  (0, 304)	1.0189339
  (0, 342)	1.3371854
  (0, 349)	1.2167372
  (0, 352)	1.478264
  (0, 364)	1.2530675
  (0, 438)	1.4439429
  (0, 440)	2.128663
  (0, 458)	1.9236114
  (0, 470)	4.4668913
  (0, 471)	3.1427772
  (0, 478)	4.1509223
  (0, 514)	1.5360577
  (0, 538)	7.6406264
  (0, 606)	2.337737
  (0, 677)	2.6183214
  (0, 678)	2.8350036
  (0, 681)	2.24269
  (0, 700)	3.1052935
  (0, 706)	2.7141986
  (0, 729)	0.8664662
  :	:
  (0, 228003)	5.713204
  (0, 228076)	3.9001691
  (0, 228103)	1.2692189
  (0, 228127)	4.1223474
  (0, 228207)	5.229238
  (0, 228212)	3.8581614
  (0, 228227)	2.949207
  (0, 228276)	2.2193623
  (0, 228319)	4.2803874
  (0, 228345)	1.8020262
  (0, 228368)	1.5879754
  (0, 228510)	3.7330737
  (0, 228519)	1.9850894
  (0, 228599)	1.7331359
  (0, 228661)	3.651702
  (0, 228664)	4.654979
  (0, 228669)	4.922051
  (0, 228676)	4.840466
  (0, 228713)	3.6842017
  (0, 228714)	2.3858082
  

In [17]:
%%time
# Load the sparse Multiome test inputs and their column labels
# (about 37 s wall time).
Multi_inputs_test = scipy.sparse.load_npz(FP_MULT_TEST_INPUTS_sparse)

# np.load returns an NpzFile that keeps the archive open, so it is read inside
# a context manager to make sure the file handle is closed again.
# NOTE: allow_pickle=True is needed because the labels are stored as object
# arrays; only enable it for files from a trusted source.
with np.load(FP_MULT_TEST_INPUTS_idx, allow_pickle=True) as test_labels:
    Multi_inputs_idx_test = test_labels["columns"]

CPU times: user 21.2 s, sys: 2.99 s, total: 24.2 s
Wall time: 37.4 s


In [18]:
# In order to explore the rows and columns of the testing inputs, first
# print which keys (arrays) the label .npz file defines.
# The archive is opened in a context manager so its file handle is closed.
with np.load(FP_MULT_TEST_INPUTS_idx, allow_pickle=True) as test_labels:
    print(test_labels.files)

# The printed keys show that the file provides index (row) names and column
# names, which can be read while the archive is open, e.g.
# test_labels['index']

['index', 'columns']


In [19]:
# Checking the shapes to make sure the labels match the matrix.
# There are 55935 rows, each row being a cell uniquely
# defined by a 12 character long cell_id.
# There are 228942 columns, each column being a genomic
# coordinate where chromatin accessibility was measured.
# The archive is read in a context manager so its file handle is closed.
with np.load(FP_MULT_TEST_INPUTS_idx, allow_pickle=True) as test_labels:
    n_test_cell_ids = len(test_labels["index"])

Multi_inputs_test.shape, n_test_cell_ids, Multi_inputs_idx_test.shape

((55935, 228942), 55935, (228942,))

In [20]:
# Checking that the genomic locations for the training inputs and testing
# inputs match exactly (both in number and order).
with np.load(FP_MULT_TRAIN_INPUTS_idx, allow_pickle=True) as train_labels:
    train_coordinates = train_labels["columns"]
with np.load(FP_MULT_TEST_INPUTS_idx, allow_pickle=True) as test_labels:
    test_coordinates = test_labels["columns"]

# np.array_equal compares shape and contents in one vectorized call and simply
# returns False when the lengths differ, unlike summing an elementwise
# comparison with the builtin sum().
(len(train_coordinates) == Multi_inputs.shape[1]
 and np.array_equal(train_coordinates, test_coordinates))

# An output of True means that the training input and testing input
# data have their genomic coordinates (columns) ordered exactly the same

True

In [21]:
# The row labels of the testing inputs are previewed below.
# The label file is read once (instead of once per display call) and closed
# again as soon as the array has been extracted.
with np.load(FP_MULT_TEST_INPUTS_idx, allow_pickle=True) as test_labels:
    test_input_cell_ids = test_labels["index"]

display(len(test_input_cell_ids))
display(test_input_cell_ids)

# each row is a 12 character id uniquely defining a cell

55935

array(['458c2ae2c9b1', '01a0659b0710', '028a8bc3f2ba', ...,
       '05666c99aa48', '121f946642b5', 'b847ba21f59f'], dtype=object)

In [22]:
# The column labels of the testing inputs are previewed below.
# The label file is read once (instead of once per display call) and closed
# again as soon as the array has been extracted.
with np.load(FP_MULT_TEST_INPUTS_idx, allow_pickle=True) as test_labels:
    test_input_coordinates = test_labels["columns"]

display(len(test_input_coordinates))
display(test_input_coordinates)

# each column is a location on the genome where the chromatin accessibility
# was measured

228942

array(['GL000194.1:114519-115365', 'GL000194.1:55758-56597',
       'GL000194.1:58217-58957', ..., 'chrY:7836768-7837671',
       'chrY:7869454-7870371', 'chrY:7873814-7874709'], dtype=object)

In [23]:
# Print the stored entries of the first testing-input row; the values can
# clearly be seen to be floating point numbers.
first_test_input_row = Multi_inputs_test[0, :]
print(first_test_input_row)

# Each value of the input is a floating point number indicating the
# chromatin accessibility

  (0, 83)	3.809964
  (0, 224)	4.269998
  (0, 230)	4.9450355
  (0, 304)	1.3452605
  (0, 308)	4.9980745
  (0, 340)	2.5276005
  (0, 354)	1.4460272
  (0, 361)	4.420143
  (0, 392)	1.7801597
  (0, 401)	1.6961472
  (0, 437)	1.7614198
  (0, 457)	3.7477815
  (0, 463)	4.600738
  (0, 515)	4.43197
  (0, 533)	6.7457433
  (0, 571)	2.419014
  (0, 685)	1.6049813
  (0, 910)	6.4601045
  (0, 1013)	1.8260169
  (0, 1048)	2.419014
  (0, 1129)	6.9834394
  (0, 1409)	6.7320933
  (0, 1635)	5.8816104
  (0, 1679)	2.410658
  (0, 1847)	4.397452
  :	:
  (0, 226029)	3.3689616
  (0, 226119)	7.100891
  (0, 226354)	3.7509642
  (0, 226360)	1.3939122
  (0, 226404)	5.1971364
  (0, 226537)	3.9367533
  (0, 226543)	3.1011198
  (0, 226740)	5.3450637
  (0, 226869)	3.7084696
  (0, 226877)	7.5941153
  (0, 226972)	4.8288417
  (0, 227056)	2.9302938
  (0, 227080)	2.1274364
  (0, 227157)	3.3456414
  (0, 227192)	2.2589688
  (0, 227196)	1.5066074
  (0, 227231)	4.708414
  (0, 227265)	2.6101677
  (0, 227280)	2.5098143
  (0, 227418)	4.213

In conclusion, the Multiome inputs give the chromatin accessibility of many cells at many genomic coordinates. Chromatin is essentially a convenient package of DNA, and chromatin accessibility describes how open (that is, how loosely compacted) this package is at a given location. Tightly packed chromatin can be uncoiled into euchromatin, which can be read by DNA-reading proteins so that genes are turned on. Hence, it makes sense that more accessible chromatin would correspond to higher gene expression. In fact, chromatin accessibility is directly related to gene expression (https://www.frontiersin.org/articles/10.3389/fgene.2020.618478/full). 

Helpful video: https://www.youtube.com/watch?v=6Z8aQhV_aD4

# CITEseq, light exploration

## CITEseq Targets

In [24]:
%%time
# Read the CITEseq training targets from HDF5 (takes about one second).
Y = pd.read_hdf(path_or_buf=FP_CITE_TRAIN_TARGETS, mode="r")

CPU times: user 260 ms, sys: 104 ms, total: 364 ms
Wall time: 855 ms


In [25]:
# Previewing the training targets shows that each row is a cell,
# uniquely defined by its cell_id, and each column is a surface
# protein (listed under the `gene_id` column header).
# Each value is a floating point number: the surface protein
# level, which has been dsb normalized.

# There are 70988 rows for 70988 different cells
# and 140 columns for 140 different surface proteins
Y

gene_id,CD86,CD274,CD270,CD155,CD112,CD47,CD48,CD40,CD154,CD52,...,CD94,CD162,CD85j,CD23,CD328,HLA-E,CD82,CD101,CD88,CD224
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,1.167804,0.622530,0.106959,0.324989,3.331674,6.426002,1.480766,-0.728392,-0.468851,-0.073285,...,-0.448390,3.220174,-0.533004,0.674956,-0.006187,0.682148,1.398105,0.414292,1.780314,0.548070
d02759a80ba2,0.818970,0.506009,1.078682,6.848758,3.524885,5.279456,4.930438,2.069372,0.333652,-0.468088,...,0.323613,8.407108,0.131301,0.047607,-0.243628,0.547864,1.832587,0.982308,2.736507,2.184063
c016c6b0efa5,-0.356703,-0.422261,-0.824493,1.137495,0.518924,7.221962,-0.375034,1.738071,0.142919,-0.971460,...,1.348692,4.888579,-0.279483,-0.131097,-0.177604,-0.689188,9.013709,-1.182975,3.958148,2.868600
ba7f733a4f75,-1.201507,0.149115,2.022468,6.021595,7.258670,2.792436,21.708519,-0.137913,1.649969,-0.754680,...,1.504426,12.391979,0.511394,0.587863,-0.752638,1.714851,3.893782,1.799661,1.537249,4.407671
fbcf2443ffb2,-0.100404,0.697461,0.625836,-0.298404,1.369898,3.254521,-1.659380,0.643531,0.902710,1.291877,...,0.777023,6.496499,0.279898,-0.841950,-0.869419,0.675092,5.259685,-0.835379,9.631781,1.765445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650ee456f0f3,0.905420,0.386141,0.961590,5.090580,2.854346,6.093729,-0.586178,0.452389,0.040806,0.191407,...,1.261118,3.092832,0.003275,0.278930,-0.272002,0.249477,3.789460,0.138330,1.466193,4.278504
cc506e7707f5,2.101247,2.117462,0.112699,2.065512,2.176803,3.900090,-0.586001,-0.175479,1.363232,0.109905,...,0.714624,5.029233,0.909861,0.057322,2.633387,1.340077,11.456146,-1.431453,5.275882,2.510530
a91f1b55a520,1.221313,0.476566,1.437551,5.135631,2.926102,1.615081,-0.586910,1.760421,1.944711,-0.095096,...,-0.176027,5.027534,-0.703609,1.139491,-0.078092,1.592960,9.358179,0.981883,6.911032,3.415310
3a9882c98205,-0.151433,-0.850024,0.461556,3.546561,1.996473,5.702821,0.883038,1.309014,1.029737,-0.072851,...,-0.484493,12.883892,1.579381,-0.382835,-0.065286,-0.021458,7.372662,1.010247,1.864805,3.449289


In conclusion, for the CITEseq targets the surface protein level (a floating point number) must be predicted for every protein of every cell.

## CITEseq Inputs

In [26]:
%%time
# Read the dense CITEseq input tables from HDF5; this is slow (the recorded
# run below took about 1 min 43 s wall time).
X = pd.read_hdf(path_or_buf=FP_CITE_TRAIN_INPUTS, mode="r")  # training inputs
X_test = pd.read_hdf(path_or_buf=FP_CITE_TEST_INPUTS, mode="r")  # testing inputs

CPU times: user 53.6 s, sys: 19.5 s, total: 1min 13s
Wall time: 1min 43s


In [27]:
# Training inputs:

# Each row is a cell, uniquely defined by its cell_id, and each
# column is a gene, uniquely defined by its gene_id.
# Each value is a floating point number: the gene expression
# level, i.e. the RNA count after library-size normalization
# and a log1p transform.

# There are 70988 rows for 70988 different cells
# and 22050 columns for 22050 different genes
X

gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
45006fe3e4c8,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,4.090185,0.000000
d02759a80ba2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,4.039545,0.0,0.0,0.000000,0.000000,0.000000,0.000000
c016c6b0efa5,0.0,0.0,0.0,0.0,0.0,3.847321,0.000000,3.847321,3.847321,0.000000,...,0.000000,0.000000,3.847321,4.529743,0.0,0.0,0.000000,3.847321,3.847321,0.000000
ba7f733a4f75,0.0,0.0,0.0,0.0,0.0,0.000000,3.436846,3.436846,0.000000,0.000000,...,3.436846,0.000000,4.113780,5.020215,0.0,0.0,0.000000,3.436846,4.113780,0.000000
fbcf2443ffb2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.196826,0.000000,0.000000,...,0.000000,4.196826,4.196826,4.196826,0.0,0.0,3.518610,4.196826,3.518610,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650ee456f0f3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,4.397535,4.397535,5.084510,0.0,0.0,0.000000,0.000000,4.397535,4.397535
cc506e7707f5,0.0,0.0,0.0,0.0,0.0,0.000000,3.981467,4.665241,0.000000,0.000000,...,3.981467,0.000000,4.665241,3.981467,0.0,0.0,0.000000,0.000000,3.981467,0.000000
a91f1b55a520,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.497696,0.000000,4.497696,...,0.000000,0.000000,0.000000,4.497696,0.0,0.0,3.815622,4.497696,0.000000,0.000000
3a9882c98205,0.0,0.0,0.0,0.0,0.0,0.000000,3.900907,0.000000,0.000000,4.583891,...,0.000000,0.000000,4.583891,4.985945,0.0,0.0,0.000000,0.000000,0.000000,3.900907


In [28]:
# Checking that the cell_ids of the training inputs and the training
# targets are the same. Index.equals compares both the elements and their
# order, and returns False (instead of raising) when the lengths differ.
X.index.equals(Y.index)

# A result of True means all the cell_ids
# are present in both and in the same order

True

In [29]:
# Testing inputs:

# Each row is a cell, uniquely defined by its cell_id, and each
# column is a gene, uniquely defined by its gene_id.
# Each value is a floating point number: the gene expression
# level, i.e. the RNA count after library-size normalization
# and a log1p transform.

# There are 48663 rows for 48663 different cells
# and 22050 columns for 22050 different genes
X_test

gene_id,ENSG00000121410_A1BG,ENSG00000268895_A1BG-AS1,ENSG00000175899_A2M,ENSG00000245105_A2M-AS1,ENSG00000166535_A2ML1,ENSG00000128274_A4GALT,ENSG00000094914_AAAS,ENSG00000081760_AACS,ENSG00000109576_AADAT,ENSG00000103591_AAGAB,...,ENSG00000153975_ZUP1,ENSG00000086827_ZW10,ENSG00000174442_ZWILCH,ENSG00000122952_ZWINT,ENSG00000198205_ZXDA,ENSG00000198455_ZXDB,ENSG00000070476_ZXDC,ENSG00000162378_ZYG11B,ENSG00000159840_ZYX,ENSG00000074755_ZZEF1
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c2150f55becb,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,4.090185,0.000000
65b7edf8a4da,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,4.039545,0.000000,0.0,0.00000,0.000000,0.000000,0.000000
c1b26cb1057b,0.0,0.0,0.0,0.0,0.0,3.847321,0.000000,3.847321,3.847321,0.000000,...,0.000000,0.000000,3.847321,4.529743,0.000000,0.0,0.00000,3.847321,3.847321,0.000000
917168fa6f83,0.0,0.0,0.0,0.0,0.0,0.000000,3.436846,3.436846,0.000000,0.000000,...,3.436846,0.000000,4.113780,5.020215,0.000000,0.0,0.00000,3.436846,4.113780,0.000000
2b29feeca86d,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.196826,0.000000,0.000000,...,0.000000,4.196826,4.196826,4.196826,0.000000,0.0,3.51861,4.196826,3.518610,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
a9b4d99f1f50,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,3.719836,...,0.000000,0.000000,0.000000,3.719836,0.000000,0.0,0.00000,0.000000,3.719836,0.000000
0e2c1d0782af,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,4.026206,0.000000,...,4.026206,0.000000,0.000000,4.026206,0.000000,0.0,0.00000,0.000000,4.710393,0.000000
a3cbc5aa0ec3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,4.306634,0.000000,0.000000,0.0,0.00000,4.306634,6.933096,4.993019
75b350243add,0.0,0.0,0.0,0.0,0.0,0.000000,3.624848,3.624848,0.000000,0.000000,...,0.000000,0.000000,0.000000,3.624848,3.624848,0.0,0.00000,3.624848,0.000000,0.000000


In conclusion, the CITEseq inputs give the gene expression of various genes for various cells. When a gene is expressed, it means that the gene is doing something rather than being inactive, so gene expression is more or less how active a particular gene is (say, in allowing for protein creation). For this problem, the given gene expression will be used to predict protein levels. This makes sense because some genes provide the instructions for creating a protein, so if those genes are expressed (active), one would expect more of that protein and therefore a higher protein level.