In [1]:
import numpy as np
import pandas as pd
import os

## Submission template

In [2]:
## load template
template_df = pd.read_csv('../data/sample_submission.csv')
assert template_df.shape[0] == 255
template_df.head()

Unnamed: 0,id,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
gene_list = template_df.columns[1:].tolist()

In [4]:
len(gene_list)

18211

In [5]:
gene2idx = dict()
idx2gene = dict()

for idx, gene in enumerate(gene_list):
    gene2idx[gene] = idx
    idx2gene[idx] = gene

del idx, gene

## Multiome data

In [6]:
df = pd.read_parquet('../data/multiome_train.parquet')
print(df.shape)
df.head()

(216251368, 4)


Unnamed: 0,obs_id,location,count,normalized_count
0,000225c1151ab841,AAMP,1,6.320659
1,000225c1151ab841,AASS,1,6.320659
2,000225c1151ab841,ABCC11,1,6.320659
3,000225c1151ab841,ABCC2,1,6.320659
4,000225c1151ab841,ABR,1,6.320659


In [7]:
df = df.drop(columns=['count'])
df.head()

Unnamed: 0,obs_id,location,normalized_count
0,000225c1151ab841,AAMP,6.320659
1,000225c1151ab841,AASS,6.320659
2,000225c1151ab841,ABCC11,6.320659
3,000225c1151ab841,ABCC2,6.320659
4,000225c1151ab841,ABR,6.320659


In [8]:
df = df[df['location'].isin(gene_list)]
df.head()

Unnamed: 0,obs_id,location,normalized_count
0,000225c1151ab841,AAMP,6.320659
1,000225c1151ab841,AASS,6.320659
3,000225c1151ab841,ABCC2,6.320659
4,000225c1151ab841,ABR,6.320659
5,000225c1151ab841,ABRAXAS2,6.320659


In [9]:
df.shape

(40845122, 3)

In [10]:
print(min(df['normalized_count'].unique()))
print(max(df['normalized_count'].unique()))

3.9328556
13.086681


## Look up cell types

In [11]:
meta = pd.read_csv('../data/multiome_obs_meta.csv')
meta = meta.drop(columns=['donor_id'])
print(meta.shape)
meta.head()

(25551, 2)


Unnamed: 0,obs_id,cell_type
0,000225c1151ab841,B cells
1,0003c40a54367871,T cells CD4+
2,0004bf574b822c3c,T cells CD4+
3,000d59b5478f28e2,B cells
4,0011b7473923d7b5,NK cells


In [12]:
meta['obs_id'].nunique()

25551

In [13]:
obsid2celltype = dict()

for i in range(meta.shape[0]):
    obs_id = meta.iloc[i]['obs_id']
    cell_type = meta.iloc[i]['cell_type']
    obsid2celltype[obs_id] = cell_type

In [14]:
meta['cell_type'].value_counts()

cell_type
T cells CD4+          9371
Myeloid cells         6874
NK cells              4497
B cells               3809
T cells CD8+           508
T regulatory cells     492
Name: count, dtype: int64

In [15]:
len(np.setdiff1d(df['obs_id'].unique(), meta['obs_id'].unique()))

0

In [16]:
len(np.setdiff1d(meta['obs_id'].unique(), df['obs_id'].unique()))

0

In [17]:
meta['cell_type'].value_counts().to_dict()

{'T cells CD4+': 9371,
 'Myeloid cells': 6874,
 'NK cells': 4497,
 'B cells': 3809,
 'T cells CD8+': 508,
 'T regulatory cells': 492}

In [18]:
type(meta['cell_type'].value_counts().to_dict()['B cells'])

int

In [19]:
celltype_list = meta['cell_type'].unique().tolist()
celltype_list

['B cells',
 'T cells CD4+',
 'NK cells',
 'Myeloid cells',
 'T regulatory cells',
 'T cells CD8+']

## Extract multiome gene expressions

In [20]:
import tqdm

In [21]:
celltype2cnt = meta['cell_type'].value_counts().to_dict()

celltype2geneexp = dict()

for cell_type in celltype_list:
    celltype2geneexp[cell_type] = [0 for _ in range(18211)]

for i in tqdm.tqdm(range(df.shape[0])):
    obs_id = df.iloc[i]['obs_id']
    gene = df.iloc[i]['location']
    cnt = df.iloc[i]['normalized_count']
    
    cell_type = obsid2celltype[obs_id]
    
    if gene not in gene2idx:
        continue
        
    if obs_id not in obsid2celltype:
        continue
        
    if cnt < 1:
        continue

    celltype2geneexp[cell_type][gene2idx[gene]] = celltype2geneexp[cell_type][gene2idx[gene]] + cnt/(1.0*celltype2cnt[cell_type])

100%|████████████████████████████████████████████████████████| 40845122/40845122 [46:03<00:00, 14781.09it/s]


## Save as csv

In [22]:
cell_emb_df = pd.DataFrame(columns=['cell_type']+gene_list)

for k, v in celltype2geneexp.items():
    
    v_np = np.array(v).astype(float)
    #v_np /= np.sum(v_np)
    
    cell_emb_df.loc[len(cell_emb_df.index)] = [k] + v_np.tolist()

cell_emb_df.head()

Unnamed: 0,cell_type,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,B cells,0.286709,0.369192,0.174579,0.04098,0.0,0.004556,0.160233,0.189103,0.508853,...,0.289054,0.218039,0.146898,0.010962,0.020037,0.055597,1.41121,0.731649,0.443267,1.138623
1,T cells CD4+,0.345561,0.276483,0.845896,0.397363,0.0,0.004368,0.207783,0.155854,0.671557,...,0.301208,0.231177,0.164825,0.013874,0.028687,0.059835,1.596086,0.810385,0.741314,1.573785
2,NK cells,0.198593,0.193077,2.525363,1.3214,0.0,0.006829,0.171567,0.138596,0.623483,...,0.264486,0.229794,0.147945,0.006202,0.010658,0.04588,1.375194,0.478963,0.794888,1.440577
3,Myeloid cells,0.426668,0.309088,0.842713,0.05328,0.0,0.057028,0.291705,0.120023,0.786532,...,0.578372,0.530244,0.217056,0.023516,0.014606,0.034963,1.783012,1.38516,1.725515,2.837625
4,T regulatory cells,0.244087,0.15209,0.146131,0.053442,0.0,0.0,0.135021,0.064394,0.65629,...,0.242616,0.191995,0.124759,0.048676,0.023823,0.053844,1.144881,0.64771,0.464262,1.315307


In [23]:
cell_emb_df.to_csv('./cell_emb_normcnt.csv', index=False)

In [24]:
tmp = pd.read_csv('./cell_emb_normcnt.csv')
print(tmp.shape)
tmp.head()

(6, 18212)


Unnamed: 0,cell_type,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,B cells,0.286709,0.369192,0.174579,0.04098,0.0,0.004556,0.160233,0.189103,0.508853,...,0.289054,0.218039,0.146898,0.010962,0.020037,0.055597,1.41121,0.731649,0.443267,1.138623
1,T cells CD4+,0.345561,0.276483,0.845896,0.397363,0.0,0.004368,0.207783,0.155854,0.671557,...,0.301208,0.231177,0.164825,0.013874,0.028687,0.059835,1.596086,0.810385,0.741314,1.573785
2,NK cells,0.198593,0.193077,2.525363,1.3214,0.0,0.006829,0.171567,0.138596,0.623483,...,0.264486,0.229794,0.147945,0.006202,0.010658,0.04588,1.375194,0.478963,0.794888,1.440577
3,Myeloid cells,0.426668,0.309088,0.842713,0.05328,0.0,0.057028,0.291705,0.120023,0.786532,...,0.578372,0.530244,0.217056,0.023516,0.014606,0.034963,1.783012,1.38516,1.725515,2.837625
4,T regulatory cells,0.244087,0.15209,0.146131,0.053442,0.0,0.0,0.135021,0.064394,0.65629,...,0.242616,0.191995,0.124759,0.048676,0.023823,0.053844,1.144881,0.64771,0.464262,1.315307


In [25]:
del tmp

In [26]:
del meta

In [27]:
del df