In [3]:
from pybiomart import Dataset as PyBiomartDataset

# Attributes requested from the Ensembl human gene mart. The canonical-transcript
# flag is downloaded with the peptide so that rows can be filtered afterwards.
biomart_attributes = [
    'ensembl_gene_id',
    'ensembl_peptide_id',
    'gene_biotype',
    'transcript_is_canonical',
    'peptide',
]

dataset = PyBiomartDataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
genes_proteins = dataset.query(attributes=biomart_attributes, filters={})

# The table is cached unfiltered; the canonical-transcript filter is applied
# when the CSV is loaded back in.
genes_proteins.to_csv('/home/ec2-user/cytoself-data/gene2protein.csv')

In [None]:
import pandas as pd

In [None]:
# Reload the cached BioMart table and keep only the rows whose
# 'Ensembl Canonical' flag equals 1.0.
biomart_table = pd.read_csv('/home/ec2-user/cytoself-data/gene2protein.csv', index_col=0)
is_canonical = biomart_table['Ensembl Canonical'] == 1.0
genes_proteins = biomart_table[is_canonical]

In [None]:
# Index the table by 'Gene stable ID' so its columns can be used as lookups keyed on gene.
# NOTE: not idempotent - running this cell twice in a row raises KeyError,
# because 'Gene stable ID' is no longer a column after the first run.
genes_proteins = genes_proteins.set_index('Gene stable ID')

In [None]:
# Display the table for a visual check.
genes_proteins

In [None]:
from os.path import join

# Directory that contains labels.csv.
datapath = "/home/ec2-user/cytoself-data"

# The first CSV column is used as the row index.
labels_csv = join(datapath, "labels.csv")
df = pd.read_csv(labels_csv, index_col=0)

In [None]:
# Look up each label row's gene ID in genes_proteins and copy over the peptide
# sequence and the protein ID. Genes absent from the lookup get NaN.
for column in ('Peptide', 'Protein stable ID'):
    df[column] = df['ensg'].map(genes_proteins[column])

# Overwrites the labels.csv that df was loaded from.
df.to_csv(join(datapath, "labels.csv"))

In [None]:
# Collapse the per-row label table to one row per distinct (gene, peptide) pair,
# index it by gene ID, and save it as sequences.csv.
gene_peptide_pairs = df[['ensg', 'Peptide']].drop_duplicates()
prots = gene_peptide_pairs.set_index('ensg')
prots.to_csv(join(datapath, "sequences.csv"))

In [None]:
# Position of the first '*' in each peptide (-1 where the character does not occur).
prots['Peptide'].str.find('*')

In [None]:
# Peek at the first peptide sequence.
prots['Peptide'].iloc[0]

## Load embeddings

In [1]:
import pandas as pd

In [2]:
# CSV of gene ID -> peptide sequence.
GENE2PROTEIN_PATH = '/home/ec2-user/cytoself-data/sequences.csv'
# Zarr stores of ESM sequence embeddings; "full" and "reduced" variants per the file names.
PROTEIN_EMBED_PATH_F = '/home/ec2-user/cytoself-data/ESM_sequence_embeddings_full.zarr'
PROTEIN_EMBED_PATH_R = '/home/ec2-user/cytoself-data/ESM_sequence_embeddings_reduced.zarr'

In [3]:
# The CSV is read WITHOUT an index column on purpose: the frame keeps a positional
# RangeIndex (0..n-1) and 'ensg' stays an ordinary column. Later cells rely on
# that positional index, so the frame itself is left as-is here.
df = pd.read_csv(GENE2PROTEIN_PATH)

# Strip every '*' from the peptides. regex=False is required because '*' is
# not a valid regular expression on its own.
sequences = df['Peptide'].str.replace('*', '', regex=False).values

# Gene IDs live in the 'ensg' column. df.index is only 0..n-1 here, so taking
# df.index.values would yield row numbers instead of gene identifiers.
genes = df['ensg'].values
num_genes = len(genes)

# Alias (same object, not a copy) used by the later cells.
genes_proteins = df

In [4]:
import zarr

# Open both embedding stores read-only so the inspection cells cannot modify them.
z_embedding_prot_f = zarr.open(PROTEIN_EMBED_PATH_F, mode="r")
z_embedding_prot_r = zarr.open(PROTEIN_EMBED_PATH_R, mode="r")

In [5]:
# First entry of the reduced embedding store.
z_embedding_prot_r[0]

array([[ 0.04050937,  0.0092671 ,  0.06806058, ..., -0.24188687,
         0.19956455,  0.07856545],
       [ 0.04562815, -0.08524852,  0.00493111, ..., -0.10848487,
         0.07301189,  0.06637612]], dtype=float32)

In [6]:
# Row 1 of the first entry in the full embedding store.
z_embedding_prot_f[0, 1]

array([-0.04980165, -0.06002725,  0.03606275, ...,  0.2720689 ,
       -0.08399116,  0.01600615], dtype=float32)

In [7]:
# Show the array summary (shape, dtype, access mode).
z_embedding_prot_f

<zarr.core.Array (1311, 1025, 1280) float32 read-only>

In [8]:
# Per-sequence length, capped at 1024.
# NOTE(review): the full embedding array's second axis is 1025 in the recorded
# output - presumably 1024 residues plus one extra token; TODO confirm.
truncation_length = [len(seq) if len(seq) < 1024 else 1024 for seq in sequences]

In [9]:
# Store the capped length next to each sequence. genes_proteins is an alias of
# the sequences frame, so the column is added to that frame in place.
genes_proteins['truncation'] = truncation_length

In [10]:
# Capped length of the first sequence.
truncation_length[0]

375

In [11]:
# Inspect row 376 of the first full embedding, i.e. one past index 375 (the first
# sequence's capped length in the recorded output).
# NOTE(review): the recorded output is all zeros - presumably padding; TODO confirm.
z_embedding_prot_f[0, 376]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

## Add labels and truncation lengths

In [12]:
import os

# Directory that contains the label table, and the table's file name inside it.
datapath = '/home/ec2-user/cytoself-data'
labels_filename = 'labels.csv'

labels_path = os.path.join(datapath, labels_filename)

In [13]:
import pandas as pd

# Load the label table; the first CSV column becomes the row index.
# Note: this rebinds df, while genes_proteins keeps pointing at the sequences frame.
df = pd.read_csv(labels_path, index_col=0)

In [14]:
# Display the label table (df.head() would be enough for a quick check).
df

Unnamed: 0,index,ensg,name,loc_grade1,loc_grade2,loc_grade3,protein_id,FOV_id,split_protein,split_images,label,Peptide,Protein stable ID
0,0,ENSG00000075624,ACTB,membrane;cytoskeleton,cytoplasmic,,2,27979,train,train,0,MDDDIAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,ENSP00000494750
1,1,ENSG00000075624,ACTB,membrane;cytoskeleton,cytoplasmic,,2,27979,train,train,0,MDDDIAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,ENSP00000494750
2,2,ENSG00000075624,ACTB,membrane;cytoskeleton,cytoplasmic,,2,27979,train,train,0,MDDDIAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,ENSP00000494750
3,3,ENSG00000075624,ACTB,membrane;cytoskeleton,cytoplasmic,,2,27979,train,val,0,MDDDIAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,ENSP00000494750
4,4,ENSG00000075624,ACTB,membrane;cytoskeleton,cytoplasmic,,2,27979,train,train,0,MDDDIAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,ENSP00000494750
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134587,70512,ENSG00000125912,NCLN,er,,vesicles,2039,38864,train,train,1048,MLEEAGEVLENMLKASCLPLGFIVFLPAVLLLVAPPLPAADAAHEF...,ENSP00000246117
1134588,70513,ENSG00000125912,NCLN,er,,vesicles,2039,38864,train,train,1048,MLEEAGEVLENMLKASCLPLGFIVFLPAVLLLVAPPLPAADAAHEF...,ENSP00000246117
1134589,70514,ENSG00000125912,NCLN,er,,vesicles,2039,38864,train,val,1048,MLEEAGEVLENMLKASCLPLGFIVFLPAVLLLVAPPLPAADAAHEF...,ENSP00000246117
1134590,70515,ENSG00000125912,NCLN,er,,vesicles,2039,38864,train,train,1048,MLEEAGEVLENMLKASCLPLGFIVFLPAVLLLVAPPLPAADAAHEF...,ENSP00000246117


In [15]:
# Move the current index into a column, then re-index by gene ID so the frame's
# columns can be used as gene-keyed lookups.
genes_proteins = genes_proteins.reset_index().set_index('ensg')

In [17]:
# Look up each label row's gene ID to get its 'index' value (stored as
# seq_embedding_index) and its capped sequence length.
ensg_ids = df['ensg']
df['seq_embedding_index'] = ensg_ids.map(genes_proteins['index'])
df['truncation'] = ensg_ids.map(genes_proteins['truncation'])

In [18]:
# Write the augmented label table to labels.csv inside datapath.
output_csv = os.path.join(datapath, "labels.csv")
df.to_csv(output_csv)