In [13]:
import pickle
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [14]:
main_path = Path('Data/hkunlp_instructor/hkunlp_instructor_large_embeddings_instructions.pkl')

In [15]:
embs = pickle.load(open(main_path, 'rb'))

In [16]:
embs['descriptions'][0]

['Represent the Biomedical database entry of a gene such that genes which are functionally related have high similarity: ',
 'Symbol: ENG1 Type: protein Organism: Rhizomucor miehei Description: Cleaves internal linkages in 1,3-beta-glucan. Tris binds the active site and may act as an inhibitor. Belongs to the glycosyl hydrolase 81 family.']

In [17]:
no_symbol = 0
no_type = 0
no_organism = 0
no_description = 0
for i, des in enumerate(embs['descriptions']):
    if 'Symbol' not in des:
        no_symbol += 1
    if 'Organism' not in des:
        no_organism += 1
    if 'Description' not in des:
        no_description += 1
    if 'Type' not in des:
        no_type += 1
print('No symbol: ', no_symbol)
print('No organism: ', no_organism)
print('No description: ', no_description)
print('No type: ', no_type)

No symbol:  488897
No organism:  488897
No description:  488897
No type:  488897


In [19]:
from collections import defaultdict
import pandas as pd

res = defaultdict(dict)
organisms = set()
entities = set()
types = set()
meta = []
for i, des in enumerate(embs['descriptions']):
    des = des[1] # for hkunlp_instructor ['Instruction', 'Description: ...']
    symbol = des.split('Type')[0].split(':')[-1].strip()
    type = des.split('Organism')[0].split(':')[-1].strip()
    organism = des.split('Description')[0].split(':')[-1].strip()
    description = des.split('Description')[-1].strip().strip(':')
    
    res[i]['Symbol'] = symbol
    res[i]['Type'] = type
    res[i]['Organism'] = organism
    res[i]['Description'] = description
    res[i]['Embedding'] = embs['embeddings'][i]

    organisms.add(organism)
    entities.add(organism + '_' + symbol)
    types.add(type)
    meta.append([symbol, type, organism, description])

meta = pd.DataFrame(meta, columns=['Symbol', 'Type', 'Organism', 'Description'])
meta

Unnamed: 0,Symbol,Type,Organism,Description
0,ENG1,protein,Rhizomucor miehei,"Cleaves internal linkages in 1,3-beta-glucan...."
1,cbh2,protein,Hypocrea jecorina (strain ATCC 56765 / BCRC 32...,Exocellobiohydrolases (CBH) that catalyzes th...
2,Orco,protein,Ooceraea biroi,Odorant coreceptor which complexes with conve...
3,Bli,protein,Onchocerca volvulus,Serine endoprotease which cleaves substrates ...
4,CCR1,protein,Petunia hybrida,Involved in the latter stages of lignin biosy...
...,...,...,...,...
488892,RPS8,protein,Griffithsia japonica,Belongs to the eukaryotic ribosomal protein e...
488893,ninE,protein,Escherichia phage 933W,Belongs to the ninE family.
488894,AP,protein,Fragaria ananassa,"Interacts with FRAA1E, FRAA2 and FRAA3."
488895,GUCA1ANB,protein,Homo sapiens,Product of a dubious gene prediction.


In [12]:
organism_count = meta.value_counts('Organism')
organisms_to_keep = organism_count[organism_count > 1000].index.to_list()
print(len(organism_count), len(organisms_to_keep))
print(meta.shape)
print(meta[[o in organisms_to_keep for o in meta.Organism]].shape)
meta_to_keep = meta[[o in organisms_to_keep for o in meta.Organism]]
embs_to_keep = meta_to_keep.index.to_list()
print(len(embs_to_keep))

res_to_keep = {
    k: v for k, v in tqdm(res.items()) if k in embs_to_keep
}
pickle.dump(res_to_keep, open('Data/hkunlp_instructor/embeddings_1000.pickle', 'wb'))
meta_to_keep.to_csv('Data/hkunlp_instructor/meta_1000.csv', index=True)

11145 30
(488897, 4)
(119830, 4)
119830


100%|██████████| 488897/488897 [04:51<00:00, 1678.13it/s]


In [13]:
import pickle
import numpy as np
from tqdm import tqdm
import pandas as pd


meta = pd.read_csv('Data/bert/meta_1000.csv', index_col=0)
res = pickle.load(open('Data/bert/embeddings_1000.pickle', 'rb'))

meta_yeast = meta[meta.Organism == 'Saccharomyces cerevisiae (strain ATCC 204508 / S288c)'][['Symbol', 'Description']]


res_yeast = []
for i in meta_yeast.index:
    res_yeast.append(res[i]['Embedding'])
res_yeast = np.array(res_yeast)

res_yeast = pd.DataFrame(res_yeast, index=meta_yeast['Symbol'])
res_yeast.head()



mapper = pd.read_csv('TakeYeast/YeastMine.tsv', sep='\t', header=None, index_col=None)
orfs = mapper[[1]].T.values[0].tolist()

sgdid2orfs = mapper[[0, 1]].set_index([0]).to_dict()[1]
gname2orfs = mapper[[1, 3]].set_index([3]).to_dict()[1]

res_yeast.index = res_yeast.index.map(lambda x: gname2orfs[x] if x in gname2orfs else 'Delete').tolist()

res_yeast.head()


res_yeast = res_yeast[res_yeast.index != 'Delete']
res_yeast.to_csv('TakeYeast/YeastEmbeddings_BERT.tsv', sep='\t', header=True, index=True)