In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATASET_DIR = './dataset/'

In [19]:
pangenome_nodes = pd.read_csv(DATASET_DIR + 'pangenome_nodes.csv')
hit_family_nodes = pangenome_nodes[pangenome_nodes['nodeLabels'] == 'HitFamily']
pfam_accessions = hit_family_nodes['accession'].unique()
print('Number of unique Pfam accessions:', len(pfam_accessions))

Number of unique Pfam accessions: 979


In [20]:
def get_pfam_basic(accession):
    acc_num = accession.split('PF')[1]
    url = 'https://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=pfam' + acc_num
    html = urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    try:
        desc = soup.find_all('meta', attrs={'name': 'description'})[0]['content']
        desc = desc.split('Conserved Protein Domain Family')[1]
        return desc
    except:
        print('No description found for ' + accession)
        return None

In [29]:
# Write rows to CSV continuously and start from last written

import csv

fetch_pfams = sorted(pfam_accessions)
filename = DATASET_DIR + 'pfam_llm.csv'
try:
    pfam_descriptions = pd.read_csv(filename)
    last_written = pfam_descriptions['accession'].values[-1]
    last_written_index = fetch_pfams.index(last_written)
    fetch_pfams = fetch_pfams[last_written_index + 1:]
except:
    with open(filename, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['accession', 'description'])

print(len(fetch_pfams), 'Pfams to fetch')

for pfam in fetch_pfams:
    with open(filename, 'a', newline="") as f:
        writer = csv.writer(f)
        writer.writerow([pfam, get_pfam_basic(pfam)])


977 Pfams to fetch


In [74]:
taxon_nodes = pd.read_csv(DATASET_DIR + 'taxon_nodes.csv')
taxon_nodes = taxon_nodes.loc[
    taxon_nodes['taxKingdom'] == 'Viruses'
]
taxon_orders = taxon_nodes[taxon_nodes['rank'] == 'order']['taxOrder'].unique()
print('Number of unique orders:', len(taxon_orders))

Number of unique orders: 65


In [72]:
def get_taxon_order_description(order):
    # url = f'https://ictv.global/report/chapter/{str(order)}/{str(order)}'
    try:
        url = f'https://en.wikipedia.org/wiki/{order}'
        html = urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')
        text = ''
        for i in soup.select('p'):
            text += i.text
        text = text.strip()
        text = text.replace('\n', ' ')
        return text
    except:
        print('No description found for ' + order)
        return None

test = get_taxon_order_description('Jingchuvirales')
print(test)

Jingchuvirales is an order of viruses. The order contains the following families:[1] This virus-related article is a stub. You can help Wikipedia by expanding it.


In [75]:
fetch_taxons = sorted(taxon_orders)
filename = DATASET_DIR + 'taxon_order_llm.csv'

try:
    taxon_order_descriptions = pd.read_csv(filename)
    last_written = taxon_order_descriptions['order'].values[-1]
    last_written_index = fetch_taxons.index(last_written)
    fetch_taxons = fetch_taxons[last_written_index + 1:]
except:
    with open(filename, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['taxId', 'order', 'description'])
    
print(len(fetch_taxons), 'Taxon orders to fetch')

for tax_order in fetch_taxons:
    with open(filename, 'a', newline="") as f:
        writer = csv.writer(f)
        tax_id = taxon_nodes[taxon_nodes['taxOrder'] == tax_order]['taxId'].values[0]
        writer.writerow([tax_id, tax_order, get_taxon_order_description(tax_order)])


65 Taxon orders to fetch
No description found for Crassvirales
No description found for Kirjokansivirales
No description found for Methanobavirales
No description found for Rivendellvirales
No description found for Rohanvirales
No description found for Thumleimavirales
No description found for Yadokarivirales


In [110]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [123]:
def add_embedding_to_df(df, text_column):
    embeddings = model.encode(df[text_column].values)

    df['embeddings'] = embeddings.tolist()
    return df


In [131]:
taxon_order_descriptions = pd.read_csv(DATASET_DIR + 'taxon_order_llm.csv')

taxon_order_descriptions = add_embedding_to_df(taxon_order_descriptions, 'description')
taxon_order_descriptions.to_csv(DATASET_DIR + 'taxon_order_llm.csv', index=False)

In [132]:
pfam_descriptions = pd.read_csv(DATASET_DIR + 'pfam_llm.csv')

pfam_descriptions = add_embedding_to_df(pfam_descriptions, 'description')
pfam_descriptions.to_csv(DATASET_DIR + 'pfam_llm.csv', index=False)

In [None]:

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# "microsoft/BioGPT-Large"
# "microsoft/biogpt"
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

pipe = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
def get_embedding(text):
    data = pipe(text, return_tensors="pt")
    return data[0].numpy().mean(axis=0)
    
out = get_embedding('This is a test.')
# 57717, 42384
print(out.shape)

In [156]:
import ast
import umap
import matplotlib.pyplot as plt

reducer = umap.UMAP()
emb = pfam_descriptions['embeddings'].values

umap_emb = reducer.fit_transform(x)

plt.scatter(
    umap_emb[:, 0],
    umap_emb[:, 1],
    s=10,
    alpha=0.5,
)

  warn(


Disconnection_distance = inf has removed 0 edges.
It has fully disconnected 2 vertices.
You might consider using find_disconnected_points() to find and remove these points from your data.
Use umap.utils.disconnected_vertices() to identify them.
  warn(


ValueError: zero-size array to reduction operation maximum which has no identity