In [28]:
from utils.fasta_utils import *
from utils.proteome_process import *
from utils.prefetcher import *
from utils.network_utils import *

import os
from tqdm import tqdm
import pandas as pd
import hdbscan
import faiss
from sklearn.metrics.pairwise import cosine_distances,euclidean_distances
import numpy as np
import networkx as nx



In [32]:
# Reference database artefacts. Everything except the annotation CSV is looked
# up inside data_folder; the CSV path is relative to the working directory.
data_folder = "/media/microscopie-lcb/swapnesh/protein/embeddings/phages/1Sept2024_INPHARED_db_latest/"

HieVi_INPHARED_ordered_annotation = "HieVi_INPHARED_ordered_annotation.csv"
HieVi_tree = os.path.join(data_folder, "ESM2_3b_1Sept24_3b.zarr.gexf")
eps_values_path = os.path.join(data_folder, "ESM2_650m_1Sept24_650m_eps_values_flat_clusters.npy")
faiss_index_path = os.path.join(data_folder, "ESM2_650m_1Sept24_650m.zarrfaiss_index.bin")
db_accession_path = os.path.join(data_folder, 'db_accessions.npy')

# Tree (GEXF graph) and the accession list that fixes the database row order.
htree = nx.read_gexf(HieVi_tree)
db_accessions = np.load(db_accession_path)

# Keep only annotated database phages and put the rows in db_accessions order.
# The label-based .loc raises KeyError if an accession has no annotation row.
annotations = (
    pd.read_csv(HieVi_INPHARED_ordered_annotation)
    .loc[lambda table: table["Accession"].isin(db_accessions)]
    .set_index("Accession")
    .loc[db_accessions]
    .reset_index()
)

# Cut distances (one cluster column is produced per value further down) and
# the FAISS index that is searched with the query vectors.
eps_values = np.load(eps_values_path)
index = faiss.read_index(faiss_index_path)


In [None]:
# @title Compute phage representations
# FASTA file holding the query phages; outputs are written next to it.
filename = "/media/microscopie-lcb/swapnesh/protein/embeddings/phages/NewGenomes/HieVi_test_case.faa"
fasta_path = filename

expt_name = "MyPhages"  # @param {type:"string"}
# Spaces become underscores so the name is safe in file names and shell arguments.
expt_name = "_".join(expt_name.split(" "))

model_name = "650m" # This colab works for 650m only
mode = "mean"

# Folder of the FASTA file, with a trailing separator because later cells
# build paths by plain string formatting.
output_folder = f"{os.path.dirname(filename)}{os.sep}"
query_zarr_path = os.path.join(output_folder, f"{expt_name}_{model_name}.zarr")

# The embedding step itself is disabled here; presumably it writes the store at
# query_zarr_path -- TODO confirm against GenPhageRepresentationsESM2.py.
#!python GenPhageRepresentationsESM2.py {expt_name} {output_folder} {fasta_path} {model_name} {mode}


In [None]:
# Query embeddings live at <output_folder><expt_name>_<model_name>.zarr
# (output_folder already ends with a path separator).
query_zarr_path = f"{output_folder}{expt_name}_{model_name}.zarr"

# Open read-only. The mode is given by keyword because zarr-python 3 no longer
# accepts it positionally, while the keyword form works on 2.x and 3.x alike.
# NOTE(review): `zarr` is not imported explicitly in this notebook; it is
# expected to arrive through one of the `from utils... import *` lines.
query_zarr_store = zarr.open(query_zarr_path, mode='r')

# `[:]` reads the stored arrays into memory; `* 1.0` then yields a new array
# that is independent of the store.
query_vectors = query_zarr_store['vectors_mean'][:] * 1.0
query_accessions = query_zarr_store['accessions'][:]


In [2]:
# Required files
#   - faiss_index for the right model
#   - the network gexf
# All of them are read from data_folder, except the annotation CSV, which is
# resolved against the current working directory.
data_folder = "/media/microscopie-lcb/swapnesh/protein/embeddings/phages/1Sept2024_INPHARED_db_latest/"

# Paths are built with os.path.join so they do not depend on data_folder
# ending with a separator.
faiss_index_path = os.path.join(data_folder, "ESM2_650m_1Sept24_650m.zarrfaiss_index.bin")
# NOTE(review): this points the *query* path at the database store; the
# query-loading cell overwrites it with f"{expt_name}_{model_name}.zarr".
query_zarr_path = os.path.join(data_folder, "ESM2_650m_1Sept24_650m.zarr")
db_clusters_path = os.path.join(data_folder, "ESM2_3b_1Sept24_3b_flat_clusters_3b.csv")
eps_values_path = os.path.join(data_folder, "ESM2_650m_1Sept24_650m_eps_values_flat_clusters.npy")
db_zarr_path = os.path.join(data_folder, "ESM2_650m_1Sept24_650m.zarr")
HieVi_INPHARED_ordered_annotation = "HieVi_INPHARED_ordered_annotation.csv"
HieVi_tree = os.path.join(data_folder, "ESM2_3b_1Sept24_3b.zarr.gexf")

# Tree of the reference database, stored as a GEXF graph.
htree = nx.read_gexf(HieVi_tree)

# Database embeddings store, opened read-only. The mode is passed by keyword
# because zarr-python 3 no longer accepts it positionally.
zarr_store = zarr.open(db_zarr_path, mode='r')
db_accessions = zarr_store['accessions'][:]

# Annotation rows restricted to the database phages and put in db_accessions
# order; the label-based .loc raises KeyError if an accession is missing.
annotations = pd.read_csv(HieVi_INPHARED_ordered_annotation)
annotations = annotations[annotations["Accession"].isin(db_accessions)]
annotations = annotations.set_index("Accession").loc[db_accessions].reset_index()

# Cut distances used later to derive one flat clustering per value.
eps_values = np.load(eps_values_path)

# FAISS index that is searched with the query vectors.
index = faiss.read_index(faiss_index_path)

# Flat cluster table of the database, aligned to db_accessions the same way
# as the annotations.
hievi_cluster = pd.read_csv(db_clusters_path)
hievi_cluster = hievi_cluster[hievi_cluster["Accession"].isin(db_accessions)]
hievi_cluster = hievi_cluster.set_index("Accession").loc[db_accessions].reset_index()


In [3]:
# @title Compute phage representations
# FASTA file holding the query phages; outputs are written next to it.
filename = "/media/microscopie-lcb/swapnesh/protein/embeddings/phages/NewGenomes/HieVi_test_case.faa"
expt_name = "MyPhages"  # @param {type:"string"}
# Spaces are replaced so the name can be used unquoted in the shell command
# below and in the derived .zarr file name.
expt_name = expt_name.replace(' ', '_')
# Trailing separator is required: the next cell builds the store path with
# f"{output_folder}{expt_name}_{model_name}.zarr".
output_folder = os.path.dirname(filename) + os.sep
fasta_path = filename
model_name = "650m" # This colab works for 650m only
mode = "mean"
# Embedding step (disabled here); its arguments are the variables set above.
#!python GenPhageRepresentationsESM2.py {expt_name} {output_folder} {fasta_path} {model_name} {mode}


In [4]:
# Query embeddings live at <output_folder><expt_name>_<model_name>.zarr
# (output_folder already ends with a path separator).
query_zarr_path = f"{output_folder}{expt_name}_{model_name}.zarr"

# Open read-only. The mode is given by keyword because zarr-python 3 no longer
# accepts it positionally, while the keyword form works on 2.x and 3.x alike.
query_zarr_store = zarr.open(query_zarr_path, mode='r')

# `[:]` reads the stored arrays into memory; `* 1.0` then yields a new array
# that is independent of the store.
query_vectors = query_zarr_store['vectors_mean'][:] * 1.0
query_accessions = query_zarr_store['accessions'][:]



In [36]:
# Select the database phages to cluster together with the queries:
#   1. find the single nearest database vector of every query,
#   2. drop queries whose nearest hit is not closer than distance_threshold,
#   3. expand every hit through the tree with find_predecessor_and_leaves.
# Uses names set by earlier cells: index, query_vectors, query_accessions,
# query_zarr_path, db_accessions, htree and zarr_store.
hievi_cluster_prefix = 'HC_p'  # prefix of the per-eps cluster columns created later
k_neighbours = 64  # not used in this cell (the k-neighbour variant reads it)
distance_threshold = 0.023  # a query is kept only if its nearest hit is closer
distance_in_tree = 2  # forwarded unchanged to find_predecessor_and_leaves

distances, indices = index.search(query_vectors, 1)
valid_idx = distances[:, 0] < distance_threshold
invalid_idx = np.logical_not(valid_idx)

# Report and save the queries that have no close enough database neighbour.
n_unclassified = int(np.count_nonzero(invalid_idx))
if n_unclassified:
    print('Cannot classifiy: ' ,n_unclassified)
    invalid_query_df = pd.DataFrame({"Accession": query_accessions[invalid_idx]})
    # [:-5] removes the trailing ".zarr" of the query store path.
    invalid_query_df.to_csv(query_zarr_path[:-5] + "_HieVi_Unclassifieds.csv")

if not valid_idx.any():
    # Fail here with a clear message instead of deep inside the indexing below.
    raise ValueError("No query vector is closer than distance_threshold to the database.")

# Boolean-mask indexing already returns 1-D arrays. No squeeze is applied,
# because squeezing a single hit would produce a 0-d array.
D1 = distances[valid_idx, 0]
I1 = indices[valid_idx, 0]

all_indices = np.unique(I1)
all_nearest_accessions = db_accessions[all_indices]

# Widen the selection through the tree around every nearest database phage.
new_accession = []
for a in all_nearest_accessions:
    new_accession += find_predecessor_and_leaves(htree, a, distance_in_tree)

all_nearest_accessions = np.unique(np.ravel(np.array(new_accession)))

# Accession -> database row, built once (first occurrence wins) instead of one
# full scan of db_accessions per accession.
row_of_accession = {}
for row, acc in enumerate(db_accessions.tolist()):
    row_of_accession.setdefault(acc, row)

# Tree entries that are not database accessions cannot be looked up; they are
# dropped so that the accessions and the vectors stay aligned.
known = np.array([acc in row_of_accession for acc in all_nearest_accessions.tolist()], dtype=bool)
if not known.all():
    print(f"Ignoring {int(np.count_nonzero(~known))} tree accession(s) absent from db_accessions")
all_nearest_accessions = all_nearest_accessions[known]
all_indices = np.array(
    [row_of_accession[acc] for acc in all_nearest_accessions.tolist()], dtype=np.int64
)

# 1-D integer indexing keeps the result two-dimensional (rows x embedding
# size) even when a single row is selected.
subset_db_vectors = np.array(zarr_store['vectors_mean'])[all_indices] * 1.0
subset_db_vectors.shape


(71, 1280)

In [None]:
# k-neighbour variant of the database selection:
#   1. find the single nearest database vector of every query,
#   2. drop queries whose nearest hit is not closer than distance_threshold,
#   3. search k_neighbours around each retained hit and keep those closer
#      than 10 x distance_threshold,
#   4. expand the result through the tree with find_predecessor_and_leaves.
# Uses names set by earlier cells: index, query_vectors, query_accessions,
# query_zarr_path, db_accessions, htree and zarr_store.
hievi_cluster_prefix = 'HC_p'  # prefix of the per-eps cluster columns created later
k_neighbours = 64  # neighbours requested around every retained hit
distance_threshold = 0.023  # a query is kept only if its nearest hit is closer
distance_in_tree = 2  # forwarded unchanged to find_predecessor_and_leaves

distances, indices = index.search(query_vectors, 1)
valid_idx = distances[:, 0] < distance_threshold
invalid_idx = np.logical_not(valid_idx)

# Report and save the queries that have no close enough database neighbour.
n_unclassified = int(np.count_nonzero(invalid_idx))
if n_unclassified:
    print('Cannot classifiy: ' ,n_unclassified)
    invalid_query_df = pd.DataFrame({"Accession": query_accessions[invalid_idx]})
    # [:-5] removes the trailing ".zarr" of the query store path.
    invalid_query_df.to_csv(query_zarr_path[:-5] + "_HieVi_Unclassifieds.csv")

if not valid_idx.any():
    # Fail here with a clear message instead of deep inside the search below.
    raise ValueError("No query vector is closer than distance_threshold to the database.")

# Boolean-mask indexing already returns 1-D arrays. No squeeze is applied,
# because a single hit would become 0-d and break the 2-D search input.
D = distances[valid_idx, 0]
I = indices[valid_idx, 0]

# Materialise the database vectors once; they are needed for the neighbour
# search and again for the final subset.
db_vectors = np.array(zarr_store['vectors_mean'])

# index.search returns (distances, indices), in that order.
D1, I1 = index.search(db_vectors[I] * 1.0, k_neighbours)

# Keep neighbours that are close enough. The distance, not the row index, is
# compared with the threshold; negative indices mark missing results.
is_close = (D1 < distance_threshold * 10) & (I1 >= 0)
all_indices = np.unique(np.concatenate((I1[is_close], I)))
all_nearest_accessions = db_accessions[all_indices]

# Widen the selection through the tree around every selected database phage.
new_accession = []
for a in all_nearest_accessions:
    new_accession += find_predecessor_and_leaves(htree, a, distance_in_tree)

all_nearest_accessions = np.unique(np.ravel(np.array(new_accession)))

# Accession -> database row, built once (first occurrence wins) instead of one
# full scan of db_accessions per accession.
row_of_accession = {}
for row, acc in enumerate(db_accessions.tolist()):
    row_of_accession.setdefault(acc, row)

# Tree entries that are not database accessions cannot be looked up; they are
# dropped so that the accessions and the vectors stay aligned.
known = np.array([acc in row_of_accession for acc in all_nearest_accessions.tolist()], dtype=bool)
if not known.all():
    print(f"Ignoring {int(np.count_nonzero(~known))} tree accession(s) absent from db_accessions")
all_nearest_accessions = all_nearest_accessions[known]
all_indices = np.array(
    [row_of_accession[acc] for acc in all_nearest_accessions.tolist()], dtype=np.int64
)

# 1-D integer indexing keeps the result two-dimensional (rows x embedding
# size) even when a single row is selected.
subset_db_vectors = db_vectors[all_indices] * 1.0
del db_vectors  # do not keep the full database array alive in the kernel
subset_db_vectors.shape


In [30]:
# @title Combine query and nearest phages from database data

# Annotation rows of the selected database phages, in the same order as
# all_nearest_accessions (and therefore as the rows of subset_db_vectors).
nearest_accessions = (
    annotations.loc[annotations["Accession"].isin(all_nearest_accessions)]
    .set_index("Accession")
    .loc[all_nearest_accessions]
    .reset_index()
)

# Database rows first, classified queries after them -- the table and the
# vector matrix are stacked in the same order. Query rows carry only the
# "Accession" column, so every other annotation column is NaN for them.
query_df = pd.DataFrame({"Accession": query_accessions[valid_idx]})
annotation_df = pd.concat([nearest_accessions, query_df], axis=0)
mprs = np.concatenate((subset_db_vectors, query_vectors[valid_idx]), axis=0)

# Cluster directly on the vectors. An earlier variant of this cell passed a
# precomputed euclidean distance matrix (metric="precomputed", n_jobs=32);
# it is no longer used.
clusterer = hdbscan.HDBSCAN(
    metric="euclidean",
    cluster_selection_method="leaf",
    allow_single_cluster=False,
    min_cluster_size=2,
    min_samples=1,
    gen_min_span_tree=True,
)
clusterer.fit(mprs)

# Most granular labels first, then one flat clustering per stored cut distance,
# written to columns named <hievi_cluster_prefix><position in eps_values>.
annotation_df["HieVi_granular_cluster"] = clusterer.labels_
for level, cut in enumerate(eps_values):
    annotation_df[f"{hievi_cluster_prefix}{level}"] = clusterer.dbscan_clustering(
        cut_distance=cut, min_cluster_size=2
    )
#annotation_df

In [31]:
# @title Generate Network

# Value handed to make_network as min_lambda (exposed as a Colab slider).
min_lambda =13.00 # @param {type:"slider", min:-1, max:32, step:1}

# Output file sits next to the query store: the last five characters of the
# store path (".zarr" for the paths built in this notebook) are replaced by
# "_HieVi.gexf".
network_path = query_zarr_path[:-5] + "_HieVi.gexf"

# Build the graph from the fitted clusterer and the annotation table, then
# save it in GEXF format.
G = make_network(clusterer, annotation_df, min_lambda=min_lambda)
nx.write_gexf(G, network_path)
