In [1]:
import hdbscan 
import zarr
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn import metrics# import adjusted_rand_score, adjusted_mutual_info_score,silhouette_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import networkx as nx
from scipy.signal import find_peaks
import pandas as pd
import faiss

In [2]:
# Zarr store holding the embeddings; everything below is read from this location.
zarr_path = "/media/microscopie-lcb/swapnesh/protein/embeddings/phages/1Sept2024_INPHARED_db_latest/ESM2_650m_1Sept24_650m.zarr"

# Open read-only so the store cannot be modified from this notebook.
zarr_store = zarr.open(zarr_path, mode='r')

# Pull both arrays fully into memory. Multiplying by 1.0 guarantees a
# floating-point array for `vectors` (integer data would be promoted).
accessions = zarr_store['accessions'][:]
vectors = zarr_store['vectors_mean'][:] * 1.0

# Companion cluster table: its path is `zarr_path` minus the last four
# characters, followed by '_cluster.csv'.
df = pd.read_csv(f"{zarr_path[:-4]}_cluster.csv")

# Cell output: (rows, columns) of the embedding matrix.
vectors.shape

(24362, 1280)

In [3]:
# Build an annotation table whose rows follow the same order as `accessions`:
#   1. read the full annotation CSV,
#   2. keep only rows whose Accession occurs in `accessions`,
#   3. reorder by label lookup (`.loc` raises KeyError if an accession has no
#      annotation row),
#   4. restore Accession as a regular column.
annotations = (
    pd.read_csv('../HieVi_INPHARED_ordered_annotation.csv')
    .loc[lambda table: table["Accession"].isin(accessions)]
    .set_index("Accession")
    .loc[accessions]
    .reset_index()
)

annotations.head()

Unnamed: 0,Accession,Virus_Description,Virus_Genome_size,Virus_molGC_(%),Virus_Number_CDS,Realm,Kingdom,Phylum,Class,Order,...,VC_Subcluster_Size,VC_number,VC_subcluster,Adj P-value,Families in VC,Genera in VC,Genus Confidence Score,Orders in VC,Quality,Topology Confidence Score
0,AY319521,Salmonella phage SopEPhi,35155.0,51.3,45.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,44.0,4.0,0.0,1.0,0.0,0.0,1.0,0.0,0.9656,0.9656
1,MW175890,Dompiswa phage TSP7_1,150892.0,39.1,272.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,44.0,913.0,1.0,1.0,0.0,0.0,0.9683,0.0,0.7568,0.7568
2,GU339467,Mycobacterium phage RedRock,53332.0,64.5,90.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,5.0,221.0,1.0,0.99999997,0.0,0.0,1.0,0.0,0.0484,0.0484
3,MF417929,Uncultured Caudovirales phage clone 2F_1,32618.0,39.2,42.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified
4,MH616963,crAssphage sp. isolate ctbg_1,94878.0,28.5,89.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Crassvirales,...,16.0,703.0,1.0,1.0,0.0,0.0,0.725,0.0,0.8536,0.8536


In [4]:
# Pick a few stored phages at random to act as stand-in "new" queries.
# A seeded Generator keeps the selection identical across Restart & Run All;
# change SEED to draw a different sample.
N_QUERIES = 5
SEED = 0
rng = np.random.default_rng(SEED)
# Sample without replacement directly instead of shuffling the whole index
# range; the size is capped so a tiny store does not raise.
idx = rng.choice(len(vectors), size=min(N_QUERIES, len(vectors)), replace=False)

# Load the prebuilt FAISS index. The file name is appended directly to
# `zarr_path` (no path separator), matching how the path was built originally.
faiss_index_path = zarr_path+'faiss_index.bin'
index = faiss.read_index(faiss_index_path)

# Query identifiers: the sampled accessions tagged with a 'new' suffix so they
# can be told apart from the entries already in the database.
query_phage_ids = [a+'new' for a in zarr_store["accessions"][idx]]

# Query embeddings for the sampled rows. FAISS works on C-contiguous float32
# matrices, so make that explicit (no copy is made if it already holds).
query_mprs = np.ascontiguousarray(zarr_store["vectors_mean"][idx], dtype=np.float32)



In [None]:
from collections import defaultdict

# Search parameters: neighbours requested per query, and the cut-off above
# which even the closest neighbour is considered too far away.
# NOTE(review): the unit of `distance_threshold` depends on the metric of the
# loaded FAISS index, which is not visible in this cell — confirm.
k_neighbours = 5
distance_threshold = 3e-2

# One row per query, `k_neighbours` columns: distances and database row ids.
distances, indices = index.search(query_mprs, k_neighbours)

# A query is an outlier when its closest neighbour lies beyond the threshold.
outliers = distances.min(axis=-1) > distance_threshold

# Per-query label (float array): -2 for outliers, -1 for every other query.
groupings = np.where(outliers, -2.0, -1.0)



IndexError: index 5920 is out of bounds for axis 0 with size 5

In [7]:
# Collapse the per-query neighbour ids into one sorted array of unique
# database row ids.
indices = np.unique(np.ravel(indices))
# FAISS pads the id array with -1 when it returns fewer than k neighbours;
# drop those so `accessions[-1]` is never picked up by negative indexing.
indices = indices[indices >= 0]

# Look the accessions up once and reuse them for both the report and the filter.
neighbour_accessions = accessions[indices]
print(f"Nearest neighbor search completed. Found {len(indices)} unique neighbors." , neighbour_accessions)

# Annotation rows of the neighbours. `isin` keeps the annotation table's own
# row order, so the result is not sorted by distance to any query.
nearest_accessions = annotations[annotations["Accession"].isin(neighbour_accessions)]

print(query_phage_ids)
nearest_accessions.head()

Nearest neighbor search completed. Found 25 unique neighbors. ['PP580404' 'PP526764' 'OR003939' 'OQ401624' 'OQ401623' 'MZ892987'
 'MW514247' 'MN850625' 'MN850624' 'MN850570' 'MK448231' 'MK575466'
 'KY554776' 'KY554777' 'KY554770' 'KY554769' 'KY554768' 'KJ545483'
 'KJ572845' 'KM247287' 'JQ182734' 'JQ182733' 'JQ086371' 'AY736146'
 'PP839371']
['OQ401624new', 'JQ182733new', 'MN850625new', 'MK575466new', 'KY554777new']


Unnamed: 0,Accession,Virus_Description,Virus_Genome_size,Virus_molGC_(%),Virus_Number_CDS,Realm,Kingdom,Phylum,Class,Order,...,VC_Subcluster_Size,VC_number,VC_subcluster,Adj P-value,Families in VC,Genera in VC,Genus Confidence Score,Orders in VC,Quality,Topology Confidence Score
584,PP580404,Vibrio phage vB_VpaM_XM1,46056.0,42.5,68.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,7.0,758.0,0.0,1.0,0.0,0.0,1.0,0.0,0.4876,0.4876
965,PP526764,Pantoea phage ROV85,41003.0,54.9,51.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,14.0,171.0,0.0,1.0,0.0,0.0,1.0,0.0,0.8727,0.8727
5264,OR003939,Phage DSL-LC04,61547.0,51.3,85.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,4.0,646.0,0.0,0.99997249,0.0,0.0,1.0,0.0,0.5546,0.5546
5920,OQ401624,Roseobacter phage CRP-902,51954.0,45.5,80.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,2.0,1224.0,0.0,0.93733875,0.0,0.0,1.0,0.0,0.5806,0.5442
5921,OQ401623,Roseobacter phage CRP-901,53013.0,45.6,77.0,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Unclassified,...,2.0,1224.0,0.0,0.93733875,0.0,0.0,1.0,0.0,0.5806,0.5442
