In [None]:
# Standard imports for data analysis and visualization
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Import custom utilities for protein analysis and conformal prediction
# These include functions for FAISS database operations, sequence handling, etc.
from protein_conformal.util import *

In [None]:
# Load the precomputed embedding matrix for the query proteins
# (JCVI Syn3.0 genes of unknown function) from a .npy file.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# this path, so the file must exist on the machine before later cells can run.
# Alternative input (all sequences instead of only the unknown ones):
# query_embeddings = np.load('/data/ron/protein-conformal/data/gene_unknown/all_aa_seqs.npy')
query_embeddings = np.load('/data/ron/protein-conformal/data/gene_unknown/unknown_aa_seqs.npy')

FileNotFoundError: [Errno 2] No such file or directory: '/data/ron/protein-conformal/data/gene_unknown/unknown_aa_seqs.npy'

In [None]:
# Display the shape of the query embedding array as a sanity check.
# Presumably (number_of_proteins, embedding_dimension) — TODO confirm.
query_embeddings.shape

In [None]:
# Read the query FASTA file; read_fasta returns two values that are bound to
# query_fastas and query_metadata (presumably sequences and header lines).
# NOTE(review): later cells index both with the same boolean mask as D_max, so
# the FASTA record order is assumed to match the rows of query_embeddings.
# Alternative input (all sequences instead of only the unknown ones):
# query_fastas, query_metadata = read_fasta('/data/ron/protein-conformal/data/gene_unknown/all_aa_seqs.fasta')
query_fastas, query_metadata = read_fasta('/data/ron/protein-conformal/data/gene_unknown/unknown_aa_seqs.fasta')

In [None]:
# Load the reference ("lookup") database: an embedding array plus a
# tab-separated metadata table.
# NOTE(review): later cells index `embeddings` with the metadata frame's index,
# so row i of the TSV is assumed to describe row i of the embedding array.
embeddings = np.load('/data/ron/protein-vec/src_run/protein_vec_embeddings/lookup_embeddings.npy')
lookup_proteins_meta = pd.read_csv('/data/ron/protein-vec/src_run/protein_vec_embeddings/lookup_embeddings_meta_data.tsv', sep="\t")

In [None]:
# Restrict the reference set to entries that carry a Pfam annotation and index
# their embeddings for nearest-neighbour search.
column = 'Pfam'
has_annotation = lookup_proteins_meta[column].notna()  # True where a Pfam label exists
col_lookup = lookup_proteins_meta[has_annotation]
# The metadata index is used to pick the matching embedding rows
col_lookup_embeddings = embeddings[col_lookup.index]
# Pfam labels, in the same order as col_lookup_embeddings
col_meta_data = col_lookup[column].values

# Build the searchable database from the annotated embeddings
lookup_database = load_database(col_lookup_embeddings)

In [None]:
# Search the lookup database with every query embedding.
# `query` returns two values bound to D and I; later cells reduce D over
# axis 1 and use I[:, 0] as row positions into col_lookup, so both are
# presumably (n_queries, k) arrays of scores and neighbour indices.
k = 1  # number of neighbours requested per query
D, I = query(lookup_database, query_embeddings, k)  # D = scores, I = neighbour indices

In [None]:
# Reduce D to one value per query by taking the maximum over axis 1.
# With k = 1 this is just the single returned score for each query.
D_max = np.max(D, axis=1)

In [None]:
# Load the calibration records used for the Venn-Abers step below.
# SECURITY NOTE: allow_pickle=True makes np.load unpickle Python objects, which
# can execute arbitrary code — only load files from a trusted source.
data = np.load('/data/ron/protein-conformal/data/pfam_new_proteins.npy', allow_pickle=True)



In [None]:
# Prepare the calibration set for Venn-Abers prediction.
# A fixed seed and a non-mutating permutation are used so that
# "Restart & Run All" (or re-running just this cell) always selects the same
# calibration examples; the previous in-place, unseeded shuffle changed the
# result on every execution and reordered `data` as a side effect.
n_calib = 100    # number of calibration examples to draw
CALIB_SEED = 0   # change to draw a different (but still reproducible) subset
rng = np.random.default_rng(CALIB_SEED)
cal_idx = rng.permutation(len(data))[:n_calib]
cal_data = data[cal_idx]

# get_sims_labels returns two array-likes (scores and labels); both are
# flattened to 1-D for the Venn-Abers routine.
# Presumably labels are binary (1 = same family, 0 = different) — TODO confirm.
X_cal, y_cal = get_sims_labels(cal_data, partial=False)
X_cal = X_cal.flatten()
y_cal = y_cal.flatten()


In [None]:
# Run the Venn-Abers routine once per row of D, using the calibration scores
# and labels, and stack the resulting (p_0, p_1) pairs into one array.
venn_abers_outputs = (simplifed_venn_abers_prediction(X_cal, y_cal, d) for d in D)
# Each output must unpack into exactly two values: p_0 and p_1
p_s = np.array([[p_0, p_1] for p_0, p_1 in venn_abers_outputs])

In [None]:
# Width of each Venn-Abers interval: |p_0 - p_1| for every query
abs_p = [np.abs(p_0 - p_1) for p_0, p_1 in p_s]

In [None]:
# Display the largest |p_0 - p_1| value across all queries
max(abs_p)

In [None]:
# Load the saved False Discovery Rate (FDR) control results.
# SECURITY NOTE: allow_pickle=True makes np.load unpickle Python objects, which
# can execute arbitrary code — only load files from a trusted source.
fdr = np.load('/data/ron/protein-conformal/data/pfam_fdr_2024-06-25.npy', allow_pickle=True)

In [None]:
# Unwrap the single object stored in the loaded array (used below as a mapping
# via fdr.keys() and fdr['lhats']).
# Guarded so that re-running this cell after `fdr` has already been unwrapped
# is a no-op instead of an AttributeError (a dict has no .item()).
if isinstance(fdr, np.ndarray):
    fdr = fdr.item()

In [None]:
# List the keys available in the FDR results mapping
fdr.keys()

In [None]:
# Display the mean of the stored 'lhats' values (the candidate threshold)
np.mean(fdr['lhats'])

In [None]:
# Threshold used to accept predictions: the mean of the stored 'lhats' values.
# The cells below treat queries with D_max > l_hat as confident hits.
# NOTE(review): "greater than" selects hits, so the values in D appear to be
# similarity-like scores (higher = closer) rather than distances — confirm.
# Previously hard-coded mean result:
# l_hat = 0.999980225003127 # mean result
l_hat = np.mean(fdr['lhats'])

In [None]:
# Display how many queries have D_max strictly above the threshold
(D_max > l_hat).sum()

In [None]:
# Keep the neighbour-index rows of I only for queries whose D_max exceeds l_hat
filtered_I = I[D_max > l_hat]

In [None]:
# Histogram of the per-query mean of p_0 and p_1 (mean over axis 1 of p_s),
# computed for all queries, not only the ones above the threshold
plt.hist(np.mean(p_s, axis=1))

In [None]:
# First (column 0) neighbour index for each retained query; used below as row
# positions into the Pfam-annotated lookup table
first_entries = filtered_I[:, 0]

In [None]:
# Organize prediction results for the confident hits only, so that the three
# columns below have one entry per retained query and line up row by row.
# (Previously these held the full lookup-label array and the unfiltered
# per-query arrays, whose lengths do not match each other.)
first_entries_meta = col_meta_data[first_entries]         # Pfam label of each matched lookup protein
first_entries_D = D_max[D_max > l_hat]                    # score of each retained query
first_entries_p_s = np.mean(p_s, axis=1)[D_max > l_hat]   # mean of p_0 and p_1 for each retained query


In [None]:
# Assemble the summary table: one column each for the Pfam label, the score D
# and the averaged Venn-Abers probability
first_entries_df = pd.DataFrame(
    dict(
        Pfam=first_entries_meta,
        D=first_entries_D,
        p_s=first_entries_p_s,
    )
)

In [None]:
# Display the summary table built in the previous cell
first_entries_df

In [None]:
# Build the detailed hits table: metadata of each matched lookup protein,
# joined positionally with the sequence and name of the query that matched it.
selected = D_max > l_hat  # same mask that produced filtered_I / first_entries
df_hits = (
    col_lookup.iloc[first_entries]
    .drop(columns='Unnamed: 0')
    .reset_index(drop=True)
    .assign(**{
        'query sequence': np.array(query_fastas)[selected],
        'query_name': np.array(query_metadata)[selected],
    })
)

# Move the query columns to the front; all other columns keep their order
first_columns = ['query_name', 'query sequence']
remaining_columns = list(filter(lambda name: name not in first_columns, df_hits.columns))
new_column_order = first_columns + remaining_columns
df_hits = df_hits[new_column_order]

In [None]:
# Import Biopython for pairwise sequence alignment
# This will help us calculate sequence identity between query and matched proteins
# from Bio import pairwise2  # Legacy alignment method
# from Bio.pairwise2 import format_alignment
from Bio.Align import PairwiseAligner  # Modern alignment method


In [None]:
# Define function to calculate sequence identity between proteins
# This provides a traditional measure of similarity based on exact amino acid matches
def seq_identity(seq1, seq2):
    """
    Calculate the sequence identity between two sequences using pairwise alignment.

    The sequences are aligned with Biopython's PairwiseAligner (default
    settings) and identical residues are counted only at positions that the
    best alignment pairs up.
    Sequence identity = (identical aligned residues) / (length of longer sequence) * 100

    Parameters:
    seq1 (str): First protein sequence
    seq2 (str): Second protein sequence

    Returns:
    float: Sequence identity percentage (0-100). Returns 0.0 when either
           sequence is empty, since nothing can be aligned.
    """
    # Guard: an empty sequence cannot be aligned and would otherwise lead to
    # an aligner error or a division by zero below.
    if not seq1 or not seq2:
        return 0.0

    aligner = PairwiseAligner()
    best_alignment = aligner.align(seq1, seq2)[0]

    # `aligned` holds, for each sequence, the (start, end) index ranges of the
    # gap-free blocks that are paired with each other. Count identical
    # residues inside those blocks rather than comparing the raw, unaligned
    # strings position by position.
    matches = 0
    for (start1, end1), (start2, end2) in zip(*best_alignment.aligned):
        matches += sum(
            a == b for a, b in zip(seq1[start1:end1], seq2[start2:end2])
        )

    length = max(len(seq1), len(seq2))
    return matches / length * 100

In [None]:
# Add a 'seq_identity' column: for every row, compare the query sequence with
# the matched lookup protein's sequence using seq_identity
df_hits['seq_identity'] = df_hits.apply(
    lambda row: seq_identity(row['query sequence'], row['Sequence']),
    axis=1,
)

In [None]:
# Histogram of the 'seq_identity' column of the hits table
df_hits['seq_identity'].hist()

In [None]:
# Preview the first rows of the hits table
df_hits.head()

In [None]:
# Write the hits table to CSV (without the row index).
# NOTE: this overwrites any existing file at the hard-coded path.
df_hits.to_csv('/data/ron/protein-conformal/data/gene_unknown/unknown_aa_seqs_pfam_hits.csv', index=False)

In [None]:
# Histogram of the per-query scores (30 bins) with the acceptance threshold
# drawn as a dashed red vertical line; queries to the right of it are hits
ax = sns.histplot(D_max, bins=30)
ax.axvline(l_hat, color='r', linestyle='--')


In [None]:
# Calculate summary statistics for the results.
# The hit criterion is the strict `D_max > l_hat` used everywhere else in this
# notebook (the hit count, filtered_I and the exported df_hits), so these
# counts always agree with the exported hits; everything else is a non-hit.
total_count = len(D_max)                 # Total number of query proteins analyzed
hits_count = np.sum(D_max > l_hat)       # Queries above the threshold
no_hits_count = total_count - hits_count # Queries at or below the threshold

In [None]:
# Final summary figure: pie chart of hits vs. no hits among the query proteins
sns.set_theme(style="whitegrid")

# Slice labels carry both the absolute count and the percentage
labels = [
    f'Hits: {hits_count} ({hits_count / total_count * 100:.1f}%)',
    f'No Hits: {no_hits_count} ({no_hits_count / total_count * 100:.1f}%)'
]
sizes = [hits_count, no_hits_count]
colors = sns.color_palette()[:2]  # first two colours of the active palette
explode = (0.1, 0)                # pull the "hits" slice slightly outward

# Draw on an explicit figure/axes pair and write the result to disk
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.set_title(f'Proportion of Protein Hits in JCVI Syn3.0 genes\n of unknown function (Total: {total_count})')
fig.savefig('/data/ron/protein-conformal/figs/pfam_new/protein_hits_pie.pdf', bbox_inches='tight')