In [None]:
# GOAL: Analyze newly discovered proteins by comparing them to existing proteins
# using Pfam domain annotations and protein embeddings for similarity search

# This notebook:
# 1. Loads new proteins (UniProt entries created after 2022-05-25) and the existing reference protein database
# 2. Filters proteins that have Pfam domain annotations
# 3. Uses AI embeddings to find similar proteins based on sequence
# 4. Creates data for conformal prediction analysis

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
from protein_conformal.util import load_database, query

In [None]:
## STEP 1: Load and filter NEW proteins (query set)
# The UniProt export lists every reviewed protein together with its annotations.
filename = 'uniprotkb_AND_reviewed_true_2023_07_03.tsv'

# Probe the working directory, ../data and data (in that order) for the file.
search_paths = (
    filename,
    os.path.join('..', 'data', filename),
    os.path.join('data', filename),
)
for candidate in search_paths:
    if os.path.exists(candidate):
        data_path = candidate
        break
else:
    raise FileNotFoundError(f"Cannot find {filename}")

meta_data_new = pd.read_csv(data_path, sep='\t')

# Query set: entries created after 2022-05-25 whose sequence has at most 2000 residues.
# Dates are compared as strings. The sequence length is kept in a new 'length' column.
new_proteins = (
    meta_data_new.loc[meta_data_new['Date of creation'] > '2022-05-25']
    .reset_index(drop=True)
    .assign(length=lambda frame: frame['Sequence'].str.len())
    .loc[lambda frame: frame['length'] <= 2000]
    .reset_index(drop=True)
)

In [4]:
# Most recent creation date among the query proteins
new_proteins['Date of creation'].max()

'2023-06-28'

In [5]:
# (rows, columns) of the full UniProt table as loaded, before the date and length filters
meta_data_new.shape

(569793, 16)

In [None]:
## STEP 2: Load REFERENCE proteins (lookup database)
# Metadata of the existing proteins that the query set is compared against.
filename = 'lookup_embeddings_meta_data.tsv'

# Probe the working directory, ../data and data (in that order) for the file.
search_paths = (
    filename,
    os.path.join('..', 'data', filename),
    os.path.join('data', filename),
)
for candidate in search_paths:
    if os.path.exists(candidate):
        data_path = candidate
        break
else:
    raise FileNotFoundError(f"Cannot find {filename}")

# Reference metadata table (expected to hold roughly 540k proteins)
lookup_proteins_meta = pd.read_csv(data_path, sep="\t")

In [None]:
# Count query proteins whose 'Entry' accession also occurs in the reference metadata.
# The expected value is 0: no query protein should already be part of the reference
# database, otherwise the evaluation would not be on unseen proteins.
new_proteins['Entry'].isin(lookup_proteins_meta['Entry']).sum()

0

In [None]:
## STEP 3: Load AI embeddings for reference proteins
# One embedding vector per reference protein, stored as a single NumPy array.
filename = 'lookup_embeddings.npy'

# Probe the working directory, ../data and data (in that order) for the file.
search_paths = (
    filename,
    os.path.join('..', 'data', filename),
    os.path.join('data', filename),
)
for candidate in search_paths:
    if os.path.exists(candidate):
        data_path = candidate
        break
else:
    raise FileNotFoundError(f"Cannot find {filename}")

# Reference embeddings (expected shape: about 540k x 512)
lookup_embeddings = np.load(data_path)

In [9]:
# Embeddings and metadata are paired purely by row position (the Pfam filter later
# selects embedding rows using the metadata rows), so their row counts must agree.
print(lookup_embeddings.shape, lookup_proteins_meta.shape)
assert lookup_embeddings.shape[0] == len(lookup_proteins_meta), (
    "lookup embeddings and lookup metadata have different numbers of rows"
)

(540560, 512) (540560, 17)


In [None]:
# Embeddings for the NEW proteins (query set), one row per protein in new_proteins.
filename = 'new_protein_embeddings.npy'

# Probe the working directory, ../data and data (in that order) for the file.
search_paths = (
    filename,
    os.path.join('..', 'data', filename),
    os.path.join('data', filename),
)
for candidate in search_paths:
    if os.path.exists(candidate):
        data_path = candidate
        break
else:
    raise FileNotFoundError(f"Cannot find {filename}")

# Query embeddings (expected shape: about 2295 x 512)
query_embeddings = np.load(data_path)

In [11]:
# Verify that there is exactly one embedding row per query protein. The two are paired
# by row position, so a mismatch would silently attach embeddings to the wrong proteins.
print(query_embeddings.shape, new_proteins.shape)
assert query_embeddings.shape[0] == len(new_proteins), (
    "query embeddings and new_proteins have different numbers of rows"
)

(2295, 512) (2295, 17)


In [None]:
## STEP 4: Focus on Pfam domain analysis
# `column` names the annotation column used by all later cells: the reference and
# query sets are both restricted to rows where it is non-null, and its values are
# what the annotation matching step compares between a query and its neighbours.
# Restricting both sides to annotated proteins ensures that:
# 1. Query and reference proteins have known Pfam domains
# 2. Retrieved neighbours can be judged by shared domain annotations
column = 'Pfam'

In [None]:
# Filter REFERENCE proteins: only keep those that have an annotation in `column`.
# One boolean row mask selects metadata and embeddings together, so the pairing does not
# rely on DataFrame index labels happening to equal array positions. NumPy also raises an
# IndexError if the mask length differs from the number of embedding rows.
has_annotation = lookup_proteins_meta[column].notna().to_numpy()
col_embeddings = lookup_embeddings[has_annotation]  # embeddings of the retained rows, same order
# Reset the index so that row i of the metadata corresponds to row i of col_embeddings
col_proteins_meta = lookup_proteins_meta[has_annotation].reset_index(drop=True)

In [None]:
# Filter QUERY proteins: only keep new proteins that have an annotation in `column`.
# One boolean row mask selects metadata and embeddings together, so the pairing does not
# rely on DataFrame index labels happening to equal array positions. NumPy also raises an
# IndexError if the mask length differs from the number of embedding rows.
query_has_annotation = new_proteins[column].notna().to_numpy()
new_proteins_embeddings = query_embeddings[query_has_annotation]  # embeddings of the retained rows
# Reset the index so that row i of the metadata corresponds to row i of new_proteins_embeddings
new_proteins_meta = new_proteins[query_has_annotation].reset_index(drop=True)

In [15]:
# Preview the first rows of the annotated query proteins
new_proteins_meta.head()

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Gene Ontology (biological process),Gene Ontology (cellular component),Gene Ontology (molecular function),Gene Ontology (GO),EC number,Sequence,Date of creation,Gene3D,Pfam,length
0,A0A023I7E1,reviewed,ENG1_RHIMI,"Glucan endo-1,3-beta-D-glucosidase 1 (Endo-1,3...",ENG1 LAM81A,Rhizomucor miehei,796,cell wall organization [GO:0071555]; polysacch...,extracellular region [GO:0005576],"glucan endo-1,3-beta-D-glucosidase activity [G...",extracellular region [GO:0005576]; glucan endo...,3.2.1.39,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,2022-12-14,1.10.287.1170;1.20.5.420;,PF17652;PF03639;,796
1,A0A061AE05,reviewed,PAPSH_CAEEL,Bifunctional 3'-phosphoadenosine 5'-phosphosul...,pps-1 T14G10.1,Caenorhabditis elegans,654,3'-phosphoadenosine 5'-phosphosulfate biosynth...,nucleus [GO:0005634],adenylylsulfate kinase activity [GO:0004020]; ...,nucleus [GO:0005634]; adenylylsulfate kinase a...,2.7.1.25; 2.7.7.4,MLTPRDENNEGDAMPMLKKPRYSSLSGQSTNITYQEHTISREERAA...,2023-02-22,3.40.50.620;3.40.50.300;3.10.400.10;,PF01583;PF01747;PF14306;,654
2,A0A072VDF2,reviewed,CCR1_MEDTR,Cinnamoyl-CoA reductase 1 (Mt-CCR1) (EC 1.2.1....,CCR1 MTR_2g104960 MtrunA17_Chr2g0333781,Medicago truncatula (Barrel medic) (Medicago t...,342,lignin biosynthetic process [GO:0009809]; phen...,cytoplasm [GO:0005737],cinnamoyl-CoA reductase activity [GO:0016621];...,cytoplasm [GO:0005737]; cinnamoyl-CoA reductas...,1.2.1.-; 1.2.1.44,MPAATAAAAAESSSVSGETICVTGAGGFIASWMVKLLLEKGYTVRG...,2023-02-22,3.40.50.720;,PF01370;,342
3,A0A076FFM5,reviewed,F8H1_OCIBA,"Flavonoid 8-hydroxylase 1, chloroplastic (ObF8...",F8H-1,Ocimum basilicum (Sweet basil),523,flavonoid metabolic process [GO:0009812],chloroplast [GO:0009507]; chloroplast membrane...,"2 iron, 2 sulfur cluster binding [GO:0051537];...",chloroplast [GO:0009507]; chloroplast membrane...,1.14.15.-,MPFPMEVLQASSLSFPLLRRHSRNNLINKFRNPTLPRIDIPRQNID...,2022-12-14,2.102.10.10;,PF08417;PF00355;,523
4,A0A0B4U9L8,reviewed,VMF1_VIPAA,Zinc metalloproteinase-disintegrin-like protei...,,Vipera ammodytes ammodytes (Western sand viper),614,envenomation resulting in modulation of blood ...,extracellular region [GO:0005576]; extraorgani...,metalloendopeptidase activity [GO:0004222]; me...,extracellular region [GO:0005576]; extraorgani...,3.4.24.-,MLQVLLVTICLAVFPYQGSSIILESGNVNDYEVVYPQKLTALLKGA...,2023-02-22,3.40.390.10;4.10.70.10;,PF08516;PF00200;PF01562;PF01421;,614


In [16]:
# Number of query proteins left after requiring a non-null annotation in `column`
len(new_proteins_meta)

1864

In [17]:
# Boolean flag per annotated query protein: True when the entry was created after 2022-12-14
after = new_proteins_meta['Date of creation'].gt('2022-12-14')

In [18]:
# Persist the `after` mask. This file is an OUTPUT of the notebook, so it must not be
# required to exist beforehand (a fresh run would otherwise stop here).
filename = 'new_proteins_after_cutoff.npy'

# Directories are listed in the same order the input cells probe them.
candidate_dirs = [os.path.join('..', 'data'), 'data']
existing_copies = [
    path
    for path in [filename] + [os.path.join(directory, filename) for directory in candidate_dirs]
    if os.path.exists(path)
]

if existing_copies:
    # Overwrite the copy that would have been found before this change.
    data_path = existing_copies[0]
else:
    # No previous copy: write into the first data directory that exists,
    # falling back to the working directory.
    target_dir = next((directory for directory in candidate_dirs if os.path.isdir(directory)), '')
    data_path = os.path.join(target_dir, filename)

np.save(data_path, after)

In [19]:
# Number of annotated query proteins created after 2022-12-14
(new_proteins_meta['Date of creation'] > '2022-12-14').sum()

870

In [None]:
## STEP 5: Similarity Search using AI embeddings
# Build the search structure over the annotated reference embeddings.
# load_database comes from protein_conformal.util; presumably it wraps a FAISS
# (Facebook AI Similarity Search) index - confirm against that module.
lookup_database = load_database(col_embeddings)

# k = number of neighbours retrieved per query protein.
# It is capped at the number of reference embeddings so that it never exceeds the database size.
# 10000 was chosen as a smaller value to limit memory use.
k = min(10000, col_embeddings.shape[0])   # Start with 10k for testing

# D = per-neighbour scores (plotted later as similarity scores).
# I = neighbour row positions; later used to index the annotations of col_proteins_meta,
#     i.e. positions refer to rows of col_embeddings / col_proteins_meta.
# NOTE(review): both are presumably arrays of shape (n_queries, k) - confirm in `query`.
D, I = query(lookup_database, new_proteins_embeddings, k)

In [21]:
# 1. Size and column names of the (unfiltered) reference metadata
print("lookup_proteins_meta shape:", lookup_proteins_meta.shape)
print("lookup_proteins_meta columns:", lookup_proteins_meta.columns.tolist())

# 2. Presence and coverage of the Pfam annotations
print("\nPfam column info:")
print("- Pfam column exists:", 'Pfam' in lookup_proteins_meta.columns)
pfam_values = lookup_proteins_meta['Pfam']
print("- Non-null Pfam entries:", pfam_values.notna().sum())
print("- Total entries:", len(lookup_proteins_meta))

# 3. A handful of non-null Pfam annotation strings
print("\nSample Pfam entries:")
pfam_samples = pfam_values.dropna().head(5)
print(pfam_samples.tolist())

# 4. Column dtypes as inferred when the table was read
print("\nData types:")
print(lookup_proteins_meta.dtypes)

lookup_proteins_meta shape: (540560, 17)
lookup_proteins_meta columns: ['Unnamed: 0', 'Entry', 'Reviewed', 'Entry Name', 'Protein names', 'Gene Names', 'Organism', 'Length', 'Gene Ontology (biological process)', 'Gene Ontology (cellular component)', 'Gene Ontology (molecular function)', 'Gene Ontology (GO)', 'EC number', 'Sequence', 'Date of creation', 'Gene3D', 'Pfam']

Pfam column info:
- Pfam column exists: True
- Non-null Pfam entries: 517951
- Total entries: 540560

Sample Pfam entries:
['PF13676;', 'PF01266;', 'PF19429;', 'PF19429;', 'PF19429;']

Data types:
Unnamed: 0                             int64
Entry                                 object
Reviewed                              object
Entry Name                            object
Protein names                         object
Gene Names                            object
Organism                              object
Length                                 int64
Gene Ontology (biological process)    object
Gene Ontology (cellular 

## Check for exchangeability

In [22]:
# Query proteins ordered from newest to oldest creation date, re-indexed 0..n-1
new_proteins_tmp = new_proteins.sort_values(by='Date of creation', ascending=False, ignore_index=True)

In [23]:
# Row count of the date-sorted query set (length filter applied, annotation filter not applied)
len(new_proteins_tmp)

2295

In [24]:
# Row at position ceil(n / 2) of the date-sorted query set, i.e. the entry at the midpoint
new_proteins_tmp.iloc[(len(new_proteins_tmp) + 1) // 2]

Entry                                                                            Q6AYL0
Reviewed                                                                       reviewed
Entry Name                                                                      TPC_RAT
Protein names                         Mitochondrial thiamine pyrophosphate carrier (...
Gene Names                                                        Slc25a19 LOC100910173
Organism                                                        Rattus norvegicus (Rat)
Length                                                                              318
Gene Ontology (biological process)    thiamine pyrophosphate transmembrane transport...
Gene Ontology (cellular component)    mitochondrial membrane [GO:0031966]; mitochond...
Gene Ontology (molecular function)    antiporter activity [GO:0015297]; thiamine pyr...
Gene Ontology (GO)                    mitochondrial membrane [GO:0031966]; mitochond...
EC number                       

In [25]:
from matplotlib import rcParams


In [None]:
## STEP 6: Visualize similarity score distribution
# Histogram over every retrieved (query, neighbour) score in D.

# Plot style and canvas
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(3.5, 2.5))

# Font settings
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
    'font.size': 14,
})

# Equal weight per score so that the bar heights sum to 1.0
scores = D.flatten()
weights = np.ones_like(scores) / float(len(scores))
ax.hist(scores, bins=100, weights=weights, color="skyblue")

# Axis labels and title
ax.set_xlabel('Similarity Scores', fontsize=14)
ax.set_ylabel('Density', fontsize=14)
ax.set_title(r'Protein-Vec: Histogram of Similarity Scores $S_{ij}$', fontsize=16)

# Keep the left and bottom spines, and switch both grids off
sns.despine(left=False, bottom=False)
for axis in (ax.yaxis, ax.xaxis):
    axis.grid(False)

# Only the minimum, median and maximum score are marked on the x-axis
min_sim, mid_sim, max_sim = np.min(D), np.median(D), np.max(D)
tick_positions = [min_sim, mid_sim, max_sim]
ax.set_xticks(tick_positions)
ax.set_xticklabels([f'{position:.5f}' for position in tick_positions], fontsize=14)

[Text(0.999722957611084, 0, '0.99972'),
 Text(0.9998669624328613, 0, '0.99987'),
 Text(1.000000238418579, 0, '1.00000')]

In [32]:
# fig.savefig('/data//protein_vec_histogram.svg', format='svg', dpi=300, bbox_inches='tight')

In [33]:
# fig.savefig('/data/ron/protein-conformal/figs/protein_vec_histogram.pdf', format='pdf', dpi=300, bbox_inches='tight')
# plt.show()

In [35]:
# Annotation values of the filtered reference proteins, one per row of col_proteins_meta.
# The same assignment is repeated at the start of the annotation matching cell.
col_meta_data = col_proteins_meta[column].values

In [None]:
## STEP 7: Create annotation matching analysis
# For each query protein, record which of its retrieved neighbours share Pfam
# annotations with it. The resulting list feeds the conformal prediction analysis.

# Annotation string of every reference protein, addressed by the neighbour positions in I
col_meta_data = col_proteins_meta[column].values
# Pull the query annotations out once instead of building a row Series per iteration
query_annotations = new_proteins_meta[column].tolist()

near_ids = []
for i in range(I.shape[0]):
    # Annotations of the neighbours retrieved for query i, in the order given by I[i]
    meta = col_meta_data[I[i]]
    # Annotation of the query protein itself
    meta_query = query_annotations[i]

    # Skip queries without an annotation. pd.isna recognises every missing-value
    # marker, whereas an `is np.nan` identity test only matches the np.nan singleton.
    # NOTE: a skipped query shifts later positions in near_ids relative to the rows of D and I.
    if pd.isna(meta_query):
        continue

    # EXACT match: the neighbour carries the identical annotation string
    mask_exact = [meta_query == neighbour for neighbour in meta]

    # PARTIAL match: substring test of the query's domain IDs against each neighbour.
    # Empty tokens are dropped, so the split no longer depends on a trailing ';'.
    query_domains = [domain for domain in meta_query.split(';') if domain]
    if len(query_domains) > 1:
        # Several domains: one list per neighbour, holding one boolean per query domain
        mask_partial = [[domain in neighbour for domain in query_domains] for neighbour in meta]
    else:
        # Single domain: one boolean per neighbour
        mask_partial = [meta_query in neighbour for neighbour in meta]

    # Store results for this query protein
    near_ids.append({
        'meta': meta,                    # annotations of the retrieved neighbours
        'meta_query': meta_query,        # annotation of the query protein
        'exact': mask_exact,             # per-neighbour exact-match flags
        'partial': mask_partial,         # per-neighbour partial-match flags (see shapes above)
        'S_i': D[i]                      # scores of the retrieved neighbours
    })

In [None]:
## STEP 8: Save results for conformal prediction analysis
# The saved list holds, per query protein:
# - the scores of its retrieved reference proteins
# - the exact and partial Pfam annotation match flags
# near_ids is a list of dicts, so NumPy stores it as a pickled object array.

output_path = 'data/pfam_new_proteins.npy'

# np.save does not create missing directories. Inputs may have been found in the working
# directory or in ../data, so data/ is not guaranteed to exist here.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.save(output_path, near_ids)

# Verify it worked:
print("✅ File saved successfully!")
print(f"File size: {os.path.getsize(output_path)} bytes")
print(f"Contains similarity and annotation data for {len(near_ids)} proteins")

✅ File saved successfully!
File size: 255618739 bytes
