In [1]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from collections import defaultdict
import faiss

In [3]:
# Load in uniprot meta data
meta_data_new = pd.read_csv('/home/ron/protein-vec/src_run/data/uniprotkb_AND_reviewed_true_2023_07_03.tsv', sep='\t')

In [4]:
#Now filter for the proteins that were newly discovered
new_proteins = meta_data_new[meta_data_new['Date of creation'] > '2022-05-25'].reset_index(drop=True)

In [6]:
lookup_proteins_meta = pd.read_csv('/home/ron/protein-vec/src_run/protein_vec_embeddings/lookup_embeddings_meta_data.tsv', sep="\t")
# were all of these trained on?

In [9]:
# new protein Entry is in the lookup database Entry
new_proteins['Entry'].isin(lookup_proteins_meta['Entry']).sum()


0

In [10]:
# get proteins in meta_data_new that are not in lookup_proteins_meta
new_proteins_not_in_lookup = new_proteins[~new_proteins['Entry'].isin(lookup_proteins_meta['Entry'])].reset_index(drop=True)

In [12]:
len(new_proteins_not_in_lookup), len(new_proteins)

(2350, 2350)

In [14]:
embeddings = np.load('/home/ron/protein-vec/src_run/protein_vec_embeddings/lookup_embeddings.npy')

In [23]:
column = 'Pfam'

In [33]:
# select proteins and their embeddings only for proteins that have a Pfam domain
col_proteins_meta = lookup_proteins_meta[~lookup_proteins_meta[column].isnull()]
col_embeddings = embeddings[col_proteins_meta.index]
# drop index to avoid confusion
col_proteins_meta = col_proteins_meta.reset_index(drop=True)

In [27]:
len(col_proteins_meta), len(col_embeddings)

(517951, 517951)

In [37]:
# split into train and test set and get the respective embeddings (using the index of the proteins)
np.random.seed(0)
# col_lookup = col_proteins_meta.sample(frac=0.8)
# col_query = col_proteins_meta[~col_proteins_meta.index.isin(col_lookup.index)]

# select 10000 proteins for the query set and the rest for the lookup set
col_lookup = col_proteins_meta.sample(n=10000)
col_query = col_proteins_meta[~col_proteins_meta.index.isin(col_lookup.index)]

col_lookup_embeddings = col_embeddings[col_lookup.index]
col_query_embeddings = col_embeddings[col_query.index]


In [41]:
col_meta_data = col_lookup[column].values

In [39]:
def load_database(lookup_database):
    # Build an indexed database
    d = lookup_database.shape[1]
    index = faiss.IndexFlatIP(d)
    faiss.normalize_L2(lookup_database)
    index.add(lookup_database)

    return index

def query(index, queries, k=10):
    faiss.normalize_L2(queries)
    D, I = index.search(queries, k)

    return (D, I)

In [40]:
# load database
lookup_database = load_database(col_lookup_embeddings)

# Query for the 1st nearest neighbor
k = 10000
D, I = query(lookup_database, col_query_embeddings, k)

In [44]:
# save D, I, col_query, col_lookup
np.save('lookup_embeddings_faiss_query_results.npy', D)
np.save('lookup_embeddings_faiss_query_indices.npy', I)
col_query.to_csv('lookup_embeddings_faiss_query_meta_data.tsv', sep="\t")
col_lookup.to_csv('lookup_embeddings_faiss_lookup_meta_data.tsv', sep="\t")


In [45]:
I.shape[0]

103590

In [46]:
meta = col_meta_data[I[0]]

In [49]:
meta_query = col_query.iloc[0][column]

In [50]:
meta_query

'PF01266;'

In [61]:
near_ids = []
# for i in range(I.shape[0]):
for i in range(10000):
    meta = col_meta_data[I[i]]
    meta_query = col_query.iloc[i][column]
    # if no annotation for the query, skip
    
    if meta_query is np.nan:
        continue
    # create boolean mask for proteins that have the same annotation as the query
    # print(meta_query)

    # technically, this is a partial hit. to get exact hits, use the following line

    # mask_query_in_lookup = [meta_query in meta[k] for k in range(len(meta))]
    # kind of confusing. delineate the cases of different hits based on number of Pfam/EC/etc annotations

    mask_exact = [meta_query == meta[k] for k in range(len(meta))]
    # partial hits
    if meta_query.count(';') > 1:
        available_pfs = meta_query.split(';')
        mask_partial = [[available_pfs[j] in meta[k] for j in range(len(available_pfs) - 1)] for k in range(len(meta))]
    else:
        mask_partial = [meta_query in meta[k] for k in range(len(meta))]

    
    # available_pfs = meta_query.split(';')
    # mask_partial = [[available_pfs[j] in meta[k] for j in range(len(available_pfs) - 1)] for k in range(len(meta))]
    
    near_ids.append({
        'meta': meta,
        'meta_query': meta_query,
        'exact': mask_exact,
        'partial': mask_partial,
        'S_i': D[i]
    })   

In [63]:
# save the results
np.save('conformal_pfam_with_lookup_dataset.npy', near_ids)

### Make the same dataset but for new proteins after the cutoff date

In [None]:
query_embeddings = np.load('new_protein_embeddings.npy')

In [None]:
# Filter for lookup proteins with annotations for the relavant aspect (don't want to transfer null annotations)
col_lookup = lookup_proteins_meta[~lookup_proteins_meta[column].isnull()]
col_lookup_embeddings = embeddings[col_lookup.index]
col_meta_data = col_lookup[column].values
