In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, roc_auc_score
import scipy as sp
from sys import getsizeof
import time
from src.utils import load_embed
import torch

In [2]:
# Load table of contents: entry ids (rows) and the distinct EC numbers (columns)

dataset = 'new'
embed_type = 'esm'


rng = np.random.default_rng(seed=1234)
data_path = f"../data/{dataset}/"

# Read data table of contents csv (tab-separated despite the .csv extension)
df = pd.read_csv(data_path + f"{dataset}.csv", delimiter='\t').set_index('Entry')
entry_idxs = list(df.index)

# Collect every distinct EC number; a cell may hold several, separated by ';'.
# Sorted so the ordering is identical on every kernel restart (iterating a set
# of strings is subject to hash randomisation).
# NOTE(review): if the export separates values with '; ' the pieces keep a
# leading space, and a missing (NaN) cell would make .split fail - confirm
# against the data file.
ec_idxs = sorted(
    {ec for elt in df.loc[:, "EC number"] for ec in elt.split(';')}
)

n_samples = len(entry_idxs)
n_features = len(ec_idxs)

In [10]:
# Load the embedding of every entry, stack them into one numpy array and
# scale each row to unit Euclidean length.
# NOTE(review): element [1] of load_embed's return value is presumably the
# embedding itself - confirm against src.utils.
embeds = np.vstack([
    load_embed(data_path + f"{embed_type}/{entry}.pt", embed_key=33)[1]
    for entry in entry_idxs
])

row_norms = np.sqrt(np.square(embeds).sum(axis=1)).reshape(-1, 1)
embeds /= row_norms

In [11]:
# In batches of `batch_size` embeddings, matmul embeds against all embeds and
# save each resulting block of the similarity matrix to its own .npy file

batch_size = 20
fn_pref = f"protein_x_protein_{embed_type}_similarity_batch"
save_to = data_path + 'sim_mats/' + fn_pref

# np.save does not create missing directories
os.makedirs(data_path + 'sim_mats/', exist_ok=True)

# Ceiling division, so no empty trailing block is written when the number of
# embeddings is an exact multiple of batch_size
n_batches = -(-embeds.shape[0] // batch_size)
for i in range(n_batches):
    similarity_batch = embeds[i * batch_size : (i + 1) * batch_size] @ embeds.T
    np.save(save_to + f"_{i}.npy", similarity_batch)

In [3]:
# Load protein x function matrix into csr
y_fn = "protein_x_catalytic_function.npz"
rng = np.random.default_rng(seed=825)
path = f"../data/{dataset}/"
y = sp.sparse.load_npz(path + y_fn).tocsr()
row, col = y.nonzero()  # coordinates of every stored non-zero entry, taken before masking
n_rows, n_cols = y.shape


# Mask out
percent_to_mask = 0.01  # a fraction (0.01 == 1%), despite the name
n_mask = int((len(row) * percent_to_mask) / 2) # 50-50 split of 1s and 0s of 1% of elements

# Sample ones (without replacement, so no entry is held out twice)
print("Get ones to mask")
rnd_idxs = rng.choice(len(row), size=n_mask, replace=False)
mask_row = row[rnd_idxs].tolist()
mask_col = col[rnd_idxs].tolist()
y_true = [1] * n_mask

# Mask ones: zero the held-out entries, then drop them from the sparse structure
if n_mask:
    y[mask_row, mask_col] = 0

y.eliminate_zeros()

# Sample zeros
print("Get zeros to mask")
n0 = 0
sampled_ones_idxs = list(zip(mask_row, mask_col))

# A sampled "zero" must not be ANY original non-zero entry (masked or not),
# nor a coordinate that was already drawn as a zero.
taken = set(zip(row.tolist(), col.tolist()))
if n_mask > n_rows * n_cols - len(taken):
    raise ValueError("Not enough zero entries in y to sample the requested mask")

while n0 < n_mask:
    print(f"n0:{n0}", end='\r')
    sampled_idx = (int(rng.integers(0, n_rows)), int(rng.integers(0, n_cols)))

    if sampled_idx not in taken:
        taken.add(sampled_idx)
        mask_row.append(sampled_idx[0])
        mask_col.append(sampled_idx[1])
        y_true.append(0)
        n0 += 1

Get ones to mask
Get zeros to mask
n0:1

In [16]:
# For every saved similarity block: keep the k largest similarities per row,
# rescale each row by its sum, and multiply the result against y
k = 3
y_hat = []
for batch_idx in range(n_batches):
    sim_mat = np.load(f"{save_to}_{batch_idx}.npy")

    # k-threshold: zero everything strictly below the row's k-th largest value
    kth_largest = np.sort(sim_mat, axis=1)[:, -k].reshape(-1, 1)
    below_cutoff = sim_mat < kth_largest
    sim_mat[below_cutoff] = 0

    # Normalize: divide each row by the sum of its surviving entries
    row_totals = sim_mat.sum(axis=1).reshape(-1, 1)
    sim_mat /= row_totals

    y_hat.append(sp.sparse.csr_array(sim_mat @ y))

# Stack the per-batch blocks into the full y_hat
y_hat = sp.sparse.vstack(y_hat).tocsr()

In [17]:
# Predict and evaluate on the held-out (masked) coordinates

y_true = np.array(y_true)
# Scores of the propagated matrix at the masked coordinates, flattened to 1-D
y_pred = np.array(y_hat[mask_row, mask_col]).reshape(-1,)

# Accuracy needs hard labels: any positive score counts as a predicted 1
accuracy = accuracy_score(y_true, y_pred > 0)
# ROC AUC is a ranking metric: pass the continuous scores, not thresholded
# booleans, otherwise the curve collapses to a single operating point
roc_auc = roc_auc_score(y_true, y_pred)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC: {roc_auc}")


print("Done")

Accuracy: 1.0
ROC AUC: 1.0
Done


In [6]:
# Peek at the first five columns of y
y[:, 0:5]

<392x5 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [7]:
# Scratch check: a [-2:] slice keeps the last two elements
np.arange(1, 5)[-2:]

array([3, 4])