# Intrinsic dimension

Reference: <https://huggingface.co/blog/AmelieSchreiber/intrinsic-dimension-of-proteins>

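Use token-wise representations: each row of the embedding matrix is one token, and the dimension is estimated from subsets of those rows.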

If overfitting makes regularization necessary, choosing a LoRA rank lower than the intrinsic dimension estimated for that layer is likely to help.

In [9]:
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.spatial import distance_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from transformers import AutoTokenizer, AutoModel
import torch

In [10]:
# Root data directory; every other path below is derived from it.
data_dir = "/data/rajan/integrase"
fasta_dir = f"{data_dir}/fasta"

# Embedding directories are nested as emb/esm2_3B/{ORF,attB,attP}.
emb_dir = f"{data_dir}/emb"
esm2_3B_emb_dir = f"{emb_dir}/esm2_3B"
ORF_emb_dir, attB_emb_dir, attP_emb_dir = (
    f"{esm2_3B_emb_dir}/{kind}" for kind in ("ORF", "attB", "attP")
)

In [18]:
def get_embeddings(int_name, emb_dir, layer=36):
    """
    Load the stored representation of one sequence from ``{emb_dir}/{int_name}.pt``.

    Parameters:
    - int_name (str): Sequence name; surrounding whitespace is stripped before it is used as the file stem.
    - emb_dir (str): Directory that contains the ``.pt`` embedding files.
    - layer (int): Key into the file's "representations" mapping (default 36).

    Returns:
    - The object stored under ``["representations"][layer]``
      (presumably a (tokens, features) tensor; confirm against the embedding dump).
    """
    # map_location="cpu" lets files that were saved from a GPU load on any machine;
    # the downstream NumPy/SciPy code needs CPU data anyway.
    saved = torch.load(f"{emb_dir}/{int_name.strip()}.pt", map_location="cpu")
    return saved["representations"][layer]

In [12]:
def compute_persistent_score(embeddings):
    """
    Compute the persistent score of a set of embeddings: the total edge weight of the
    Euclidean minimum spanning tree (MST) built over them.

    Parameters:
    - embeddings (numpy.ndarray): A matrix where each row is an embedding.

    Returns:
    - float: The persistent score for the embeddings.
    """
    # A dense matrix passed to minimum_spanning_tree treats 0 as "no edge", so two
    # identical rows would be joined through a third point and inflate the total.
    # Duplicates add zero length to a true MST, so dropping them gives the correct sum.
    points = np.unique(np.asarray(embeddings), axis=0)
    dist_matrix = distance_matrix(points, points)
    mst = minimum_spanning_tree(dist_matrix)
    return float(mst.sum())

In [13]:
def sample_and_score(embeddings, n, k=8, hat_n=40, J=7, rng=None):
    """
    For various sample sizes, compute the median persistent score across J samples.

    Sample sizes start at hat_n and grow in k steps of (n - hat_n) / k (integer
    arithmetic), so the largest size stays below n.

    Parameters:
    - embeddings (numpy.ndarray): A matrix where each row is an embedding.
    - n (int): Total number of embeddings.
    - k (int): Number of different sample sizes.
    - hat_n (int): Smallest sample size; must not exceed n.
    - J (int): Number of samples for each sample size.
    - rng (numpy.random.Generator, optional): Source of randomness. When omitted,
      NumPy's global random state is used, as before.

    Returns:
    - list: List of sample sizes.
    - list: List of corresponding median persistent scores.

    Raises:
    - ValueError: If hat_n is larger than n, since rows are drawn without replacement.
    """
    if hat_n > n:
        raise ValueError(
            f"Cannot draw samples of size hat_n={hat_n} from only n={n} embeddings; "
            "pass a smaller hat_n."
        )

    # Default to the global state so existing callers (and np.random.seed) are unaffected.
    choose = np.random.choice if rng is None else rng.choice

    sizes = [(i - 1) * (n - hat_n) // k + hat_n for i in range(1, k + 1)]
    scores = []

    for size in sizes:
        subset_scores = [
            compute_persistent_score(embeddings[choose(n, size, replace=False)])
            for _ in range(J)
        ]
        # The median over J draws damps the effect of a single unlucky subset.
        scores.append(np.median(subset_scores))

    return sizes, scores

In [14]:
def estimate_dimension(sizes, scores):
    """
    Estimate the intrinsic dimension of the data using linear regression on log-transformed sizes and scores.

    The slope of log(score) against log(size) is converted to a dimension as
    1 / (1 - slope).

    Parameters:
    - sizes (list): List of sample sizes.
    - scores (list): List of corresponding median persistent scores.

    Returns:
    - float: Estimated dimension of the data.

    Raises:
    - ValueError: If the inputs are not 1-D sequences of equal length, hold fewer
      than two points, or contain values that are not strictly positive.
    """
    sizes_arr = np.asarray(sizes, dtype=float)
    scores_arr = np.asarray(scores, dtype=float)

    if sizes_arr.ndim != 1 or sizes_arr.shape != scores_arr.shape:
        raise ValueError("sizes and scores must be 1-D sequences of the same length.")
    if sizes_arr.size < 2:
        raise ValueError("At least two (size, score) pairs are needed to fit a slope.")
    # The logarithm is only defined for positive values; this comparison also rejects NaN.
    if not (np.all(sizes_arr > 0) and np.all(scores_arr > 0)):
        raise ValueError("sizes and scores must all be strictly positive.")

    log_sizes = np.log(sizes_arr).reshape(-1, 1)
    log_scores = np.log(scores_arr)

    reg = LinearRegression().fit(log_sizes, log_scores)
    slope = reg.coef_[0]

    # NOTE: a slope of 1 or more gives an infinite or negative result here.
    return 1 / (1 - slope)

In [27]:
def estimate_sequence_dimension(int_name, emb_dir, runs=5, seed=None):
    """
    Estimate the intrinsic dimension of one sequence by repeatedly sampling subsets of its
    token embeddings, computing their persistent scores, and then using linear regression
    on the log-transformed values.

    The log-log slope is fitted once per run; the slopes are averaged and the mean
    slope is converted to a dimension as 1 / (1 - slope).

    Parameters:
    - int_name (str): Name of the sequence whose embeddings are loaded.
    - emb_dir (str): Directory that holds the embedding files.
    - runs (int): Number of independent sampling runs to average over.
    - seed (int, optional): If given, NumPy's global random state is seeded with it
      before sampling so the estimate is reproducible. Note that this resets the
      global state for the rest of the session.

    Returns:
    - float: Estimated dimension of the sequence.

    Raises:
    - ValueError: If runs is smaller than 1.
    """
    if runs < 1:
        raise ValueError(f"runs must be at least 1, got {runs}.")
    if seed is not None:
        np.random.seed(seed)

    embeddings = get_embeddings(int_name, emb_dir)
    n = embeddings.shape[0]
    print(n)

    slopes = []
    for _ in range(runs):
        sizes, scores = sample_and_score(embeddings, n)
        log_sizes = np.log(sizes).reshape(-1, 1)
        log_scores = np.log(scores)

        reg = LinearRegression().fit(log_sizes, log_scores)
        slopes.append(reg.coef_[0])

    # Average the slopes first; the dimension is a non-linear function of the slope.
    kappa_F = np.mean(slopes)
    return 1 / (1 - kappa_F)

In [30]:
# Estimate the dimension for 'LPINT 124' from the files in ORF_emb_dir, averaging 2 runs.
int_name = 'LPINT 124'
dim = estimate_sequence_dimension(int_name, ORF_emb_dir, runs=2)
print(f"Estimated dimension of the protein sequence: {dim}")

1022
Estimated dimension of the protein sequence: 13.27396244023865


In [32]:
# Same estimate for 'LPINT 124', this time from the files in attB_emb_dir (2 runs).
int_name = 'LPINT 124' # best eff of 74.55 ("eff" presumably means efficiency; confirm)
dim = estimate_sequence_dimension(int_name, attB_emb_dir, runs=2)
print(f"Estimated dimension of the attB sequence for {int_name}: {dim}")

114
Estimated dimension of the attB sequence for LPINT 124: 9.798257989968421


In [33]:
# Same estimate for 'LPINT 124', this time from the files in attP_emb_dir (2 runs).
int_name = 'LPINT 124'
dim = estimate_sequence_dimension(int_name, attP_emb_dir, runs=2)
print(f"Estimated dimension of the attP sequence for {int_name}: {dim}")

114
Estimated dimension of the attP sequence for LPINT 124: 10.168652909579016


In [31]:
# Estimate the dimension for 'Int4' from the files in ORF_emb_dir, averaging 2 runs.
int_name = 'Int4' # mid eff 31.5 ("eff" presumably means efficiency; confirm)
dim = estimate_sequence_dimension(int_name, ORF_emb_dir, runs=2)
print(f"Estimated dimension of the protein sequence: {dim}")

1022
Estimated dimension of the protein sequence: 12.741481189204736


In [34]:
# Same estimate for 'Int4', this time from the files in attB_emb_dir (2 runs).
int_name = 'Int4' # mid eff 31.5 ("eff" presumably means efficiency; confirm)
dim = estimate_sequence_dimension(int_name, attB_emb_dir, runs=2)
print(f"Estimated dimension of the attB sequence for {int_name}: {dim}")

66
Estimated dimension of the attB sequence for Int4: 9.643788494354599


In [35]:
# Same estimate for 'Int4', this time from the files in attP_emb_dir (2 runs).
int_name = 'Int4' # mid eff 31.5 ("eff" presumably means efficiency; confirm)
dim = estimate_sequence_dimension(int_name, attP_emb_dir, runs=2)
print(f"Estimated dimension of the attP sequence for {int_name}: {dim}")

63
Estimated dimension of the attP sequence for Int4: 11.217583969202364
