In [None]:
'''Adapted from https://github.com/FunctionLab/ExPecto/blob/master/chromatin.py'''

import argparse
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import pickle
from torch import nn
import torch.nn.functional as F


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the file paths used later in
# this notebook all live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Residual MLP block: LayerNorm, then two Linear -> ReLU -> Dropout stages.

    The transformed features are added back onto the block's input, so the
    input and output both have ``dim`` features.

    Args:
        dim: Feature width of the input; every Linear layer maps dim -> dim.
        dropout: Dropout probability used after each ReLU.
    """

    def __init__(self, dim, dropout=0.3):
        super().__init__()
        # The position of each layer determines its ``block.<idx>`` key in the
        # state dict, so the order here must stay stable for saved weights.
        layers = [
            nn.LayerNorm(dim),
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` plus the output of the inner layer stack applied to ``x``."""
        transformed = self.block(x)
        return x + transformed



class ExpressionPredictor(nn.Module):
    """Frozen backbone plus a trainable residual-MLP regression head.

    The head is: Linear(backbone hidden size -> hidden_dim), a stack of
    ``num_blocks`` ResidualBlock modules, then Linear(hidden_dim -> output_dim).

    Args:
        base_model: Backbone module. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` / ``attention_mask``, return an
            object with a ``last_hidden_state`` attribute.
        hidden_dim: Width of the head's hidden representation.
        output_dim: Number of values predicted per input sequence.
        num_blocks: Number of residual blocks in the head.
    """

    def __init__(self, base_model, hidden_dim=2048, output_dim=218, num_blocks=6):
        super().__init__()
        self.base_model = base_model

        # The backbone is used purely as a feature extractor: disable gradients
        # on every one of its parameters.
        for weight in self.base_model.parameters():
            weight.requires_grad_(False)

        backbone_width = base_model.config.hidden_size
        self.input_proj = nn.Linear(backbone_width, hidden_dim)

        # Residual trunk of the head.
        trunk = [ResidualBlock(hidden_dim, dropout=0.3) for _ in range(num_blocks)]
        self.res_blocks = nn.Sequential(*trunk)

        # Maps the hidden representation to the final predictions.
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask=None):
        """Predict ``output_dim`` values for each sequence in the batch.

        The backbone runs without gradient tracking; only the hidden state at
        sequence position 0 is passed on to the head, after a cast to float32.
        """
        with torch.no_grad():
            backbone_out = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
            first_token_state = backbone_out.last_hidden_state[:, 0]

        hidden = self.input_proj(first_token_state.float())
        hidden = self.res_blocks(hidden)
        return self.output_layer(hidden)


In [None]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and the pretrained backbone (half precision to reduce memory)
model_name = "InstaDeepAI/nucleotide-transformer-2.5b-multi-species"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
max_length = tokenizer.model_max_length


# Load the saved downstream head first so the model can be sized from it.
# map_location lets a checkpoint that was saved from GPU tensors be restored
# when this notebook falls back to CPU.
checkpoint = torch.load(
    "/content/drive/MyDrive/CSCI/CSCI2952G/Final Project/data/model_weights/downstream_weights.pt",
    map_location=device,
)

# Build the downstream model.
# NOTE: the previous call was `ExpressionPredictor(base_model, output_dim)`.
# The second positional parameter of ExpressionPredictor is `hidden_dim`, not
# `output_dim`, so that call silently set the hidden width to 218. Read the
# hidden width from the checkpoint instead (input_proj is an nn.Linear, whose
# weight has shape (out_features, in_features)) and pass both sizes by keyword.
output_dim = 218
hidden_dim = checkpoint['input_proj']['weight'].shape[0]
model = ExpressionPredictor(base_model, hidden_dim=hidden_dim, output_dim=output_dim).to(device)

# Load each component of the saved downstream model; load_state_dict raises if
# any tensor name or shape does not match the freshly built head.
model.input_proj.load_state_dict(checkpoint['input_proj'])
model.res_blocks.load_state_dict(checkpoint['res_blocks'])
model.output_layer.load_state_dict(checkpoint['output_layer'])

# Switch to inference mode (disables dropout in the head).
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/46.0k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.91G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/278M [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/48.1k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of EsmModel were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-2.5b-multi-species and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ExpressionPredictor(
  (base_model): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(4105, 2560, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(1002, 2560, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-31): 32 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=2560, out_features=2560, bias=True)
              (key): Linear(in_features=2560, out_features=2560, bias=True)
              (value): Linear(in_features=2560, out_features=2560, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=2560, out_features=2560, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((2560,), eps=1e-12, elementwise_affine=True)
          )
  

In [None]:
# Read the merged annotation/expression table and discard the "Unnamed: 0"
# column (presumably an index that was written into the CSV; verify).
geneanno_merged = pd.read_csv(
    '/content/drive/MyDrive/CSCI/CSCI2952G/Final Project/data/human/sequence_exp_6000.csv'
).drop(columns="Unnamed: 0")

# Keep only the chromosome 8 rows as the test split.
is_chr8 = geneanno_merged['seqnames'] == 'chr8'
test_df = geneanno_merged.loc[is_chr8]

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset

def extract_expression(df):
    """Split a gene table into sequences and expression values.

    The expression columns are taken to be every column located strictly
    between the ``'type'`` column and the ``'seq'`` column.

    Args:
        df: DataFrame containing ``'type'`` and ``'seq'`` columns.

    Returns:
        A tuple ``(sequences, expression_vals, tissues)`` where ``sequences``
        is the ``'seq'`` column, ``expression_vals`` is the DataFrame slice of
        expression columns, and ``tissues`` is the list of those column names.
    """
    columns = df.columns
    first_expression_col = columns.get_loc('type') + 1
    seq_col = columns.get_loc('seq')

    expression_vals = df.iloc[:, first_expression_col:seq_col]
    return df['seq'], expression_vals, list(expression_vals.columns)

class GeneExpressionDataset(Dataset):
    """Dataset yielding ``(input_ids, log_expression)`` pairs, one per row.

    Args:
        dataframe: Table handed to ``extract_expression`` to obtain the
            sequences, the expression columns, and the tissue names.
        tokenizer: Callable used to tokenize a single sequence string.
        max_length: Length every tokenized sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_len = max_length
        sequences, expression_vals, tissues = extract_expression(dataframe)
        self.sequences = sequences
        self.expression_vals = expression_vals
        self.tissues = tissues

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the token ids and log-expression targets for row ``idx``."""
        sequence = self.sequences.iloc[idx]

        raw_expression = np.array(self.expression_vals.iloc[idx], dtype=np.float32)
        # A small offset keeps the logarithm finite when expression is zero.
        log_expression = np.log(raw_expression + 1e-8)

        encoding = self.tokenizer(
            sequence,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_len,
            truncation=True,
        )
        # Drop the leading batch axis added by return_tensors="pt".
        input_ids = encoding["input_ids"].squeeze(0)
        targets = torch.tensor(log_expression, dtype=torch.float32)
        return input_ids, targets

In [None]:
from torch.utils.data import DataLoader

# Wrap the test split in a Dataset; sequences are padded/truncated to the
# tokenizer's maximum length.
test_dataset = GeneExpressionDataset(
    dataframe=test_df,
    tokenizer=tokenizer,
    max_length=tokenizer.model_max_length,
)

# Iterate in the original row order (no shuffling) in batches of 32.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,id,symbol,seqnames,strand,TSS,CAGE_representative_TSS,type,Adipose_Subcutaneous,Adipose_Visceral_Omentum,Adrenal_Gland,...,GM12878.1,HELA,HEPG2,HMEC,HSMM,HUVEC,K562.1,NHEK,NHLF,seq
37,ENSG00000003987,MTMR7,chr8,-,17271037,17270816,protein_coding,0.077622,0.252681,0.098076,...,0.0,0.66,0.167,0.0,0.013,0.199,0.003,0.003,0.007,tcacctcaaacattgatcatttctttgtggtgagaacattcaaagt...
38,ENSG00000003989,SLC7A2,chr8,+,17354597,17354597,protein_coding,22.717374,19.965932,4.823932,...,0.022,143.656,96.7,0.055,5.733,12.632,0.004,0.377,1.007,CCATCTGAAAAAAATTGGAGGATTTTAGCTTGTCTTCGCTAAAATA...
202,ENSG00000008513,ST3GAL1,chr8,-,134584183,134584130,protein_coding,9.402268,14.498145,14.012907,...,19.429,6.259,17.641,5.318,2.767,18.387,2.665,3.849,11.2,GAACCCATTAGGATCTTTCCACTCTGCTGTGCCAAGAAACGGGGGT...
208,ENSG00000008853,RHOBTB2,chr8,+,22844930,22857061,protein_coding,3.499919,3.244609,5.117711,...,9.565,19.052,4.492,19.977,6.962,10.302,0.444,12.977,5.171,ccttgttgtgttttatgacttagatatgtgtaatattcttacgcac...
211,ENSG00000008988,RPS20,chr8,-,56987140,56987065,protein_coding,321.667211,322.98683,176.784904,...,339.929,218.837,194.637,244.717,223.891,157.246,348.476,223.355,196.812,gttggccaggccgatctcgagcttctgactccaggtgacctacccg...


In [None]:
# Per-batch results are collected here and stitched together after the loop.
pred_batches = []
target_batches = []

num_batches = len(test_loader)
pad_id = tokenizer.pad_token_id

with torch.no_grad():
    for batch_idx, (input_ids, targets) in enumerate(test_loader):
        input_ids = input_ids.to(device)
        # Attend to every position that is not a padding token.
        attention_mask = (input_ids != pad_id).long().to(device)

        # Forward pass
        preds = model(input_ids, attention_mask).float()

        # Move results to host memory as NumPy arrays.
        pred_batches.append(preds.cpu().numpy())
        target_batches.append(targets.cpu().numpy())

        # Report progress on every fifth batch.
        if batch_idx % 5 == 0:
            print(f"Processed {batch_idx+1}/{num_batches} batches")

# Stack the batches along the sample axis.
all_preds = np.concatenate(pred_batches, axis=0)
all_targets = np.concatenate(target_batches, axis=0)

Processed 1/32 batches
Processed 6/32 batches
Processed 11/32 batches
Processed 16/32 batches
Processed 21/32 batches
Processed 26/32 batches
Processed 31/32 batches


In [None]:
import numpy as np
from scipy.stats import spearmanr

# Prediction and actual arrays: (genes x tissues)
actual = np.array(all_targets)
predicted = np.array(all_preds)

# Per-gene Spearman: correlate each row (one gene across all tissues).
gene_corrs = [
    spearmanr(actual_row, predicted_row)[0]
    for actual_row, predicted_row in zip(actual, predicted)
]
# nanmean skips entries where the correlation came out as NaN.
mean_gene_rho = np.nanmean(gene_corrs)

# Per-tissue Spearman: correlate each column (one tissue across all genes).
tissue_corrs = [
    spearmanr(actual_col, predicted_col)[0]
    for actual_col, predicted_col in zip(actual.T, predicted.T)
]
mean_tissue_rho = np.nanmean(tissue_corrs)

print(f"Mean Spearman rho across tissues (per gene): {mean_gene_rho:.3f}")
print(f"Mean Spearman rho across genes (per tissue): {mean_tissue_rho:.3f}")


Mean Spearman rho across tissues (per gene): 0.367
Mean Spearman rho across genes (per tissue): 0.713
