In [1]:
import pandas as pd
import numpy as np
import os
import sys
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import anndata as an
import scanpy as sc
import scipy
import glob
from transformers import BertConfig, BertForMaskedLM, AdamW, DataCollatorForLanguageModeling
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader, TensorDataset

source_path = os.path.abspath("../../source/")
sys.path.append(source_path)
import utils as ut
import matrix as matrix
import centrality as central

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if torch.cuda.is_available():
    print("CUDA is available")
    print("Number of GPUs:", torch.cuda.device_count())
    print("Number of Cores:", os.cpu_count())
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available")

CUDA is not available


# Get input

In [3]:
# load chromsizes 
fpath = "/scratch/indikar_root/indikar1/shared_data/population/references/GRCm39.chrom.sizes"
chroms = pd.read_csv(fpath, sep='\t', header=None, names=['chrom', 'size'])
chroms = chroms.head(20) # drop unplaced contigs

chroms['bp_start'] = chroms['size'].cumsum()

chrom_starts = dict(zip(chroms['chrom'].values, chroms['bp_start'].values))

chroms.head()

Unnamed: 0,chrom,size,bp_start
0,1,195154279,195154279
1,2,181755017,376909296
2,3,159745316,536654612
3,4,156860686,693515298
4,5,151758149,845273447


In [4]:
dpath = "/scratch/indikar_root/indikar1/shared_data/population/align_table/"

file_list = glob.glob(f"{dpath}*")
file_list

['/scratch/indikar_root/indikar1/shared_data/population/align_table/batch04.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/population/align_table/batch02.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/population/align_table/batch03.GRCm39.align_table.parquet',
 '/scratch/indikar_root/indikar1/shared_data/population/align_table/batch01.GRCm39.align_table.parquet']

In [5]:
df = []

resolution = 1e5

columns = [
    'read_name',
    'align_id',
    'chrom', 
    'ref_start', 
    'ref_end',
    'is_mapped',
]

for fpath in file_list:
    basename = os.path.basename(fpath).split(".")[0]
    tmp = pd.read_parquet(fpath, columns=columns)
    
    # only mapped monomers
    tmp = tmp[tmp['is_mapped']]
    
    # only chromosomal contigs (no unplaced contigs)
    tmp = tmp[tmp['chrom'].isin(chroms['chrom'].values)]
    
    # compute the midpoint of each alignment
    tmp['local_position'] = ((tmp['ref_end'] - tmp['ref_start']) // 2) + tmp['ref_start']
    
    # convert local coordinates to global bin loci at some resolution
    tmp['chrom_start'] = tmp['chrom'].map(chrom_starts)
    tmp['global_position'] = tmp['chrom_start'] + tmp['local_position']
    tmp['global_bin'] = tmp['global_position'].apply(lambda x: np.ceil(x / resolution))
    tmp = tmp[tmp['global_bin'].notna()]
    
    tmp['basename'] = basename
    
    # drop duplicate bins per read (only unique monomners in the contact)
    tmp = tmp.drop_duplicates(subset=['read_name', 'global_bin'])
    tmp['order'] = tmp.groupby('read_name')['global_bin'].transform('nunique')
    
    # drop all singletons
    tmp = tmp[tmp['order'] > 1]
    
    print(basename, tmp.shape)    
    df.append(tmp)

df = pd.concat(df)
print(f"{df.shape=}")
df['global_bin'] = df['global_bin'].astype(int)

df.head()

batch04 (4099946, 12)
batch02 (419546, 12)
batch03 (2192781, 12)
batch01 (3726776, 12)
df.shape=(10439049, 12)


Unnamed: 0,read_name,align_id,chrom,ref_start,ref_end,is_mapped,local_position,chrom_start,global_position,global_bin,basename,order
11,00001eac-9561-4bd2-a272-1b0a475d75e5,2701970,13,21752768,21753106.0,True,21752937.0,1887824000.0,1909577000.0,19096,batch04,3
15,00001eac-9561-4bd2-a272-1b0a475d75e5,2701974,13,9587846,9588362.0,True,9588104.0,1887824000.0,1897412000.0,18975,batch04,3
20,00001eac-9561-4bd2-a272-1b0a475d75e5,2701979,13,3847827,3847900.0,True,3847863.0,1887824000.0,1891672000.0,18917,batch04,3
22,00002cf2-3a39-4d41-b231-75b50a5df98a,246076,5,54958588,54958907.0,True,54958747.0,845273400.0,900232200.0,9003,batch04,3
24,00002cf2-3a39-4d41-b231-75b50a5df98a,246078,5,54733402,54733982.0,True,54733692.0,845273400.0,900007100.0,9001,batch04,3


In [6]:
"""Print the total number of bins (tokens) and the most frequent genomic loci"""
print(f"{df['global_bin'].nunique()=}")
print()
print(df['global_bin'].value_counts())

df['global_bin'].nunique()=24666

global_bin
4755     524824
13974    346644
10985     45110
20161     15817
14296     14827
          ...  
8058         16
28009        11
28010         7
25239         6
27557         2
Name: count, Length: 24666, dtype: int64


In [7]:
# create a bin map, just in case
bin_map = df[['chrom', 'chrom_start', 'global_bin']].drop_duplicates()
bin_map = bin_map.sort_values(by='global_bin')

print(bin_map.groupby('chrom')['global_bin'].nunique().head(20))

print()
bin_map.head()

chrom
1     1921
10    1274
11    1190
12    1171
13    1176
14    1217
15    1011
16     950
17     923
18     877
19     584
2     1787
3     1567
4     1533
5     1482
6     1465
7     1419
8     1264
9     1214
X     1662
Name: global_bin, dtype: int64



Unnamed: 0,chrom,chrom_start,global_bin
7110,1,195154279.0,1983
103408,1,195154279.0,1984
206712,1,195154279.0,1985
51175,1,195154279.0,1986
43221,1,195154279.0,1987


# structure the tokens

In [8]:
def format_input(token_list, max_length=12, pad_token=0):
    """A function to format a token list"""
    
    # truncate long
    if len(token_list) > max_length:
        token_list = token_list[:max_length]
    # pad short
    else:
        short = max_length - len(token_list)
        token_list = list(token_list) + ([pad_token] * short)
    return list(token_list)

# Sample concatemers

In [9]:
sample_size = 1000
read_names  = np.random.choice(df['read_name'].unique(), sample_size, replace=False)
len(read_names)

1000

In [10]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
max_length = 10
token_df = []

for read_name, group in df.groupby('read_name'):
    
    input_row = {
        'read_name' : read_name,
        'input_ids' : format_input(group['global_bin'].to_list(), max_length),
        'order' : len(group),
        'length' : max_length,
        'chroms' : group['chrom'].to_list(),
        'n_chroms' : group['chrom'].nunique(),
        'basename' : group['basename'].unique()[0],
    }
    
    token_df.append(input_row)
    
token_df = pd.DataFrame(token_df)
print(f"{token_df.shape=}")
token_df.head()

# SAMPLE THE DATA 

In [None]:
SAMPLE_SIZE  = 1000
sample = token_df.sample(SAMPLE_SIZE).reset_index(drop=True)
input_ids = torch.tensor(sample['input_ids'].to_list())
input_ids = input_ids.to(torch.float16)

print(f"{input_ids.shape=}")
print(f"{input_ids.device=}")

# Create dataset and dataloader
dataset = TensorDataset(input_ids)
data_loader = DataLoader(dataset, batch_size=8) 
print('done!')

# Build model

In [None]:
# Clear GPU cache
torch.cuda.empty_cache()
print("\n--- GPU Cache Cleared ---\n")

# Configuration Details
total_tokens = bin_map['global_bin'].nunique()
vocab_size = int(total_tokens + 1)
mask_token_id = int(total_tokens + 2)
unk_token_id = int(total_tokens + 3)

print(f"Total Unique Tokens: {total_tokens}")
print(f"Vocabulary Size: {vocab_size}")
print(f"Mask Token ID: {mask_token_id}")
print(f"Unknown Token ID: {unk_token_id}")

# Hyperparameters
masking_prob = 0.15
learning_rate = 1e-4
num_hidden_layers = 2
num_attention_heads = 2
output_shape = 10
num_epochs = 2

print("\n--- Model Hyperparameters ---")
print(f"Masking Probability: {masking_prob}")
print(f"Learning Rate: {learning_rate}")
print(f"Number of Hidden Layers: {num_hidden_layers}")
print(f"Number of Attention Heads: {num_attention_heads}")
print(f"Output Shape: {output_shape}")
print(f"Number of Epochs: {num_epochs}\n")

# Model Configuration
config = BertConfig(
    vocab_size=vocab_size,
    hidden_size=output_shape,
    num_hidden_layers=num_hidden_layers,
    output_hidden_states=True,
    num_attention_heads=num_attention_heads,
    intermediate_size=output_shape * 2,
    max_position_embeddings=max_length,
)

model = BertForMaskedLM(config)
print(model)
print("--- BERT Model Successfully Built ---\n") 

In [None]:
# Masking Function with Debugging Prints
def apply_masking(batch, mask_token_id=mask_token_id, mlm_probability=masking_prob):
    print("\n--- Inside Masking Function ---")
    print("Original Batch Shape:", batch.shape)
    labels = batch.clone()

    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        [i == 0 or i == len(b) - 1 for i in range(len(b))] for b in labels.tolist()
    ]

    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    print("Number of Masked Indices:", masked_indices.sum().item()) 

    labels[~masked_indices] = -100
    batch[masked_indices] = mask_token_id

    print("Modified Batch Shape:", batch.shape)
    print("Labels Shape:", labels.shape)
    print("Sample Masked Batch Values:", batch[0][:10])
    print("Sample Labels Values:", labels[0][:10])
    return batch, labels


# Training Loop with Debugging Prints
optimizer = AdamW(model.parameters(), lr=learning_rate, no_deprecation_warning=True)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"{device=}")
model.to(device)

for epoch in range(num_epochs):
    total_loss = 0.0
    print(f"\n--- Epoch {epoch + 1}/{num_epochs} ---")  # Print epoch start
    for batch in data_loader:
        optimizer.zero_grad()

        batch = batch[0].to(device) 
        print("Batch size:", batch.shape)  # Check size before masking
        print("Batch device:", batch.device)
        print("Batch type:", batch.type())
        
        # Handle OOV tokens (replace with UNK token)
        batch[batch >= vocab_size - 1] = unk_token_id
        batch, labels = apply_masking(batch) 
        
        batch = batch.to(torch.long)
        print("Batch type:", batch.type())

        # Forward pass
        outputs = model(batch, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # Accumulate loss

        # Backward pass
        loss.backward()
        optimizer.step()

        print("Batch Loss:", loss.item()) # Print loss for each batch
  
    # Print average loss after each epoch
    average_loss = total_loss / len(data_loader)
    print(f"Average Loss: {average_loss:.4f}\n") # Print average loss with higher precision

In [None]:
break

In [None]:
# torch.cuda.empty_cache()

# # Masking
# def apply_masking(batch, mask_token_id=mask_token_id, mlm_probability=masking_prob):
#     labels = batch.clone()  
#     probability_matrix = torch.full(labels.shape, mlm_probability)
    
#     # Create a mask for positions to be masked
#     special_tokens_mask = [
#         [i == 0 or i == len(b) - 1 for i in range(len(b))] for b in labels.tolist()
#     ]
#     probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
#     masked_indices = torch.bernoulli(probability_matrix).bool()
#     labels[~masked_indices] = -100  

#     # Replace masked indices with mask token id
#     batch[masked_indices] = mask_token_id
#     return batch, labels


# # Training Loop
# optimizer = AdamW(model.parameters(), lr=learning_rate, no_deprecation_warning=True)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
# model.to(device)

# for epoch in range(num_epochs):
#     total_loss = 0.0  # Track total loss for the epoch

#     for batch in data_loader:
#         optimizer.zero_grad()
        
#         # Apply masking using our custom function
#         batch = batch[0].to(device) 
        
#         # Handle OOV tokens (replace with UNK token)
#         batch[batch >= vocab_size - 1] = unk_token_id
#         batch, labels = apply_masking(batch)  

#         # Forward pass
#         outputs = model(batch, labels=labels)
#         loss = outputs.loss
#         total_loss += loss.item()  # Accumulate loss

#         # Backward pass
#         loss.backward()
#         optimizer.step()
    
#     # Print average loss after each epoch
#     print(f"Epoch {epoch + 1}/{num_epochs}, Total Loss: {total_loss:.2f}")

In [None]:
break

# Convert into "tokens"

In [None]:
max_length = 10
bin_names = df.T.columns.astype(int).to_numpy()

def format_input(token_list, max_length=12, pad_token=0):
    """A function to format a token list"""
    
    # truncate long
    if len(token_list) > max_length:
        token_list = token_list[:max_length]
    # pad short
    else:
        short = max_length - len(token_list)
        token_list = list(token_list) + ([pad_token] * short)
    return list(token_list)
    

tokens = df.T.apply(lambda x: bin_names[np.argwhere(x != 0).ravel()], axis=1)
tokens = tokens.reset_index()
tokens.columns = ['read_code', 'raw_input']
tokens['order'] = tokens['raw_input'].apply(lambda x: len(x))
tokens['input_ids'] = tokens['raw_input'].apply(lambda x: format_input(x, max_length=max_length))
tokens['length'] = tokens['input_ids'].apply(lambda x: len(x))
tokens.head()

In [None]:
# Example integer data (already tokenized)
input_ids = torch.tensor(tokens['input_ids'].to_list())
print(f"{input_ids.shape=}")

# Create dataset and dataloader
dataset = TensorDataset(input_ids)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True) 
print('done!')

# Train a model

In [None]:
vocab_size = 200
mask_token_id = 199
unk_token_id = 198
masking_prob = 0.15
learning_rate = 1e-4
num_hidden_layers = 2
num_attention_heads = 2
output_shape = 10
num_epochs = 3

print(f"{vocab_size=}")

# Model Configuration
config = BertConfig(
    vocab_size=vocab_size, 
    hidden_size=output_shape,
    num_hidden_layers=num_hidden_layers,
    output_hidden_states=True,
    num_attention_heads=num_attention_heads,
    intermediate_size=output_shape * 2,
    max_position_embeddings=max_length,
)

model = BertForMaskedLM(config)

# Masking
def apply_masking(batch, mask_token_id=mask_token_id, mlm_probability=masking_prob):
    labels = batch.clone()  
    probability_matrix = torch.full(labels.shape, mlm_probability)
    
    # Create a mask for positions to be masked
    special_tokens_mask = [
        [i == 0 or i == len(b) - 1 for i in range(len(b))] for b in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  

    # Replace masked indices with mask token id
    batch[masked_indices] = mask_token_id
    return batch, labels


# Training Loop
optimizer = AdamW(model.parameters(), lr=learning_rate)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  
model.to(device)

for epoch in range(num_epochs):
    total_loss = 0.0  # Track total loss for the epoch

    for batch in data_loader:
        optimizer.zero_grad()
        
        # Apply masking using our custom function
        batch = batch[0].to(device) 
        
        # Handle OOV tokens (replace with UNK token)
        batch[batch >= vocab_size - 1] = unk_token_id
        batch, labels = apply_masking(batch)  

        # Forward pass
        outputs = model(batch, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # Accumulate loss

        # Backward pass
        loss.backward()
        optimizer.step()
    
    # Print average loss after each epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Total Loss: {total_loss:.2f}")

# Extract learned embeddings

In [None]:
sample_size = 10000
sample = tokens.sample(sample_size)

new_input = torch.tensor(sample['input_ids'].to_list())
print(f"{new_input.shape=}")

# Move to the same device as the model
new_input = new_input.to(device)

# Get the model's prediction (logits for each masked position)
with torch.no_grad():  # No need to track gradients for this
    outputs = model(new_input)
    
embeddings = outputs.hidden_states[-1] 
embeddings.shape

# Loci embeddings

In [None]:
result = []

for i, (_, row) in enumerate(sample.iterrows()):
    n = min(row['order'], max_length)
    bins = row['raw_input'][:n] # handle raw inputs larger than the max length

    mat = embeddings[i, 0:n, :].cpu().detach().numpy()
    mat = pd.DataFrame(mat, index=bins)
    result.append(mat)

result = pd.concat(result)
result = result.reset_index(names='loci')

# average ovber the loci embeddings
result = result.groupby('loci').mean()

result.head()

In [None]:
# Create AnnData object
adata = an.AnnData(
    X=result,
)

sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=1)

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 12, 3

sns.barplot(x=adata.obs_names.astype(int),
            y=adata.obs['leiden'].astype(int) + 1,
            hue=adata.obs['leiden'].astype(int) + 1,
            palette='viridis',
            )

plt.xticks([])
adata

In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 5, 5

sns.scatterplot(
    x=adata.obsm['X_umap'][:, 0],
    y=adata.obsm['X_umap'][:, 1],
    hue=adata.obs.index.astype(int),
    style=adata.obs['leiden'],
    ec='k',
    palette='viridis',
)


In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 12, 3

sns.barplot(x=adata.obs_names.astype(int),
            y=adata.obsm['X_pca'][:, 0],
            )

plt.title("1st Eigenvector of Embedding Space")
plt.xticks([])
plt.show()

sns.barplot(x=adata.obs_names.astype(int),
            y=adata.obsm['X_pca'][:, 1],
            )
plt.title("2nd Eigenvector of Embedding Space")
plt.xticks([])
plt.show()

In [None]:
# break

# AB Compartments

In [None]:
A = matrix.clique_expand_incidence(df, zero_diag=False) 
A = A.sort_index(axis=1)
A = A.sort_index(axis=0)

A = matrix.normalize_oe(matrix.normalize_kr(A).todense())
A = np.asarray(A)

print(f"{A.shape=}")

pca = PCA(n_components=2)
pca.fit(A)
X_pca = pca.transform(A)

sns.barplot(x=range(len(X_pca)),
            y=X_pca[:, 0],
            color='C1')

plt.title("Clique-Expanded, OE Normed Eigenvector")

plt.xticks([])

In [None]:
component = 1
comp_vec = adata.obsm['X_pca'][:, component]

print(f"{X_pca[:, 0].shape=}")
print(f"{comp_vec.shape=}")

scipy.stats.pearsonr(X_pca[:, 0], comp_vec)

In [None]:
A = matrix.clique_expand_incidence(df, zero_diag=False) 
A = A.sort_index(axis=1)
A = A.sort_index(axis=0)

print(f"{A.shape=}")

plt.imshow(np.log1p(A))

In [None]:
break

# hyperedge embeddings

In [None]:
# geneformer-style embeddings for hyperedges
edge_embeddings = embeddings.mean(axis=1).cpu().detach().numpy()  
print(f"{edge_embeddings.shape=}")

obs_names = [f"Obs_{i}" for i in range(len(edge_embeddings))]
sample['obs_names'] = obs_names

# Create AnnData object
adata = an.AnnData(
    X=edge_embeddings,
    obs=sample.set_index('obs_names')
)

sc.tl.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=0.1)

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 5, 5

sc.pl.umap(
    adata,
    color=["order", "leiden"],
    ncols=1,
    size=15,
)

adata

In [None]:
adata.obs.explode('input_ids')['input_ids'].value_counts()

In [None]:
# look a each cluster:

for cluster, group in adata.obs.groupby('leiden'):
    print(f"{cluster=}")
    
    group = group.explode('input_ids')
    group = group[group['input_ids'] != 0]
    print(group['input_ids'].value_counts().head(5))
    
    

In [None]:
A = matrix.clique_expand_incidence(df, zero_diag=False) 
A = A.sort_index(axis=1)
A = A.sort_index(axis=0)

print(f"{A.shape=}")

plt.imshow(np.log1p(A))
# plt.axvline(x=86, c='r', alpha=0.2, lw=2)
# plt.axhline(y=86, c='r', alpha=0.2, lw=2)

In [None]:
# just each group

# look a each cluster:
for cluster, group in adata.obs.groupby('leiden'):
    I = ut.list_of_list_to_incidence(group['raw_input'].to_list())
    I = pd.DataFrame(I)
    I = ut.fill_missing_bins(I, df.index)
    print(f"{I.shape}")
    
    A = matrix.clique_expand_incidence(I, zero_diag=False) 
    plt.imshow(np.log1p(A))
    plt.title(f"{cluster=}")
    plt.show()