In [None]:

import os
# Project root of the CCTAD checkout (note: this name shadows the builtin `dir`).
dir = '/home/houshiyuan/frp/CCTAD'
# Dataset identifier and Hi-C bin size in kb; both are embedded in every path below.
Dataset = 'HIC056'
resolution = 50
# Per-dataset / per-resolution working directories.
output_dir = f'{dir}/output/{Dataset}/{resolution}kb'
models_dir = f'{dir}/config/models/{Dataset}/{resolution}kb'
best_seed_dir = f'{dir}/config/best_seed'
final_output = f'{dir}/final_output/{Dataset}/{resolution}kb'
# Input data locations.
path = '/mnt/sdi/frp/data/marks'
hic_matrix_dir = '/mnt/sdi/frp/data/hic_matrix'
# Method tag used in output file names.
method = 'ConvCluster'


In [None]:

# Create every directory the pipeline writes into; existing directories are left as they are.
for _required_dir in (
    output_dir,
    models_dir,
    final_output,
    best_seed_dir,
    os.path.join(final_output, "TAD"),
    os.path.join(final_output, "TAD_res"),
):
    os.makedirs(_required_dir, exist_ok=True)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.sparse import lil_matrix
import sys
import os
import random

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every random number generator below.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every generator receives the same seed value.
    for apply_seed in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Preferred GPU is index 3. On a host with fewer visible GPUs, "cuda:3" would only
# fail later (invalid device ordinal) at the first `.to(device)`, so fall back to
# GPU 0 in that case, and to the CPU when CUDA is unavailable.
_preferred_gpu_index = 3
if torch.cuda.is_available():
    _gpu_index = _preferred_gpu_index if torch.cuda.device_count() > _preferred_gpu_index else 0
    device = torch.device(f"cuda:{_gpu_index}")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

class ConvAutoencoder1D(nn.Module):
    """1-D convolutional autoencoder over single-channel sequences of length ``input_dim``.

    The encoder maps an input of shape (batch, 1, input_dim) to an embedding of
    shape (batch, embedding_dim); the decoder maps the embedding back to
    (batch, 1, input_dim).

    Args:
        input_dim: Length of each input sequence.
        embedding_dim: Size of the bottleneck embedding.
    """

    def __init__(self, input_dim=100, embedding_dim=16):
        super().__init__()
        # Four channels of length input_dim are flattened before the bottleneck.
        flat_dim = input_dim * 4

        # Layers are created in the same order as they appear in the Sequential
        # containers, which keeps parameter initialisation order and state_dict
        # keys (encoder.0, encoder.2, ...) stable.
        encoder_layers = [
            nn.Conv1d(1, 8, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv1d(8, 4, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(flat_dim, embedding_dim),
        ]
        decoder_layers = [
            nn.Linear(embedding_dim, flat_dim),
            nn.Unflatten(1, (4, input_dim)),
            nn.ConvTranspose1d(4, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(8, 1, kernel_size=5, padding=2),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, embedding)`` for the input batch ``x``."""
        embedding = self.encoder(x)
        return self.decoder(embedding), embedding

def create_connectivity(n):
    """Build an n x n sparse (LIL) adjacency matrix linking each index to its direct neighbours.

    Entries (i, i+1) and (i+1, i) are set to 1 for every i in [0, n-2]; all
    other entries stay 0.
    """
    adjacency = lil_matrix((n, n))
    for left, right in zip(range(n - 1), range(1, n)):
        adjacency[left, right] = 1
        adjacency[right, left] = 1
    return adjacency

def run_clustering(features, distance_threshold=1.5):
    """Cluster rows of ``features`` with average-linkage agglomerative clustering.

    The number of clusters is not fixed; merging stops at ``distance_threshold``.
    The connectivity matrix from ``create_connectivity`` only links consecutive
    rows, so merges are restricted to neighbouring rows.

    Returns:
        The cluster label assigned to each row of ``features``.
    """
    n_rows = len(features)
    clusterer = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage='average',
        connectivity=create_connectivity(n_rows),
    )
    clusterer.fit(features)
    return clusterer.labels_

def train_model(hic_matrix, chrom,input_dim=100, embedding_dim=16, num_epochs=500, lr=1e-4,load_model=False):
    """Train (or load) the autoencoder for one chromosome and return its row embeddings.

    The matrix is standardised column-wise with ``StandardScaler`` and every row
    is fed to ``ConvAutoencoder1D`` as a single-channel sequence.

    Args:
        hic_matrix: 2-D contact matrix; ``input_dim`` must equal its number of columns.
        chrom: Chromosome name, used to locate ``best_model_{chrom}.pt`` in ``models_dir``.
        input_dim: Sequence length expected by the autoencoder.
        embedding_dim: Size of the embedding produced per row.
        num_epochs: Number of full-batch Adam steps when training.
        lr: Adam learning rate.
        load_model: If True, load saved weights instead of training.

    Returns:
        ``(model, z)`` where ``z`` is the encoder output for every row, computed
        without gradients from the weights of the returned model.

    Raises:
        FileNotFoundError: ``load_model`` is True but no checkpoint exists.
    """
    features = StandardScaler().fit_transform(hic_matrix)
    features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(1).to(device)
    model = ConvAutoencoder1D(input_dim=input_dim, embedding_dim=embedding_dim).to(device)

    if load_model:
        best_model_path = models_dir+f'/best_model_{chrom}.pt'
        if not os.path.exists(best_model_path):
            # Previously this only printed a message and carried on with randomly
            # initialised weights, silently producing meaningless embeddings.
            raise FileNotFoundError(f"未找到模型文件，无法加载。({best_model_path})")
        print("加载已有模型参数：", best_model_path)
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        model.train()
        for _ in range(num_epochs):
            optimizer.zero_grad()
            recon, _ = model(features_tensor)
            recon_loss = F.mse_loss(recon.squeeze(1), features_tensor.squeeze(1))
            recon_loss.backward()
            optimizer.step()

    # Compute the embedding from the final weights in both modes. The training
    # path used to return the embedding of the last forward pass, i.e. from the
    # weights *before* the final optimizer step, so it did not match the returned
    # (and later saved) model and was undefined for num_epochs == 0.
    model.eval()
    with torch.no_grad():
        _, z = model(features_tensor)
    return model, z

    



Using device: cuda:3


In [None]:
def identify_tad_boundaries(cluster_labels):
    """Return every index ``i >= 1`` whose label differs from the label at ``i - 1``."""
    return [
        idx
        for idx in range(1, len(cluster_labels))
        if cluster_labels[idx] != cluster_labels[idx - 1]
    ]

def create_boundary_labels_from_clusters(cluster_labels, hic_len, save_path):
    """Write a 0/1 boundary vector of length ``hic_len`` and return the boundary indices.

    An index ``i >= 1`` is a boundary when its cluster label differs from the
    label at ``i - 1``. The vector is saved to ``save_path``, one integer per line.
    """
    boundary_bins = [
        idx
        for idx in range(1, len(cluster_labels))
        if cluster_labels[idx] != cluster_labels[idx - 1]
    ]

    labels = np.zeros(hic_len, dtype=int)
    for idx in boundary_bins:
        labels[idx] = 1

    np.savetxt(save_path, labels, fmt='%d')
    return boundary_bins

def merge_consecutive_small_tads(input_file, output_file, min_bin_size=3):
    """Fuse runs of adjacent short intervals into single intervals.

    Each non-blank line of ``input_file`` holds ``start end``. An interval is
    short when ``end - start < min_bin_size``. A short interval absorbs the
    following intervals for as long as they are also short and start exactly
    where the run currently ends. All other intervals are copied unchanged.
    The input is read completely before ``output_file`` is written.
    """
    with open(input_file, 'r') as src:
        intervals = [[int(tok) for tok in row.strip().split()] for row in src if row.strip()]

    def is_short(interval):
        return (interval[1] - interval[0]) < min_bin_size

    result = []
    total = len(intervals)
    pos = 0
    while pos < total:
        run_start, run_end = intervals[pos]
        pos += 1
        if (run_end - run_start) < min_bin_size:
            # Extend the run across directly adjacent short intervals.
            while pos < total and intervals[pos][0] == run_end and is_short(intervals[pos]):
                run_end = intervals[pos][1]
                pos += 1
        result.append((run_start, run_end))

    with open(output_file, 'w') as dst:
        for lo, hi in result:
            dst.write(f"{lo} {hi}\n")

def process_tad_file(input_path, output_path, min_size=2):
    """Absorb intervals shorter than ``min_size`` into an adjacent neighbour.

    Lines of ``input_path`` that do not contain exactly two fields are ignored.
    A short interval (``end - start < min_size``) is merged into the previously
    kept interval when that one ends at its start; otherwise it is merged into
    the next interval when that one begins at its end. A short interval with
    neither neighbour is not written to ``output_path``.
    """
    def _load(file_path):
        with open(file_path, 'r') as handle:
            fields = (raw.strip().split() for raw in handle)
            return [[int(pair[0]), int(pair[1])] for pair in fields if len(pair) == 2]

    def _absorb_short(intervals, threshold):
        kept = []
        count = len(intervals)
        for idx, (lo, hi) in enumerate(intervals):
            if hi - lo >= threshold:
                kept.append([lo, hi])
                continue
            if kept and kept[-1][1] == lo:
                # Extend the previously kept interval forwards.
                kept[-1][1] = hi
            elif idx + 1 < count and intervals[idx + 1][0] == hi:
                # Pull the start of the next interval backwards; it is
                # evaluated with its new start on the following iteration.
                intervals[idx + 1][0] = lo
        return kept

    def _dump(intervals, file_path):
        with open(file_path, 'w') as handle:
            for lo, hi in intervals:
                handle.write(f"{lo} {hi}\n")

    _dump(_absorb_short(_load(input_path), min_size), output_path)
    
def convert_boundary_file_to_tads(input_file, output_file, start=1):
    """Turn a list of boundary positions into consecutive ``left right`` intervals.

    Only lines of ``input_file`` consisting solely of digits are used. The first
    interval runs from ``start`` to the first boundary, and each later interval
    from the previous boundary to the next one. No interval is emitted after the
    last boundary.
    """
    with open(input_file, 'r') as src:
        stripped = [raw.strip() for raw in src]
    boundaries = [int(tok) for tok in stripped if tok.isdigit()]

    # Left edge of each interval: `start`, then every boundary except the last.
    left_edges = [start] + boundaries[:-1]
    with open(output_file, 'w') as dst:
        for left, right in zip(left_edges, boundaries):
            dst.write(f"{left} {right}\n")
            
def convert_tad_resolution(input_file, output_file, resolution=25000):
    """Multiply every integer in ``input_file`` by ``resolution`` and save tab-separated.

    A file with a single row is loaded as a 1-D array and is reshaped to one
    row so that the output keeps the values on a single line.
    """
    tads = np.loadtxt(input_file, dtype=int)
    tads_2d = tads[np.newaxis, ...] if tads.ndim == 1 else tads
    np.savetxt(output_file, tads_2d * resolution, fmt="%d", delimiter="\t")
 
def mark_label_transitions(labels):
    """Flag both positions on either side of every change between consecutive labels.

    Returns an array shaped and typed like ``np.array(labels)`` holding 1 at
    each index whose label differs from its left or right neighbour, else 0.
    """
    labels = np.array(labels)
    transitions = np.zeros_like(labels)
    for left in range(len(labels) - 1):
        right = left + 1
        if labels[left] != labels[right]:
            transitions[left] = 1
            transitions[right] = 1
    return transitions
def mark_transitions_from_file(input_file, output_file):
    """Read integer labels (one per line) and save their 0/1 transition mask.

    Only lines consisting solely of digits are used. The mask comes from
    ``mark_label_transitions`` and is written one integer per line.
    """
    with open(input_file, 'r') as src:
        stripped = [raw.strip() for raw in src]
    labels = [int(tok) for tok in stripped if tok.isdigit()]
    np.savetxt(output_file, mark_label_transitions(labels), fmt='%d')
 





In [None]:

import time
def tad(hic_matrix,chrom,seed,load_model,threshold):
    """Run the TAD-calling pipeline for one chromosome and write its result files.

    Steps: optionally seed the RNGs, obtain row embeddings from ``train_model``,
    cluster them with ``run_clustering``, then write (under ``output_dir``) the
    cluster labels, the boundary indices, the TAD intervals in bins (after both
    small-TAD merge passes with a minimum of 3 bins), the TAD intervals scaled
    by ``resolution*1000``, and the 0/1 label-transition file.

    Args:
        hic_matrix: 2-D contact matrix; its column count is used as ``input_dim``.
        chrom: Chromosome name used in output file names.
        seed: RNG seed, or None to leave the RNGs untouched.
        load_model: Passed to ``train_model``; True loads saved weights.
        threshold: Distance threshold for the agglomerative clustering.

    Returns:
        The model returned by ``train_model``. When every cluster label is 0
        (no boundary found) the model is still returned, but no output files
        are written for this chromosome.
    """
    if seed is not None:
        seed_everything(seed)
    trained_model, z = train_model(hic_matrix, chrom,input_dim=hic_matrix.shape[1],num_epochs=500,load_model=load_model)
    z_np = z.detach().cpu().numpy()
    cluster_labels = run_clustering(z_np, distance_threshold=threshold)

    if all(x == 0 for x in cluster_labels):
        # This path used to return the tuple (0, 0, 0, None) while the normal path
        # returns the model, so callers doing `trained_model.state_dict()` crashed
        # with AttributeError. Return the same type on both paths.
        print(f"Warning: {chrom}: all bins fall into a single cluster at threshold {threshold}; no TAD files written.")
        return trained_model

    # Per-bin cluster labels.
    cluster_labels_path=os.path.join(output_dir, "cluster_labels", f"{chrom}_cluster_labels.txt")
    os.makedirs(os.path.dirname(cluster_labels_path), exist_ok=True)
    np.savetxt(cluster_labels_path, cluster_labels, fmt="%d")

    # Indices where the cluster label changes.
    boundary=identify_tad_boundaries(cluster_labels)
    boundary_file=os.path.join(output_dir, "boundary", f"boundary_{chrom}.txt")
    os.makedirs(os.path.dirname(boundary_file), exist_ok=True)
    np.savetxt(boundary_file, boundary, fmt="%d")

    # TAD intervals in bin units, then two in-place merge passes for small TADs.
    tad_path=os.path.join(output_dir, "TAD", f"{Dataset}_{method}_{resolution}k_KR.{chrom}")
    os.makedirs(os.path.dirname(tad_path), exist_ok=True)
    convert_boundary_file_to_tads(boundary_file, tad_path, start=1)

    merge_consecutive_small_tads(tad_path,tad_path,min_bin_size=3)
    process_tad_file(tad_path,tad_path,min_size=3)

    # Same intervals scaled from bins by resolution (kb) * 1000.
    res_tad_path=os.path.join(output_dir, "TAD_res", f"{Dataset}_{method}_{resolution}k_KR.{chrom}")
    os.makedirs(os.path.dirname(res_tad_path), exist_ok=True)
    convert_tad_resolution(tad_path, res_tad_path, resolution=resolution*1000)

    # 0/1 mask marking bins on either side of each label change.
    os.makedirs(os.path.join(output_dir, "labels"), exist_ok=True)
    mark_transitions_from_file(cluster_labels_path, output_dir+f"/labels/01_{chrom}_cluster_labels.txt")

    return trained_model




In [None]:
# Chromosome names to process; range(21, 22) selects chr21 only.
chromosomes = ["chr" + str(number) for number in range(21, 22)]

In [None]:


import shutil
def _copy_to_final_output(chrom):
    """Copy this chromosome's TAD and TAD_res files from output_dir to final_output."""
    for subdir in ('TAD', 'TAD_res'):
        shutil.copy(output_dir+f'/{subdir}/{Dataset}_{method}_{resolution}k_KR.{chrom}', final_output+f'/{subdir}/{Dataset}_{method}_{resolution}k_KR.{chrom}')


for chrom in chromosomes:
    print(f"正在处理{Dataset}_{chrom},{resolution}kb")
    # Load the contact matrix for this chromosome.
    hic_matrix = np.loadtxt(f"{hic_matrix_dir}/{Dataset}/{resolution}kb/{Dataset}_{resolution}k_KR.{chrom}")

    # Run mode:
    #   k == 1      train with a fresh time-based seed
    #   k == 2      retrain once from a recorded seed to reproduce a result
    #   otherwise   load the saved best model without training
    k = 2
    if k == 1:
        # Train from scratch.
        load_model = False
        # Clustering threshold to use.
        best_threshold = 1.2
        seed = int(time.time())
        trained_model = tad(hic_matrix, chrom, seed, load_model, best_threshold)
        #print(seed)
        torch.save(trained_model.state_dict(), models_dir+f"/model_{chrom}.pt")
        _copy_to_final_output(chrom)
        # with open(os.path.join(best_seed_dir, f"{Dataset}_{resolution}kb_best_seed.txt"), "a") as f:
        #     f.write(f"{chrom}:{seed}\n")
    elif k == 2:
        # Retrain once using the recorded best seed.
        load_model = False
        seed = 1755498234    # seed of the best model for this chromosome
        best_threshold = 1.2  # best clustering threshold
        trained_model = tad(hic_matrix, chrom, seed, load_model, best_threshold)

        # p == 1 saves this run's model and publishes its results; any other value skips both.
        p = 0
        if p == 1:
            best_model_file = models_dir+f"/best_model_{chrom}.pt"
            torch.save(trained_model.state_dict(), best_model_file)
            print("模型已经保存到：", best_model_file)
            _copy_to_final_output(chrom)
    else:
        # Use the previously saved best model directly.
        load_model = True
        seed = None
        best_threshold = 1.2    # best clustering threshold for this chromosome
        tad(hic_matrix, chrom, seed, load_model, best_threshold)
        _copy_to_final_output(chrom)

    
     

正在处理HIC056_chr21,50kb
模型已经保存到： /home/houshiyuan/frp/CCTAD/config/models/HIC056/50kb/best_model_chr21.pt
