In [1]:
import pandas as pd
import biotite
from biotite.structure.residues import get_residues
from biotite.structure import get_chains
from biotite.sequence import ProteinSequence
import biotite.structure as struc
import biotite.structure.io as strucio
import glob,re

import os,sys,time,json,fnmatch,shutil,re,subprocess,multiprocessing
import numpy as np
import torch

import warnings
# Silence UserWarning-category warnings whose originating module name matches
# the pattern "biotite"; warnings of other categories or from other modules
# are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module="biotite")

from pymol import cmd
import copy

def single_chain_rmsd(template, temp_pdbfile,positin=None):
    """Superimpose ``temp_pdbfile`` onto ``template`` and return their RMSD.

    Parameters
    ----------
    template : str or structure object
        Reference structure. A ``str`` is treated as a file path: model 1 is
        loaded and reduced to atoms named "CA". Any other object is used
        exactly as given (no CA filtering is applied to it).
    temp_pdbfile : str or structure object
        Structure to compare against ``template``; handled the same way.
    positin : sequence of int or numpy.ndarray, optional
        Residue ids, matched against ``template.res_id``, that restrict the
        comparison. ``None`` or an empty collection means "use every atom".

    Returns
    -------
    The value returned by ``struc.rmsd`` for the template and the
    superimposed structure.

    Notes
    -----
    The residue mask is computed on ``template`` only and then applied to
    both structures, so both are assumed to contain the same atoms in the
    same order.
    """
    if isinstance(template, str):
        template = strucio.load_structure(template, model=1)
        template = template[template.atom_name == "CA"]
    if isinstance(temp_pdbfile, str):
        temp_pdbfile = strucio.load_structure(temp_pdbfile, model=1)
        temp_pdbfile = temp_pdbfile[temp_pdbfile.atom_name == "CA"]

    # A plain `if positin:` raises "truth value of an array is ambiguous" for
    # NumPy arrays with more than one element, so arrays are tested by size.
    # Every other input keeps its ordinary truthiness (None / [] -> no filter).
    if isinstance(positin, np.ndarray):
        restrict_to_positions = positin.size > 0
    else:
        restrict_to_positions = bool(positin)

    if restrict_to_positions:
        res_id_mask = np.isin(template.res_id, positin)
        template = template[res_id_mask]
        temp_pdbfile = temp_pdbfile[res_id_mask]

    superimposed, _ = struc.superimpose(template, temp_pdbfile)
    return struc.rmsd(template, superimposed)

def usalign_wrapper(template, temp_pdbfile, force_alignment=None,
                    usalign_bin="/home/sirius/Desktop/USalign"):
    """Run USalign on two structure files and return ``(rmsd, tm_score)``.

    NOTE: a second definition with the same name appears later in this cell;
    when the cell is executed, the later one is the binding callers see.

    Parameters
    ----------
    template, temp_pdbfile : str or path-like
        The two structure files, passed to USalign in this order. A leading
        ``~`` is expanded.
    force_alignment : str or path-like, optional
        When given, forwarded to USalign as ``-I <force_alignment>``.
    usalign_bin : str, optional
        Path of the USalign executable.

    Returns
    -------
    tuple of float
        ``(rmsd, tm_score)``, taken from the first ``RMSD=`` and the first
        ``TM-score=`` value in the program output.

    Raises
    ------
    ValueError
        If the output contains no parsable ``RMSD=`` / ``TM-score=`` value.
    FileNotFoundError
        If ``usalign_bin`` does not exist (raised by ``subprocess.run``).
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the flags are the same as in the old command line.
    command = [
        usalign_bin,
        os.path.expanduser(str(template)),
        os.path.expanduser(str(temp_pdbfile)),
    ]
    if force_alignment is not None:
        command += ["-I", os.path.expanduser(str(force_alignment))]
    command += ["-mm", "1", "-ter", "0"]

    completed = subprocess.run(command, stdout=subprocess.PIPE,
                               encoding="utf-8", errors="replace")
    report = completed.stdout or ""

    # Raw strings: '\s' and '\d' are invalid escapes in ordinary literals.
    rmsd_match = re.search(r"RMSD=\s*(\d+\.\d+)", report)
    score_match = re.search(r"TM-score=\s*(\d+\.\d+)", report)
    if rmsd_match is None or score_match is None:
        raise ValueError(
            f"Could not parse USalign output for {template!r} vs {temp_pdbfile!r} "
            f"(exit code {completed.returncode}): {report[-500:]!r}"
        )
    return float(rmsd_match.group(1)), float(score_match.group(1))

def tmalign_wrapper(template, temp_pdbfile, force_alignment=None,
                    tmalign_bin="/home/sirius/Desktop/TMalign"):
    """Run TMalign on two structure files and return ``(rmsd, tm_score)``.

    Parameters
    ----------
    template, temp_pdbfile : str or path-like
        The two structure files, passed to TMalign in this order. A leading
        ``~`` is expanded.
    force_alignment : str or path-like, optional
        When given, forwarded to TMalign as ``-I <force_alignment>``.
    tmalign_bin : str, optional
        Path of the TMalign executable.

    Returns
    -------
    tuple of float
        ``(rmsd, tm_score)``. Both are taken from the LAST occurrence of
        ``RMSD=`` / ``TM-score=`` in the output, as the previous
        split-based parsing did. Presumably the last TM-score line is the
        one normalised by the second structure; confirm against your
        TMalign version.

    Raises
    ------
    ValueError
        If the output contains no parsable ``RMSD=`` / ``TM-score=`` value.
    FileNotFoundError
        If ``tmalign_bin`` does not exist (raised by ``subprocess.run``).
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        tmalign_bin,
        os.path.expanduser(str(template)),
        os.path.expanduser(str(temp_pdbfile)),
    ]
    if force_alignment is not None:
        command += ["-I", os.path.expanduser(str(force_alignment))]

    completed = subprocess.run(command, stdout=subprocess.PIPE,
                               encoding="utf-8", errors="replace")
    report = completed.stdout or ""

    # Parse the decoded text directly instead of slicing the repr of a bytes
    # object (the old `str(output)[:-3]` trick).
    rmsd_values = re.findall(r"RMSD=\s*([0-9]*\.?[0-9]+)", report)
    score_values = re.findall(r"TM-score=\s*([0-9]*\.?[0-9]+)", report)
    if not rmsd_values or not score_values:
        raise ValueError(
            f"Could not parse TMalign output for {template!r} vs {temp_pdbfile!r} "
            f"(exit code {completed.returncode}): {report[-500:]!r}"
        )
    return float(rmsd_values[-1]), float(score_values[-1])

def extract_plddt(protein,chain_id=None):
    """Return the mean ``b_factor`` of the CA atoms in the selected chains.

    The function name suggests the B-factor column holds per-residue pLDDT
    (as is conventional for predicted models); that is an assumption about
    the input file, not something checked here.

    Parameters
    ----------
    protein : str, struc.AtomArrayStack or struc.AtomArray
        A ``str`` is treated as a file path and model 1 is loaded together
        with the ``b_factor`` field. For a stack, the first model is used.
        An ``AtomArray`` is used as given and must already carry a
        ``b_factor`` annotation.
    chain_id : str, list/tuple/ndarray of str, or None
        Chain(s) to include. ``None`` selects every chain.

    Returns
    -------
    The ``numpy.mean`` of the selected CA atoms' ``b_factor`` values
    (``nan`` if no atom survives the filters).

    Raises
    ------
    TypeError
        If ``protein`` is none of the supported types.
    ValueError
        If the structure has no chains or a requested chain is absent.
    """
    if isinstance(protein, str):
        # model=1 yields a single AtomArray; extra_fields loads the B-factor column.
        atom_array = strucio.load_structure(protein, model=1, extra_fields=["b_factor"])
    elif isinstance(protein, struc.AtomArrayStack):
        atom_array = protein[0]
    elif isinstance(protein, struc.AtomArray):
        atom_array = protein
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise TypeError(
            "protein must be a file path, AtomArrayStack or AtomArray, "
            f"got {type(protein).__name__}"
        )

    # Resolve which chains to keep.
    all_chains = get_chains(atom_array)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain_id is None:
        chain_ids = all_chains
    elif isinstance(chain_id, (list, tuple, np.ndarray)):
        chain_ids = list(chain_id)
    else:
        chain_ids = [chain_id]
    for chain in chain_ids:
        if chain not in all_chains:
            raise ValueError(f'Chain {chain} not found in input file')

    # Vectorised chain selection on the annotation array (no per-atom Python loop).
    atom_array = atom_array[np.isin(atom_array.chain_id, chain_ids)]

    # Keep atoms selected by struc.filter_amino_acids, then CA atoms only.
    atom_array = atom_array[struc.filter_amino_acids(atom_array)]
    atom_array = atom_array[atom_array.atom_name == "CA"]

    return np.mean(atom_array.b_factor)

# ***********************************************************************************************************

def chain_pair_pae(pae, chain_number):
    """Average the entries of ``pae`` that lie outside the per-chain diagonal blocks.

    The first axis of ``pae`` is divided into ``chain_number`` segments of
    equal length. Entries whose row and column fall in the same segment are
    given weight 0, all others weight 1, and the weighted mean is returned.

    Parameters
    ----------
    pae : numpy.ndarray
        Matrix to average; indexed as ``pae[row, col]``.
    chain_number : int
        Number of equally sized chains along the first axis.

    Returns
    -------
    Sum of the weighted entries divided by the sum of the weights.

    Raises
    ------
    AssertionError
        If ``pae.shape[0]`` is not divisible by ``chain_number``.
    """
    # pae.shape[0] must be evenly divisible by chain_number.
    assert pae.shape[0] % chain_number == 0
    block_size = pae.shape[0] // chain_number

    # Start with every pair counted, then zero out each chain's own block.
    inter_chain_weight = np.ones_like(pae)
    for chain_index in range(chain_number):
        own_block = slice(chain_index * block_size, (chain_index + 1) * block_size)
        inter_chain_weight[own_block, own_block] = 0.0

    weighted_total = np.sum(pae * inter_chain_weight)
    weight_total = np.sum(inter_chain_weight)
    return weighted_total / weight_total

def get_chain_number_from_pdb(pdb_file):
    """Return how many chains ``get_chains`` reports for model 1 of ``pdb_file``."""
    first_model = strucio.load_structure(pdb_file, model=1)
    return len(get_chains(first_model))




def pymol_align(folder_pdb,save_path):
    """Load every file of a folder into PyMOL, align them and save the session.

    Files are loaded in order of the temperature (``T_<float>``) and then
    the sample number (``sample_<int>_``) found in their names. Files whose
    name contains "seed" are loaded as the object ``native``; all others
    are named ``<temperature>_<sample>``. Every object is then aligned with
    the ``tmalign`` command onto the first object PyMOL lists, grid mode is
    enabled, solvent is removed, a styling script is run and the result is
    written to ``save_path``.

    Parameters
    ----------
    folder_pdb : str
        Directory whose entries are all loaded.
    save_path : str
        Output path handed to ``cmd.save``.

    Raises
    ------
    ValueError
        If a file without "seed" in its name lacks the ``T_<float>`` or
        ``sample_<int>_`` pattern needed to build its object name.
    """
    # only for ProteinMPNN lmpnn and NeMO_v3, not suitable for NeMO_v2
    # One pattern pair for both sorting and naming. The sort key used to match
    # `T_(\d\.\d)`, which truncated e.g. "T_0.15" to 0.1 and crashed on "T_10.5".
    temperature_re = re.compile(r"T_(\d+\.\d+)")
    sample_re = re.compile(r"sample_(\d+)_")

    def _sort_key(file_name):
        temperature_match = temperature_re.search(file_name)
        sample_match = sample_re.search(file_name)
        return (
            # Primary key: the number after "T_"; names without it sort last.
            float(temperature_match.group(1)) if temperature_match else float("inf"),
            # Secondary key: the number after "sample_"; names without it sort first.
            int(sample_match.group(1)) if sample_match else -1,
        )

    cmd.reinitialize()
    cmd.do("run /home/sirius/Desktop/tmalign.py")

    for file in sorted(os.listdir(folder_pdb), key=_sort_key):
        if "seed" in file:
            name = "native"
        else:
            # e.g. T_0.1__sample_67__score_0.4917__global_score_0.4917__seq_recovery_0.4400
            sample_match = sample_re.search(file)
            temperature_match = temperature_re.search(file)
            if sample_match is None or temperature_match is None:
                raise ValueError(
                    f"Cannot derive an object name from {file!r}: expected "
                    "'T_<float>' and 'sample_<int>_' in the file name"
                )
            name = f"{temperature_match.group(1)}_{sample_match.group(1)}"
        cmd.load(os.path.join(folder_pdb, file), name)

    # Query the object list once; the first object is the alignment reference.
    loaded_objects = cmd.get_object_list() or []
    if loaded_objects:
        reference = loaded_objects[0]
        for obj in loaded_objects:
            cmd.do(f"tmalign {obj}, {reference}")
    cmd.set('grid_mode', 1)
    cmd.zoom()
    cmd.remove('solvent')
    cmd.do("@/home/sirius/Desktop/xcf.pml")
    cmd.save(save_path)
# ***********************************************************************************************************




def extract_seq(protein, chain_id=None):
    """Return the one-letter amino-acid sequence of the selected chains.

    Parameters
    ----------
    protein : str, struc.AtomArrayStack or struc.AtomArray
        A ``str`` is treated as a file path and model 1 is loaded. For a
        stack, the first model is used. An ``AtomArray`` is used as given.
    chain_id : str, list/tuple/ndarray of str, or None
        Chain(s) to include. ``None`` selects every chain.

    Returns
    -------
    str
        One letter per residue reported by ``get_residues`` after filtering.
        When several chains are selected their residues are concatenated
        in atom order with no separator.

    Raises
    ------
    TypeError
        If ``protein`` is none of the supported types.
    ValueError
        If the structure has no chains or a requested chain is absent.

    Notes
    -----
    Residues are selected with ``struc.filter_amino_acids``; a canonical-only
    filter was tried earlier and left disabled. Whatever error
    ``ProteinSequence.convert_letter_3to1`` raises for a residue name it
    cannot convert is propagated unchanged.
    """
    if isinstance(protein, str):
        atom_array = strucio.load_structure(protein, model=1)
    elif isinstance(protein, struc.AtomArrayStack):
        atom_array = protein[0]
    elif isinstance(protein, struc.AtomArray):
        atom_array = protein
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise TypeError(
            "protein must be a file path, AtomArrayStack or AtomArray, "
            f"got {type(protein).__name__}"
        )

    # Resolve which chains to keep.
    all_chains = get_chains(atom_array)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain_id is None:
        chain_ids = all_chains
    elif isinstance(chain_id, (list, tuple, np.ndarray)):
        chain_ids = list(chain_id)
    else:
        chain_ids = [chain_id]
    for chain in chain_ids:
        if chain not in all_chains:
            raise ValueError(f'Chain {chain} not found in input file')

    # Vectorised chain selection on the annotation array (no per-atom Python loop).
    atom_array = atom_array[np.isin(atom_array.chain_id, chain_ids)]

    # Keep only atoms selected by struc.filter_amino_acids.
    atom_array = atom_array[struc.filter_amino_acids(atom_array)]

    # get_residues returns (ids, names); the names are three-letter codes.
    residue_names = get_residues(atom_array)[1]
    return ''.join(ProteinSequence.convert_letter_3to1(name) for name in residue_names)

def usalign_wrapper(template, temp_pdbfile, force_alignment=None,
                    usalign_bin="/home/sirius/Desktop/USalign"):
    """Run USalign on two structure files and return ``(rmsd, tm_score)``.

    NOTE: this redefines the ``usalign_wrapper`` that appears earlier in the
    same cell; after the cell runs, this definition is the one in effect.

    Parameters
    ----------
    template, temp_pdbfile : str or path-like
        The two structure files, passed to USalign in this order. A leading
        ``~`` is expanded.
    force_alignment : str or path-like, optional
        When given, forwarded to USalign as ``-I <force_alignment>``.
    usalign_bin : str, optional
        Path of the USalign executable.

    Returns
    -------
    tuple of float
        ``(rmsd, tm_score)``, taken from the first ``RMSD=`` and the first
        ``TM-score=`` value in the program output.

    Raises
    ------
    ValueError
        If the output contains no parsable ``RMSD=`` / ``TM-score=`` value.
    FileNotFoundError
        If ``usalign_bin`` does not exist (raised by ``subprocess.run``).
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the flags are the same as in the old command line.
    command = [
        usalign_bin,
        os.path.expanduser(str(template)),
        os.path.expanduser(str(temp_pdbfile)),
    ]
    if force_alignment is not None:
        command += ["-I", os.path.expanduser(str(force_alignment))]
    command += ["-mm", "1", "-ter", "0"]

    completed = subprocess.run(command, stdout=subprocess.PIPE,
                               encoding="utf-8", errors="replace")
    report = completed.stdout or ""

    # Raw strings: '\s' and '\d' are invalid escapes in ordinary literals.
    rmsd_match = re.search(r"RMSD=\s*(\d+\.\d+)", report)
    score_match = re.search(r"TM-score=\s*(\d+\.\d+)", report)
    if rmsd_match is None or score_match is None:
        raise ValueError(
            f"Could not parse USalign output for {template!r} vs {temp_pdbfile!r} "
            f"(exit code {completed.returncode}): {report[-500:]!r}"
        )
    return float(rmsd_match.group(1)), float(score_match.group(1))

def mmalign_wrapper(template, temp_pdbfile, force_alignment=None,
                    mmalign_bin="/home/sirius/Desktop/MMalign"):
    """Run MMalign on two structure files and parse its summary.

    Parameters
    ----------
    template, temp_pdbfile : str or path-like
        The two structure files, passed to MMalign in this order. A leading
        ``~`` is expanded.
    force_alignment : str or path-like, optional
        When given, forwarded to MMalign as ``-I <force_alignment>``.
    mmalign_bin : str, optional
        Path of the MMalign executable.

    Returns
    -------
    tuple
        ``(rmsd, tm_score, aligned_length, seq_id)``. ``rmsd`` and
        ``tm_score`` come from the LAST ``RMSD=`` / ``TM-score=`` occurrence
        in the output, as the previous split-based parsing did.
        ``aligned_length`` (int) and ``seq_id`` (float) come from the first
        line carrying both ``Aligned length=`` and
        ``Seq_ID=n_identical/n_aligned=``.

    Raises
    ------
    ValueError
        If any of the four values cannot be found in the output.
    FileNotFoundError
        If ``mmalign_bin`` does not exist (raised by ``subprocess.run``).
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        mmalign_bin,
        os.path.expanduser(str(template)),
        os.path.expanduser(str(temp_pdbfile)),
    ]
    if force_alignment is not None:
        command += ["-I", os.path.expanduser(str(force_alignment))]

    completed = subprocess.run(command, stdout=subprocess.PIPE,
                               encoding="utf-8", errors="replace")
    report = completed.stdout or ""

    # The old code sliced `[:-3]` off the already-decoded text, silently
    # dropping three real characters; regexes need no such trimming.
    number = r"([0-9]*\.?[0-9]+)"
    rmsd_values = re.findall(r"RMSD=\s*" + number, report)
    score_values = re.findall(r"TM-score=\s*" + number, report)
    # '.' does not cross newlines, so both fields must sit on one line.
    summary = re.search(
        r"Aligned length=\s*(\d+)\s*,.*?Seq_ID=n_identical/n_aligned=\s*" + number,
        report,
    )
    if not rmsd_values or not score_values or summary is None:
        raise ValueError(
            f"Could not parse MMalign output for {template!r} vs {temp_pdbfile!r} "
            f"(exit code {completed.returncode}): {report[-500:]!r}"
        )

    tm_rmsd = float(rmsd_values[-1])
    tm_score = float(score_values[-1])
    aligned_length = int(summary.group(1))
    seq_id = float(summary.group(2))
    return tm_rmsd, tm_score, aligned_length, seq_id

_MPNN_HEADER_FIELD = re.compile(r'(T|sample|score|global_score|seq_recovery)=([\d\.]+)')
_MPNN_REQUIRED_FIELDS = ("T", "sample", "score", "global_score", "seq_recovery")


def _read_fasta_records(path):
    """Return ``(header, sequence)`` pairs of a FASTA file, joining wrapped sequence lines."""
    records = []
    with open(path, "r") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line.startswith(">"):
                records.append([line[1:], []])
            elif line and records:
                # Blank lines and text before the first header are ignored.
                records[-1][1].append(line)
    return [(header, "".join(chunks)) for header, chunks in records]


def _parse_mpnn_header(header):
    """Return ``[T, sample, score, global_score, seq_recovery]`` parsed from one header line."""
    fields = dict(_MPNN_HEADER_FIELD.findall(header))
    missing = [key for key in _MPNN_REQUIRED_FIELDS if key not in fields]
    if missing:
        raise ValueError(f"FASTA header lacks field(s) {missing}: {header!r}")
    return [
        float(fields["T"]),
        int(float(fields["sample"])),
        float(fields["score"]),
        float(fields["global_score"]),
        float(fields["seq_recovery"]),
    ]


def load_mpnn_seq(fasta=None,folder=None):
    """
    Load ProteinMPNN / NeMO style FASTA output into a DataFrame.

    Exactly one of the two arguments must be given:
    1. ``fasta``  - path of a single FASTA file;
    2. ``folder`` - directory; every entry ending in ``.fa`` or ``.fasta`` is
       read, in sorted file-name order.

    In each file the first record is skipped (it is taken to be the native
    sequence). Headers are expected to look like
    ``T=0.1, sample=1, score=0.6639, global_score=0.8922, seq_recovery=0.2500``.
    A sequence that has already been seen (in the same or an earlier file)
    is dropped, and one warning is printed for each file in which that
    happened.

    Returns a pandas DataFrame with columns:
    fasta_name, T, sample, score, global_score, seq_recovery, seq

    Raises AssertionError if both or neither argument is given, and
    ValueError if a header lacks one of the expected fields.
    """
    assert (fasta is not None) != (folder is not None), "only one input is allowed!"

    # Build the list of (fasta_name, path) pairs to read.
    if fasta is not None:
        sources = [(os.path.splitext(os.path.basename(fasta))[0], fasta)]
    else:
        sources = []
        # Sorted so that row order and "first occurrence wins" are reproducible.
        for file_name in sorted(os.listdir(folder)):
            for suffix in (".fa", ".fasta"):
                if file_name.endswith(suffix):
                    # Strip the suffix only; str.replace removed every ".fa" in the name.
                    sources.append((file_name[:-len(suffix)], os.path.join(folder, file_name)))
                    break

    rows = []
    seen_sequences = set()
    for fasta_name, path in sources:
        has_repeat = False
        # [1:] drops the first record, which is the native sequence.
        for header, sequence in _read_fasta_records(path)[1:]:
            if sequence in seen_sequences:
                has_repeat = True
                continue
            seen_sequences.add(sequence)
            rows.append([fasta_name, *_parse_mpnn_header(header), sequence])
        if has_repeat:
            print(f"Warning: {fasta_name} has repeat label!")

    return pd.DataFrame(
        rows,
        columns=["fasta_name","T","sample","score","global_score","seq_recovery","seq"],
    )

Invalid license file syntax.
Feature:       PYMOL_MAIN
License path:  /home/sirius/.pymol/license.lic:
FlexNet Licensing error:-2,413


In [7]:
import py2Dmol

# Create a py2Dmol viewer object.
viewer = py2Dmol.view()

# Add the structure file and display the viewer. The path is relative, so it is
# resolved against the kernel's current working directory.
viewer.add_pdb("bioinformatics/7z09_relaxed.pdb")
viewer.show()
