In [1]:
import os
import biotite
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.sequence as seq

In [2]:
import json
import math
import numpy as np
import biotite.structure
from biotite.structure.io import pdbx, pdb
from biotite.structure.residues import get_residues
from biotite.structure import filter_backbone
from biotite.structure import get_chains
from biotite.sequence import ProteinSequence

from typing import Sequence, Tuple, List

def load_structure(fpath, chain=None):
    """
    Load the backbone atoms of model 1 from a PDB or mmCIF file.

    Args:
        fpath: filepath to either pdb or cif file
        chain: the chain id or list of chain ids to load; None keeps all chains
    Returns:
        biotite.structure.AtomArray containing only the atoms selected by
        biotite's filter_backbone that belong to the requested chain(s)
    Raises:
        ValueError: if fpath ends with neither 'cif' nor 'pdb', if no chain is
            left after backbone filtering, or if a requested chain is absent
    """
    if fpath.endswith('cif'):
        with open(fpath) as fin:
            pdbxf = pdbx.PDBxFile.read(fin)
        structure = pdbx.get_structure(pdbxf, model=1)
    elif fpath.endswith('pdb'):
        with open(fpath) as fin:
            pdbf = pdb.PDBFile.read(fin)
        structure = pdb.get_structure(pdbf, model=1)
    else:
        # Without this branch `structure` stayed unbound and the function
        # crashed further down with an unrelated UnboundLocalError.
        raise ValueError(
            f'Unsupported file format: {fpath} (expected a cif or pdb file)'
        )
    structure = structure[filter_backbone(structure)]
    all_chains = get_chains(structure)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain is None:
        chain_ids = all_chains
    elif isinstance(chain, list):
        chain_ids = chain
    else:
        chain_ids = [chain]
    # Separate loop name so the `chain` argument is not silently rebound.
    for requested in chain_ids:
        if requested not in all_chains:
            raise ValueError(f'Chain {requested} not found in input file')
    # Vectorised membership test on the chain_id annotation; avoids building
    # one Atom object per atom just to read its chain id.
    return structure[np.isin(structure.chain_id, chain_ids)]


def extract_coords_from_structure(structure: biotite.structure.AtomArray):
    """
    Pull the backbone coordinates and the one-letter sequence out of a structure.

    Args:
        structure: An instance of biotite AtomArray
    Returns:
        Tuple (coords, seq)
            - coords is an L x 3 x 3 array for N, CA, C coordinates
            - seq is the extracted sequence
    """
    # Coordinates first, then the sequence, matching the original call order.
    backbone_xyz = get_atom_coords_residuewise(["N", "CA", "C"], structure)
    _, three_letter_codes = get_residues(structure)
    one_letter = ''.join(
        map(ProteinSequence.convert_letter_3to1, three_letter_codes)
    )
    return backbone_xyz, one_letter


def load_coords(fpath, chain):
    """
    Read a structure file and return its backbone coordinates and sequence.

    `load_structure` is looked up when this function is called, so whichever
    definition is current in the notebook namespace is the one used.

    Args:
        fpath: filepath to either pdb or cif file
        chain: the chain id
    Returns:
        Tuple (coords, seq)
            - coords is an L x 3 x 3 array for N, CA, C coordinates
            - seq is the extracted sequence
    """
    return extract_coords_from_structure(load_structure(fpath, chain))

def get_atom_coords_residuewise(atoms: List[str], struct: biotite.structure.AtomArray):
    """
    Gather, for every residue, the coordinates of the requested atom names.

    Example for atoms argument: ["N", "CA", "C"]

    The per-residue callback returns one coordinate row per requested name, in
    the order given by `atoms`. A name that is absent from a residue yields a
    row of NaN; a name that occurs more than once raises RuntimeError.
    """
    def pick_named_atoms(s, axis=None):
        # matches[i, j] is True when atom i of this residue is named atoms[j].
        matches = np.stack([s.atom_name == wanted for wanted in atoms], axis=1)
        hits_per_name = matches.sum(0)
        if np.any(hits_per_name > 1):
            raise RuntimeError("structure has multiple atoms with same name")
        # argmax returns 0 for names with no hit; those rows are blanked below.
        first_hit = matches.argmax(0)
        picked = s[first_hit].coord
        picked[hits_per_name == 0] = float("nan")
        return picked

    return biotite.structure.apply_residue_wise(struct, struct, pick_named_atoms)

In [None]:
# Scratch cell: a bare reference to biotite's generic loader, so the notebook
# only displays the function object (nothing is loaded here).
strucio.load_structure

In [3]:
import biotite.structure.io as strucio
import biotite.structure as struc
import numpy as np
from biotite.structure.residues import get_residues
from biotite.structure import get_chains
from biotite.sequence import ProteinSequence

def load_structure(fpath,chain=None):
    """
    Load the amino-acid atoms of model 1 from a structure file.

    Once this cell runs, this definition replaces the earlier `load_structure`
    in the notebook namespace; unlike that one it keeps every atom accepted by
    `struc.filter_amino_acids`, not just the backbone.

    Args:
        fpath: path passed to biotite's strucio.load_structure
        chain: chain id or list of chain ids to keep; None keeps every chain
    Returns:
        The loaded structure restricted to amino-acid atoms of the requested
        chain(s)
    Raises:
        ValueError: if no chain is left after amino-acid filtering, or if a
            requested chain is absent
    """
    structure = strucio.load_structure(fpath,model=1)
    structure = structure[struc.filter_amino_acids(structure)]
    all_chains = get_chains(structure)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain is None:
        chain_ids = all_chains
    elif isinstance(chain, list):
        chain_ids = chain
    else:
        chain_ids = [chain]
    # Separate loop name so the `chain` argument is not silently rebound.
    for requested in chain_ids:
        if requested not in all_chains:
            raise ValueError(f'Chain {requested} not found in input file')
    # Vectorised membership test on the chain_id annotation; avoids building
    # one Atom object per atom just to read its chain id.
    return structure[np.isin(structure.chain_id, chain_ids)]

def extract_seq(protein,chain_id=None):
    """
    Return the one-letter amino-acid sequence of a structure.

    Args:
        protein: path to a structure file (loaded with biotite's
            strucio.load_structure, model 1), an AtomArray, or an
            AtomArrayStack (only its first model is used)
        chain_id: chain id or list of chain ids to keep; None keeps every chain
    Returns:
        str with one letter per residue; residues of all selected chains are
        concatenated without a separator
    Raises:
        TypeError: if protein is none of the supported input types
        ValueError: if the structure has no chains or a requested chain is
            absent
    """
    if isinstance(protein,str):
        atom_array = strucio.load_structure(protein,model=1)
    elif isinstance(protein, struc.AtomArrayStack):
        atom_array = protein[0]
    elif isinstance(protein, struc.AtomArray):
        atom_array = protein
    else:
        # Without this branch `atom_array` stayed unbound and the function
        # failed below with an unrelated UnboundLocalError.
        raise TypeError(
            f'Unsupported input type {type(protein).__name__}; '
            'expected a file path, AtomArray or AtomArrayStack'
        )

    # Restrict to the requested chain(s); chains are validated before the
    # amino-acid filter is applied.
    all_chains = get_chains(atom_array)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain_id is None:
        chain_ids = all_chains
    elif isinstance(chain_id, list):
        chain_ids = chain_id
    else:
        chain_ids = [chain_id]
    for chain in chain_ids:
        if chain not in all_chains:
            raise ValueError(f'Chain {chain} not found in input file')
    # Vectorised membership test instead of one Atom object per atom.
    atom_array = atom_array[np.isin(atom_array.chain_id, chain_ids)]

    # Keep only atoms accepted by biotite's amino-acid filter.
    atom_array = atom_array[struc.filter_amino_acids(atom_array)]
    residue_identities = get_residues(atom_array)[1]
    return ''.join(
        ProteinSequence.convert_letter_3to1(r) for r in residue_identities
    )


def extract_plddt(protein,chain_id=None):
    """
    Read per-residue B-factor values (used here as pLDDT) from CA atoms.

    Args:
        protein: path to a structure file, an AtomArray, or an AtomArrayStack
            (only its first model is used). A path is loaded with the
            b_factor column; an AtomArray/AtomArrayStack passed in directly
            must already carry a `b_factor` annotation.
        chain_id: chain id or list of chain ids to keep; None keeps every chain
    Returns:
        Tuple (plddt, mean_plddt)
            - plddt is a 1-D array with the b_factor of each CA atom
            - mean_plddt is np.mean(plddt)
    Raises:
        TypeError: if protein is none of the supported input types
        ValueError: if the structure has no chains or a requested chain is
            absent
    """
    if isinstance(protein,str):
        # model = 1 to load a AtomArray object
        # extra_fields to load the b_factor column
        atom_array = strucio.load_structure(protein,model=1,extra_fields=["b_factor"])
    elif isinstance(protein, struc.AtomArrayStack):
        atom_array = protein[0]
    elif isinstance(protein, struc.AtomArray):
        atom_array = protein
    else:
        # Without this branch `atom_array` stayed unbound and the function
        # failed below with an unrelated UnboundLocalError.
        raise TypeError(
            f'Unsupported input type {type(protein).__name__}; '
            'expected a file path, AtomArray or AtomArrayStack'
        )

    # Restrict to the requested chain(s); chains are validated before the
    # amino-acid filter is applied.
    all_chains = get_chains(atom_array)
    if len(all_chains) == 0:
        raise ValueError('No chains found in the input file.')
    if chain_id is None:
        chain_ids = all_chains
    elif isinstance(chain_id, list):
        chain_ids = chain_id
    else:
        chain_ids = [chain_id]
    for chain in chain_ids:
        if chain not in all_chains:
            raise ValueError(f'Chain {chain} not found in input file')
    # Vectorised membership test instead of one Atom object per atom.
    atom_array = atom_array[np.isin(atom_array.chain_id, chain_ids)]

    # Keep only atoms accepted by biotite's amino-acid filter.
    atom_array = atom_array[struc.filter_amino_acids(atom_array)]

    # ca atom only: one value per residue
    atom_array = atom_array[atom_array.atom_name == "CA"]

    # Copy the annotation array rather than looping over Atom objects.
    plddt = np.array(atom_array.b_factor)

    return plddt, np.mean(plddt)

In [45]:
# Alphabet used to turn one-letter residue codes into integer tokens; a
# residue's token is its position in this list.
restypes = ["A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",]
def infer_featurize(fpath,chain=None):
    """
    Turn a structure file into coordinate and sequence arrays with a batch axis.

    Residues are visited in file order (via `struc.residue_iter`), so row i of
    the coordinates belongs to letter i of the sequence. The previous loop over
    `set(array.res_id)` had no guaranteed order and merged residues that share
    a res_id across chains.

    Args:
        fpath: path handed to load_structure
        chain: chain id or list of chain ids; None keeps every chain
    Returns:
        Tuple (coord, seq)
            - coord has shape [1, L, 5, 3]: N, CA, C, CB, O for each residue
            - seq has shape [1, L]: indices into `restypes`
    Raises:
        IndexError: if a residue lacks N, CA, C or O, or a non-glycine residue
            lacks CB
        ValueError: if the sequence contains a letter that is not in `restypes`
    """
    array = load_structure(fpath,chain)
    seq = extract_seq(array,chain)
    N,CA,C,CB,O = [[] for _ in range(5)]
    for residue_atoms in struc.residue_iter(array):
        # Get the resname
        residue_name = residue_atoms.res_name[0]

        # Note to index the numpy array to be shaped [3,] rather than [1,3]
        n = residue_atoms[residue_atoms.atom_name=="N"].coord[0]
        ca = residue_atoms[residue_atoms.atom_name=="CA"].coord[0]
        c = residue_atoms[residue_atoms.atom_name=="C"].coord[0]
        if ProteinSequence.convert_letter_3to1(residue_name) == "G":
            # Glycine has no CB: build a virtual one.
            # center at origin
            n_ca_vec = n - ca
            c_ca_vec = c - ca
            # rotate the CA->N vector by -120 degrees about the CA->C axis
            cb_at_origin= struc.rotate_about_axis(n_ca_vec, axis=c_ca_vec, angle=-math.pi * 120.0 / 180.0, support=None)
            # put on top of ca atom
            cb = cb_at_origin + ca
        else:
            cb = residue_atoms[residue_atoms.atom_name=="CB"].coord[0]
        o = residue_atoms[residue_atoms.atom_name=="O"].coord[0]
        N.append(n)
        CA.append(ca)
        C.append(c)
        CB.append(cb)
        O.append(o)
    coord = np.stack([N,CA,C,CB,O],axis=1) #[L,5,3]
    coord = np.expand_dims(coord,axis=0) #add batch dimension [1, L, 5, 3]
    seq = np.array([restypes.index(i) for i in seq])
    seq = np.expand_dims(seq,axis=0) #add batch dimension [1,L]
    return coord,seq

In [46]:
# Featurize one designed structure; `chain` is left at its default (None).
coord,seq = infer_featurize("./5ius/5ius_design/T1/5IUS_0.pdb")

In [38]:
# Show the atoms of residue 61. Relies on `atom_array` already existing in the
# kernel: no top-level cell above this one assigns it.
atom_array[atom_array.res_id==61]

array([
	Atom(np.array([-17.806,  -3.577,  -8.948], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="N", element="N"),
	Atom(np.array([-17.58 ,  -2.484,  -8.016], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([-17.609,  -2.921,  -6.564], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="C", element="C"),
	Atom(np.array([-17.45 ,  -2.099,  -5.659], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="O", element="O")
])

In [39]:
# Load the designed structure (all chains) and display the atoms of residue 61.
atom_array = load_structure("./5ius/5ius_design/T1/5IUS_0.pdb")
atom_array[atom_array.res_id==61]

array([
	Atom(np.array([-17.806,  -3.577,  -8.948], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="N", element="N"),
	Atom(np.array([-17.58 ,  -2.484,  -8.016], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([-17.609,  -2.921,  -6.564], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="C", element="C"),
	Atom(np.array([-17.45 ,  -2.099,  -5.659], dtype=float32), chain_id="A", res_id=61, ins_code="", res_name="GLY", hetero=False, atom_name="O", element="O")
])

In [40]:
# Show entry 60 of `x`.
# NOTE(review): `x` is rebound by other cells (the list returned by
# esm_structure_ana, a torch tensor), so what this displays depends on the
# order in which cells were executed.
x[60]

array([[-17.80599976,  -3.5769999 ,  -8.94799995],
       [-17.57999992,  -2.48399997,  -8.01599979],
       [-17.60899925,  -2.921     ,  -6.56400013],
       [-18.5893733 ,  -1.48701088,  -8.33463348],
       [-17.45000076,  -2.09899998,  -5.65899992]])

In [35]:
def esm_structure_ana(folder):
    """
    Collect the sequence and pLDDT of every .pdb file in a folder.

    Args:
        folder: directory to scan (not recursive); files whose name does not
            end with ".pdb" are skipped
    Returns:
        List of dicts with keys "name" (file name), "seq" (result of
        extract_seq) and "plddt" (the (per_residue, mean) tuple returned by
        extract_plddt), sorted by ascending mean pLDDT
    """
    esm_data = []
    for protein_name in os.listdir(folder):
        if not protein_name.endswith(".pdb"):
            continue
        protein_file = os.path.join(folder,protein_name)
        seq = extract_seq(protein_file)
        plddt = extract_plddt(protein_file)
        esm_data.append({"name":protein_name,"seq":seq,"plddt":plddt})
    # Sort on the mean (second tuple element). Sorting on the whole tuple
    # compares the per-residue arrays first, which raises "truth value of an
    # array is ambiguous" as soon as two files are present.
    return sorted(esm_data,key=lambda x:x['plddt'][1])

In [36]:
# Analyse every .pdb file in the T1 design folder; rebinds `x` to the list
# returned by esm_structure_ana.
x = esm_structure_ana("./5ius/5ius_design/T1/")

[34m5ius[m[m        [34m5tpn[m[m        [34m5yui[m[m        plddt.ipynb


In [21]:
# Load a benchmark structure with the default chain=None (all chains kept).
atom_array = load_structure("../benchmark_set/1PRW.pdb")

In [1]:
# Residue name of the first atom. Needs `atom_array` in the kernel; the
# NameError recorded below shows it was not yet defined when this cell ran.
residue_name = atom_array.res_name[0]


NameError: name 'atom_array' is not defined

In [17]:
# One-letter sequence of `atom_array`, done inline: the same two steps that
# extract_seq ends with, but without any chain or amino-acid filtering here.
residue_identities = get_residues(atom_array)[1]
seq = ''.join([ProteinSequence.convert_letter_3to1(r) for r in residue_identities])

In [19]:
# Display the whole `atom_array` (the recorded output below is cut off).
atom_array

array([
	Atom(np.array([-28.823, -15.535, -10.797], dtype=float32), chain_id="A", res_id=530, ins_code="", res_name="MET", hetero=False, atom_name="N", element="N"),
	Atom(np.array([-29.814, -14.468, -10.926], dtype=float32), chain_id="A", res_id=530, ins_code="", res_name="MET", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([-29.401, -13.194, -10.183], dtype=float32), chain_id="A", res_id=530, ins_code="", res_name="MET", hetero=False, atom_name="C", element="C"),
	Atom(np.array([-29.264, -13.201,  -8.96 ], dtype=float32), chain_id="A", res_id=530, ins_code="", res_name="MET", hetero=False, atom_name="O", element="O"),
	Atom(np.array([-31.175, -14.942, -10.413], dtype=float32), chain_id="A", res_id=530, ins_code="", res_name="MET", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([-32.27 , -13.883, -10.454], dtype=float32), chain_id="A", res_id=530, ins_code="", res_name="MET", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([-32.882, -13.569, -12.119

In [9]:
# Make the parent directory importable and call the project's `utils` helper.
# NOTE(review): the recorded run failed with ModuleNotFoundError for
# 'openfold'; presumably `utils` imports it — confirm before relying on this cell.
import sys
sys.path.append("../")
import utils
coords, seq = utils.extract_coords_from_structure(atom_array, pattern="max")

ModuleNotFoundError: No module named 'openfold'

In [48]:
# Scratch: create a random 2x2 tensor. Rebinds `x`, which other cells use for
# different objects.
import torch
x = torch.randn((2,2))

In [52]:
# Check which dtype torch.full picks for an integer fill value (the recorded
# output below is torch.int64).
torch.full(x.shape,2).dtype

torch.int64