## 2024-09-18 FrameDiff learning

### 1. PDB processing + AlphaFold mmcif object

In [14]:
import Bio
from Bio.PDB import PDBIO, MMCIFParser, PDBParser

import sys
sys.path.append("/home/sirius/PhD/software/se3_diffusion")
from data import errors, mmcif_parsing, parsers
from data import utils as du
import os
import dataclasses
import numpy as np

In [15]:
def process_mmcif_mimic_framediff(mmcif_path: str, max_resolution: float, max_len: int):
    """Parse one mmCIF file into per-residue features plus summary metadata.

    Written to mimic FrameDiff's mmCIF preprocessing, but returns the
    results in memory.

    Args:
        mmcif_path: Path of the mmCIF file to read.
        max_resolution: Exclusive upper bound on the header resolution;
            structures at or above this value are rejected.
        max_len: Maximum number of residues (summed over all chains) that
            the processed complex may contain.

    Returns:
        A ``(complex_feats, metadata)`` tuple. ``complex_feats`` is the
        dict produced by ``du.concat_np_features`` over all chains, with an
        extra ``'modeled_idx'`` entry holding the indices where
        ``aatype != 20``. ``metadata`` is a dict describing the file
        (name, path, oligomeric state, resolution, method, chain count,
        quaternary category and sequence lengths).

    Raises:
        errors.ResolutionError: If the resolution is ``>= max_resolution``
            or exactly ``0.0``.
        errors.LengthError: If no residue is modeled, or if the complex
            has more than ``max_len`` residues.
    """
    mmcif_name = os.path.basename(mmcif_path).replace('.cif', '')
    metadata = {'pdb_name': mmcif_name}
    with open(mmcif_path, 'r') as f:
        parsed_mmcif = mmcif_parsing.parse(
            file_id=mmcif_name, mmcif_string=f.read())
    metadata['raw_path'] = mmcif_path
    parsed_mmcif = parsed_mmcif.mmcif_object

    # Oligomeric state is read from the raw mmCIF fields; the entries are
    # joined with commas and lower-cased.
    raw_mmcif = parsed_mmcif.raw_string
    raw_olig_count = raw_mmcif['_pdbx_struct_assembly.oligomeric_count']
    raw_olig_detail = raw_mmcif['_pdbx_struct_assembly.oligomeric_details']
    metadata['oligomeric_count'] = ','.join(raw_olig_count).lower()
    metadata['oligomeric_detail'] = ','.join(raw_olig_detail).lower()

    # Resolution filtering based on the parsed header.
    mmcif_header = parsed_mmcif.header
    mmcif_resolution = mmcif_header['resolution']
    metadata['resolution'] = mmcif_resolution
    metadata['structure_method'] = mmcif_header['structure_method']
    if mmcif_resolution >= max_resolution:
        raise errors.ResolutionError(
            f'Too high resolution {mmcif_resolution}')
    if mmcif_resolution == 0.0:
        raise errors.ResolutionError(
            f'Invalid resolution {mmcif_resolution}')

    # Extract all chains, keyed by upper-cased chain id.
    struct_chains = {
        chain.id.upper(): chain
        for chain in parsed_mmcif.structure.get_chains()}
    metadata['num_chains'] = len(struct_chains)

    # Extract features chain by chain.
    struct_feats = []
    all_seqs = set()
    for chain_id, chain in struct_chains.items():
        # Convert chain id into int
        chain_id = du.chain_str_to_int(chain_id)
        chain_prot = parsers.process_chain(chain, chain_id)
        chain_dict = dataclasses.asdict(chain_prot)
        chain_dict = du.parse_chain_feats(chain_dict)
        all_seqs.add(tuple(chain_dict['aatype']))
        struct_feats.append(chain_dict)
    # A single distinct sequence across all chains means a homomer.
    if len(all_seqs) == 1:
        metadata['quaternary_category'] = 'homomer'
    else:
        metadata['quaternary_category'] = 'heteromer'
    complex_feats = du.concat_np_features(struct_feats, False)

    # Process geometry features
    complex_aatype = complex_feats['aatype']
    # Residues whose aatype is 20 are treated as not modeled.
    modeled_idx = np.where(complex_aatype != 20)[0]
    if modeled_idx.size == 0:
        raise errors.LengthError('No modeled residues')
    # Enforce the caller-supplied length limit (previously accepted but
    # silently ignored).
    if complex_aatype.shape[0] > max_len:
        raise errors.LengthError(
            f'Too long {complex_aatype.shape[0]}')
    min_modeled_idx = np.min(modeled_idx)
    max_modeled_idx = np.max(modeled_idx)
    metadata['seq_len'] = len(complex_aatype)
    # Span between the first and last modeled residue, inclusive.
    metadata['modeled_seq_len'] = max_modeled_idx - min_modeled_idx + 1
    complex_feats['modeled_idx'] = modeled_idx
    return complex_feats, metadata

In [16]:
# Example input: the 1h2s mmCIF file under the se3_diffusion data directory.
mmcif_path = "/home/sirius/PhD/software/se3_diffusion/data/1h2s.cif"
# Thresholds forwarded to the processing function.
max_resolution, max_len = 8.0, 2000
complex_feats, metadata = process_mmcif_mimic_framediff(
    mmcif_path=mmcif_path,
    max_resolution=max_resolution,
    max_len=max_len,
)

`modeled_idx` holds the indices of the residues whose type is not X (unknown), i.e. where `aatype != 20`.

`bb_mask` indicates, for each residue, whether it has a CA atom.

a modeled_idx can also be bb_mask

In [21]:
# Print the name and shape of every feature in the processed complex.
for feat_name, feat_array in complex_feats.items():
    print(feat_name, feat_array.shape)

atom_positions (327, 37, 3)
aatype (327,)
atom_mask (327, 37)
residue_index (327,)
chain_index (327,)
b_factors (327, 37)
bb_mask (327,)
bb_positions (327, 3)
modeled_idx (285,)


In [22]:
# Display the metadata dictionary returned by process_mmcif_mimic_framediff.
metadata

{'pdb_name': '1h2s',
 'raw_path': '/home/sirius/PhD/software/se3_diffusion/data/1h2s.cif',
 'oligomeric_count': '4',
 'oligomeric_detail': 'tetrameric',
 'resolution': 1.93,
 'structure_method': 'x-ray diffraction',
 'num_chains': 2,
 'quaternary_category': 'heteromer',
 'seq_len': 327,
 'modeled_seq_len': 326}