# TODOS:
- Mechanism for selecting apo/pred; right now we are selecting at most one apo/pred by `sort_score`
- Relax with requirement of presence of apo and holo
- Agree on what out feature input and output should look like

In [1]:
from plinder.core.structure.structure import Structure
from plinder.core import PlinderSystem
from pathlib import Path



# Load structure

#### load holo structure

In [2]:
ROOT_DIR = Path("/Users/yusuf/.local/share")
holo_struc = Structure.load_structure(
    id="1avd__1__1.A__1.C",
    protein_path=ROOT_DIR/"plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif",
    protein_sequence=ROOT_DIR/"plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta",
    list_ligand_sdf_and_resolved_smiles=[(ROOT_DIR/"plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf"
        , "CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O")]
    )

#### load apo structure

In [3]:
apo_struc = Structure.load_structure(
    id="1avd_A",
    protein_path=ROOT_DIR/ "plinder/2024-06/v2/linked_structures/apo/1avd__1__1.A__1.C/1nqn_A/superposed.cif",
    protein_sequence=ROOT_DIR/"plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta",
    structure_type="apo"
    )

#### list structure  fields

In [4]:
holo_struc.model_fields

{'id': FieldInfo(annotation=str, required=True),
 'protein_path': FieldInfo(annotation=Path, required=True),
 'protein_sequence': FieldInfo(annotation=Path, required=True),
 'list_ligand_sdf_and_resolved_smiles': FieldInfo(annotation=Union[list[tuple[Path, str]], NoneType], required=False, default=None),
 'protein_atom_array': FieldInfo(annotation=Union[AtomArray, NoneType], required=False, default=None),
 'ligand_mols': FieldInfo(annotation=Union[dict[str, tuple[Mol, Mol, Mol, tuple[int, ...]]], NoneType], required=False, default=None),
 'add_ligand_hydrogen': FieldInfo(annotation=bool, required=False, default=False),
 'structure_type': FieldInfo(annotation=str, required=False, default='holo')}

#### get structure properties

In [5]:
holo_struc.get_properties()

['__fields_set__',
 'ligand_chain_ordered',
 'model_extra',
 'model_fields_set',
 'original_unresolved_mols',
 'pdf_ligand_mols_coords',
 'protein_backbone_mask',
 'protein_calpha_coords',
 'protein_calpha_mask',
 'protein_chain_ordered',
 'protein_chain_unresolved_sequence',
 'protein_chains',
 'protein_coords',
 'protein_n_atoms',
 'protein_sequence_from_structure',
 'protein_structure_atom_names',
 'protein_structure_b_factor',
 'protein_structure_residue_names',
 'protein_structure_residues',
 'protein_structure_sequence_fasta',
 'protein_structure_tokenized_sequence',
 'resolved_ligand_conformers',
 'resolved_ligand_conformers_coords',
 'resolved_ligand_mols',
 'resolved_ligand_mols_coords',
 'resolved_sequence_full_atom_feat',
 'resolved_sequence_list_ordered_by_chain',
 'resolved_sequence_stacked_mask',
 'resolved_sequences',
 'resolved_smiles_ligand_mask',
 'sequence_atom_mask']

#### Inspect holo structure

In [6]:
holo_struc

Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (964,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,
                (
    

#### Inspect holo ligand
Returns a chain-mapped dictionary of original_unresolved_mol, resolved_ligand_mol resolved_ligand_mol_conformer, matches


In [7]:
holo_struc.ligand_mols

{'1.C': (<rdkit.Chem.rdchem.Mol at 0x1b8aa52a0>,
  <rdkit.Chem.rdchem.Mol at 0x1b8aa51c0>,
  <rdkit.Chem.rdchem.Mol at 0x1b8aa5380>,
  (9, 4, 5, 6, 7, 11, 1, 0, 3, 14, 13, 8, 12, 2))}

#### Inspect holo sequences
Returns a chain-mapped dictionary of sequences

In [8]:
holo_struc.resolved_sequences

{'1.A': 'ARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYTTAVTATSNEIKESPLHGTENTINKRTQPTFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRTQKE'}

#### Inspect holo atom array
This is the input sequence-renumbered array


In [9]:
holo_struc.protein_atom_array

array([
	Atom(np.array([31.221, 22.957, 43.101], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="N", element="N"),
	Atom(np.array([31.828, 24.118, 42.476], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([31.979, 23.854, 41.021], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="C", element="C"),
	Atom(np.array([31.496, 24.598, 40.166], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="O", element="O"),
	Atom(np.array([33.178, 24.436, 43.069], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([33.279, 25.867, 43.567], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([33.23 , 25.978, 45.078], dtype=float32), ch

#### Inspect unresolved input strcture sequence


In [10]:
# holo_struc.aligned_unresolved_seqs

#### Inspect unresolved input strcture indices
Unresolved structure original indices with indices matching the residue number of resolved sequence

In [11]:
# holo_struc.unresolved_aligned_indices

#### Inspect unresolved input strcture sequence
Unresolved structure original indices with indices matching the residue number of resolved sequence

#### Inspect unresolved orignal ligand loaded from sdf

In [12]:
holo_struc.original_unresolved_mols

{'1.C': <rdkit.Chem.rdchem.Mol at 0x1b8aa52a0>}

#### Inspect resolved ligand loaded from smiles

In [13]:
holo_struc.resolved_ligand_mols
holo_struc.resolved_ligand_conformers

{'1.C': <rdkit.Chem.rdchem.Mol at 0x1b8aa5380>}

#### Inspect random conformer of resolved ligand loaded from smiles

In [14]:
holo_struc.resolved_ligand_conformers

{'1.C': <rdkit.Chem.rdchem.Mol at 0x1b8aa5380>}

#### Inspect coordinates of random conformer of resolved ligand loaded from smiles

In [15]:
holo_struc.resolved_ligand_conformers_coords

{'1.C': array([[-1.32325143, -2.86305071,  2.3964789 ],
        [-0.5302486 , -1.96598286,  1.50370794],
        [ 0.2367775 , -2.36847825,  0.63577923],
        [-0.75538584, -0.63578648,  1.76840044],
        [-0.08512521,  0.40070138,  0.99605848],
        [-0.53172077,  0.36110243, -0.45694019],
        [ 0.31517757,  1.22057229, -1.37052005],
        [ 1.78880906,  0.94358526, -1.12376296],
        [ 2.07152399,  1.23890893,  0.24543438],
        [ 1.43759148,  0.27236516,  1.07950831],
        [ 1.84475467,  0.46473583,  2.42649249],
        [ 2.67929759,  1.82570832, -1.97629431],
        [ 2.78958586,  1.28348727, -3.28142864],
        [-0.03497734,  0.95288132, -2.71781699],
        [-1.88905478,  0.77866947, -0.54060498]])}

#### Inspect coordinates of resolved ligand loaded from smiles and aligned with original ligand

In [16]:
holo_struc.resolved_ligand_mols_coords

{'1.C': array([[34.9261516 , 22.66222349, 15.88433131],
        [35.92334306, 22.07672528, 16.83433418],
        [35.7748091 , 20.90763193, 17.27243138],
        [37.01188562, 22.86833375, 17.28987843],
        [36.85533873, 24.3116667 , 17.31076371],
        [38.19588066, 25.03037747, 17.26930705],
        [37.92897344, 26.5380924 , 17.34043758],
        [37.04475036, 26.85125562, 18.57715272],
        [35.8403653 , 26.1246026 , 18.47656718],
        [36.12604607, 24.75714771, 18.57986178],
        [34.9289961 , 24.05850146, 18.7430147 ],
        [36.67888557, 28.33655176, 18.66437329],
        [35.41315042, 28.51835079, 19.24850043],
        [39.14107535, 27.23368472, 17.36258731],
        [38.86334472, 24.73135578, 16.07947365]])}

#### Inspect coordinates of original unresolved ligand

In [17]:
holo_struc.pdf_ligand_mols_coords

{'1.C': array([[36.097, 24.731, 18.551],
        [36.841, 24.28 , 17.298],
        [38.191, 24.996, 17.307],
        [37.93 , 26.507, 17.335],
        [37.118, 26.889, 18.595],
        [36.636, 28.326, 18.719],
        [35.948, 22.08 , 16.874],
        [34.96 , 22.767, 15.881],
        [36.989, 22.845, 17.232],
        [38.879, 24.744, 16.078],
        [39.175, 27.211, 17.276],
        [35.883, 26.133, 18.477],
        [35.31 , 28.501, 19.242],
        [35.767, 20.938, 17.325]])}

#### Inspect protein structure dataframe with indices renumbered to match sequence

In [18]:
#holo_struc.protein_dataframe

#### Inspect protein backbone mask

In [19]:
holo_struc.protein_backbone_mask

array([ True,  True,  True, False, False, False, False, False, False,
        True,  True,  True, False, False, False,  True,  True,  True,
       False, False, False,  True,  True,  True, False, False, False,
       False, False,  True,  True,  True, False, False, False, False,
        True,  True,  True, False,  True,  True,  True, False, False,
       False, False, False, False,  True,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,  True, False, False, False, False,  True,  True,
        True, False, False, False, False, False,  True,  True,  True,
       False, False, False, False, False,  True,  True,  True, False,
       False, False, False, False,  True,  True,  True, False,  True,
        True,  True, False, False, False,  True,  True,  True, False,
       False, False, False, False,  True,  True,  True, False, False,
       False, False, False,  True,  True,  True, False, False, False,
       False,  True,

#### Inspect protein calpha mask

In [20]:
holo_struc.protein_calpha_mask

array([False,  True, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False,
        True, False, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False,

#### Inspect number of protein atoms

In [21]:
holo_struc.protein_n_atoms

964

#### Inspect protein chain ids

In [22]:
holo_struc.protein_chains

['1.A']

#### Inspect unresolved structure fasta

In [23]:
holo_struc.protein_structure_sequence_fasta

'>receptor\nKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYTTAVTATSNEIKESPLHGTENTINKRTQPTFGFTVNWKFSESTTVFTGQCFIDRNGKEVLKTMWLLRSSVNDIGDDWKATRVGINIFTRLRT'

### Test sequence alignment

In [24]:
holo_struc

Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (964,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,
                (
    

In [25]:
apo_struc

Structure(
    (
        'id',
        '1avd_A',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/apo/1avd__1__1.A__1.C/1nqn_A/superposed.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        None,
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (928,),
    ),
    (
        'ligand_mols',
        {

        },
    ),
    (
        'add_ligand_hydrogen',
        False,
    ),
    (
        'structure_type',
        'apo',
    ),
)

In [26]:
# Note for structure alignment to work, apo and holo need to have same chain id
apo_struc.set_chain("1.A")

In [27]:
apo_struc.protein_atom_array

array([
	Atom(np.array([35.228, 26.654, 45.992], dtype=float32), chain_id="1.A", res_id=1, ins_code="", res_name="ARG", hetero=False, atom_name="N", element="N"),
	Atom(np.array([34.659, 26.809, 44.626], dtype=float32), chain_id="1.A", res_id=1, ins_code="", res_name="ARG", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([34.168, 25.473, 44.078], dtype=float32), chain_id="1.A", res_id=1, ins_code="", res_name="ARG", hetero=False, atom_name="C", element="C"),
	Atom(np.array([34.65 , 24.41 , 44.472], dtype=float32), chain_id="1.A", res_id=1, ins_code="", res_name="ARG", hetero=False, atom_name="O", element="O"),
	Atom(np.array([35.706, 27.378, 43.669], dtype=float32), chain_id="1.A", res_id=1, ins_code="", res_name="ARG", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([35.08 , 28.178, 42.531], dtype=float32), chain_id="1.A", res_id=1, ins_code="", res_name="ARG", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([35.848, 28.051, 41.219], dtype=float32), ch

In [28]:
seq_align = holo_struc.get_per_chain_seq_alignments(apo_struc)

In [29]:
seq_align

{'1.A': {3: 2,
  4: 3,
  5: 4,
  6: 5,
  7: 6,
  8: 7,
  9: 8,
  10: 9,
  11: 10,
  12: 11,
  13: 12,
  14: 13,
  15: 14,
  16: 15,
  17: 16,
  18: 17,
  19: 18,
  20: 19,
  21: 20,
  22: 21,
  23: 22,
  24: 23,
  25: 24,
  26: 25,
  27: 26,
  28: 27,
  29: 28,
  30: 29,
  31: 30,
  32: 31,
  33: 32,
  34: 33,
  35: 34,
  36: 35,
  37: 36,
  42: 41,
  43: 42,
  44: 43,
  45: 44,
  46: 45,
  47: 46,
  48: 47,
  49: 48,
  50: 49,
  51: 50,
  52: 51,
  53: 52,
  54: 53,
  55: 54,
  56: 55,
  57: 56,
  58: 57,
  59: 58,
  60: 59,
  61: 60,
  62: 61,
  63: 62,
  64: 63,
  65: 64,
  66: 65,
  67: 66,
  68: 67,
  69: 68,
  70: 69,
  71: 70,
  72: 71,
  73: 72,
  74: 73,
  75: 74,
  76: 75,
  77: 76,
  78: 77,
  79: 78,
  80: 79,
  81: 80,
  82: 81,
  83: 82,
  84: 83,
  85: 84,
  86: 85,
  87: 86,
  88: 87,
  89: 88,
  90: 89,
  91: 90,
  92: 91,
  93: 92,
  94: 93,
  95: 94,
  96: 95,
  97: 96,
  98: 97,
  99: 98,
  100: 99,
  101: 100,
  102: 101,
  103: 102,
  104: 103,
  105: 104,
  106: 

In [30]:
holo_struc.protein_atom_array[0]

Atom(np.array([31.221, 22.957, 43.101], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="N", element="N")

In [31]:
apo_struc.protein_atom_array[0]

Atom(np.array([35.228, 26.654, 45.992], dtype=float32), chain_id="1.A", res_id=1, ins_code="", res_name="ARG", hetero=False, atom_name="N", element="N")

### Alignment and Cropping

In [32]:
align_common_seq = holo_struc.align_common_sequence(
        apo_struc,
    )

In [33]:
holo_struc

Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (964,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,
                (
    

In [34]:
apo_struc

Structure(
    (
        'id',
        '1avd_A',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/apo/1avd__1__1.A__1.C/1nqn_A/superposed.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        None,
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (928,),
    ),
    (
        'ligand_mols',
        {

        },
    ),
    (
        'add_ligand_hydrogen',
        False,
    ),
    (
        'structure_type',
        'apo',
    ),
)

In [35]:
align_common_seq[0]

Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (907,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,
                (
    

In [36]:
align_common_seq[1]

Structure(
    (
        'id',
        '1avd_A',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/apo/1avd__1__1.A__1.C/1nqn_A/superposed.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        None,
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (907,),
    ),
    (
        'ligand_mols',
        {

        },
    ),
    (
        'add_ligand_hydrogen',
        False,
    ),
    (
        'structure_type',
        'apo',
    ),
)

In [37]:
superimposed_apo = apo_struc.superimpose(holo_struc)
superimposed_apo

(Structure(
     (
         'id',
         '1avd_A',
     ),
     (
         'protein_path',
         /Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/apo/1avd__1__1.A__1.C/1nqn_A/superposed.cif,
     ),
     (
         'protein_sequence',
         /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
     ),
     (
         'list_ligand_sdf_and_resolved_smiles',
         None,
     ),
     (
         'protein_atom_array',
         <class 'biotite.structure.AtomArray'> with shape (928,),
     ),
     (
         'ligand_mols',
         {
 
         },
     ),
     (
         'add_ligand_hydrogen',
         False,
     ),
     (
         'structure_type',
         'apo',
     ),
 ),
 1.6846485,
 0.3277406)

In [38]:
holo_struc.protein_coords

array([[31.221, 22.957, 43.101],
       [31.828, 24.118, 42.476],
       [31.979, 23.854, 41.021],
       ...,
       [34.341, 35.018, 24.674],
       [35.484, 35.831, 24.497],
       [33.105, 35.742, 24.15 ]], dtype=float32)

In [39]:
#apo_struc.protein_dataframe

In [40]:
holo_struc.protein_structure_b_factor

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [41]:
test_sys = PlinderSystem(system_id="1avd__1__1.A__1.C", resolved_smiles_dict={"1.C": "CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O"})

In [42]:
test_sys.holo_structure

2024-09-18 00:56:26,659 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:26,660 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s


Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (964,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa6f80>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa6dc0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa6ea0>,
                (
    

In [43]:
test_sys.alt_structures

2024-09-18 00:56:27,065 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.13s
2024-09-18 00:56:27,399 | plinder.core.scores.links.query_links:24 | INFO : runtime succeeded: 0.66s
2024-09-18 00:56:27,537 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:27,537 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s


{'apo': {'1.A': Structure(
      (
          'id',
          '1vyo_B',
      ),
      (
          'protein_path',
          /Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/apo/1avd__1__1.A__1.C/1vyo_B/superposed.cif,
      ),
      (
          'protein_sequence',
          /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
      ),
      (
          'list_ligand_sdf_and_resolved_smiles',
          None,
      ),
      (
          'protein_atom_array',
          <class 'biotite.structure.AtomArray'> with shape (958,),
      ),
      (
          'ligand_mols',
          {
  
          },
      ),
      (
          'add_ligand_hydrogen',
          False,
      ),
      (
          'structure_type',
          'apo',
      ),
  )},
 'pred': {'1.A': Structure(
      (
          'id',
          'P02701_A',
      ),
      (
          'protein_path',
          /Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/pred/1avd__1__1.A__1.

In [44]:
test_sys.best_linked_structures_paths

{'apo': {'1.A': '/Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/apo/1avd__1__1.A__1.C/1vyo_B/superposed.cif'},
 'pred': {'1.A': '/Users/yusuf/.local/share/plinder/2024-06/v2/linked_structures/pred/1avd__1__1.A__1.C/P02701_A/superposed.cif'}}

In [45]:
#cropped = test_sys.create_masked_bound_unbound_complexes()

In [46]:
mask = holo_struc.protein_atom_array.atom_name == "CA"
holo_struc.filter(
        property="atom_name",
        mask="CA",

    )

Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (123,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,
                (
    

In [47]:
holo_struc

Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (964,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,
                (
    

In [48]:
holo_struc + apo_struc

Structure(
    (
        'id',
        '1avd__1__1.A__1.C--1avd_A',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/1avd__1__1.A__1.C--1avd_A,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (1892,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,


In [49]:
holo_struc.protein_atom_array[holo_struc.protein_atom_array.chain_id == "1.A"]

array([
	Atom(np.array([31.221, 22.957, 43.101], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="N", element="N"),
	Atom(np.array([31.828, 24.118, 42.476], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([31.979, 23.854, 41.021], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="C", element="C"),
	Atom(np.array([31.496, 24.598, 40.166], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="O", element="O"),
	Atom(np.array([33.178, 24.436, 43.069], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="CB", element="C"),
	Atom(np.array([33.279, 25.867, 43.567], dtype=float32), chain_id="1.A", res_id=3, ins_code="", res_name="LYS", hetero=False, atom_name="CG", element="C"),
	Atom(np.array([33.23 , 25.978, 45.078], dtype=float32), ch

In [50]:
holo_struc

Structure(
    (
        'id',
        '1avd__1__1.A__1.C',
    ),
    (
        'protein_path',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/receptor.cif,
    ),
    (
        'protein_sequence',
        /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/sequences.fasta,
    ),
    (
        'list_ligand_sdf_and_resolved_smiles',
        [
            (
                /Users/yusuf/.local/share/plinder/2024-06/v2/systems/1avd__1__1.A__1.C/ligand_files/1.C.sdf,
                'CC(=O)N[C@@H]1[C@H]([C@@H]([C@H](O[C@H]1O)CO)O)O',
            ),
        ],
    ),
    (
        'protein_atom_array',
        <class 'biotite.structure.AtomArray'> with shape (964,),
    ),
    (
        'ligand_mols',
        {
            '1.C': (
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa52a0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa51c0>,
                <rdkit.Chem.rdchem.Mol object at 0x1b8aa5380>,
                (
    

## Loader

In [51]:
from plinder.core.loader import PlinderDataset
from plinder.core.loader.loader import get_torch_loader
from plinder.core import get_split
from plinder.core.scores import query_links

#### Make plinder dataset

In [52]:
train_dataset = PlinderDataset()
#train_dataset = PlinderDataset(df=splits_df[splits_df.system_id =="6pl9__1__1.A__1.C"])

2024-09-18 00:56:28,202 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.14s
2024-09-18 00:56:28,337 | plinder.core.split.utils:42 | INFO : reading /Users/yusuf/.local/share/plinder/2024-06/v2/splits/split.parquet
2024-09-18 00:56:28,629 | plinder.core.split.utils.get_split:24 | INFO : runtime succeeded: 0.62s
2024-09-18 00:56:28,951 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.14s
2024-09-18 00:56:35,236 | plinder.core.scores.links.query_links:24 | INFO : runtime succeeded: 6.61s
2024-09-18 00:56:35,863 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.14s
  .apply(lambda x: dict(zip(x[0], x[1])), axis=1)


In [53]:
test_data = train_dataset[0]

2024-09-18 00:56:42,084 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:42,085 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:42,428 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.14s
2024-09-18 00:56:42,695 | plinder.core.scores.links.query_links:24 | INFO : runtime succeeded: 0.56s
2024-09-18 00:56:42,831 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:42,832 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s


In [54]:
test_data.keys()

dict_keys(['structures', 'id', 'features_and_coords', 'path'])

#### Make torch loader

In [55]:
train_loader = get_torch_loader(
    train_dataset
)

In [56]:
for data in train_loader:

    test_torch = data
    break
    #for k, v in test_torch['input_features'].items():
    #    if v.shape[1] > 1:
    #        break

2024-09-18 00:56:48,458 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:48,458 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:48,843 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.17s
2024-09-18 00:56:49,274 | plinder.core.scores.links.query_links:24 | INFO : runtime succeeded: 0.74s
2024-09-18 00:56:49,431 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:49,431 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:49,639 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:49,639 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2024-09-18 00:56:50,067 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.13s
2024-09-18 00:56:50,517 | plinder.core.scores.links.query_links:24 | INFO : runtime succeed

In [57]:
test_torch.keys()

dict_keys(['features_and_coords', 'id', 'structures'])

In [58]:
test_torch['id']

['3gzt__1__17.H__17.U', '6vy2__1__1.E__1.BB']

In [59]:
for k, v in test_torch['features_and_coords'].items():
    print(k, v.shape)

protein_atom_types torch.Size([2, 1, 3601, 12])
protein_residue_types torch.Size([2, 1, 3601, 1])
resolved_protein_residue_type torch.Size([2, 1, 487, 1])
protein_atom_coordinates torch.Size([2, 1, 3601, 3])
protein_residue_coordinates torch.Size([2, 1, 3601, 3])
protein_residue_ids torch.Size([2, 1, 3601])
resolved_sequence_mask torch.Size([2, 1, 487])
protein_full_atom_mask torch.Size([2, 1, 4334])
ligand_features torch.Size([2, 1, 39, 16])
ligand_conformer_atom_coordinates torch.Size([2, 1, 39, 3])
resolved_ligand_mols_coords torch.Size([2, 1, 39, 3])
resolved_smiles_ligand_mask torch.Size([2, 1, 39])
resolved_sequence_full_atom_feat torch.Size([2, 1, 487, 5])
protein_calpha_coordinates torch.Size([2, 1, 458, 3])
