In [1]:
from tempfile import gettempdir
from biotite.structure import get_residues, get_chains
from biotite.structure.io.pdbx import get_structure
import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb
import biotite

# Download the mmCIF file of PDB entry 1TA6 into the system temp directory
# and parse it.
cif_file_path = rcsb.fetch("1TA6", "cif", gettempdir())
cif_file = pdbx.CIFFile.read(cif_file_path)

# Load only the first model of the entry.
whole_auth = get_structure(cif_file, model=1)
# whole_label = get_structure(cif_file, use_author_fields=False)

# Restrict the structure to peptide-backbone atoms of chain "A" and collapse
# the selection to one entry per residue.
in_chain_a = whole_auth.chain_id == "A"
is_peptide_backbone = biotite.structure.filter_peptide_backbone(whole_auth)
auth_residues_only = get_residues(whole_auth[in_chain_a & is_peptide_backbone])

# The result is indexed as a pair: residue ids first, residue names second.
residue_ids = auth_residues_only[0]
residue_names = auth_residues_only[1]
print(residue_ids)
print(residue_names)
print(len(residue_ids))
print(len(residue_names))


[  1   1   2   3   4   5   6   7   8   9  10  11  12  13  14  14  14  14
  14  14  14  14  14  14  14  14  16  17  18  19  20  21  22  23  24  25
  26  27  28  29  30  31  32  33  34  35  36  36  37  38  39  40  41  42
  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60
  60  60  60  60  60  60  60  60  60  61  62  63  64  65  66  67  68  69
  70  71  72  73  74  75  76  77  77  78  79  80  81  82  83  84  85  86
  87  88  89  90  91  92  93  94  95  96  97  97  98  99 100 101 102 103
 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
 122 123 124 125 126 127 128 129 129 129 129 130 131 132 133 134 135 136
 137 138 139 140 141 142 143 144 145 146 150 151 152 153 154 155 156 157
 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
 176 177 178 179 180 181 182 183 184 184 185 186 186 186 186 186 187 188
 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 204 204
 205 206 207 208 209 210 211 212 213 214 215 216 21

### GENERATE HOLO SEQUENCES

In [4]:
import json
import os
import deeplife_utils
import shutil
# Start from an empty output directory so annotation files from a previous
# run cannot linger.
shutil.rmtree('../data/holo-sequence-annotations/', ignore_errors=True)
os.makedirs('../data/holo-sequence-annotations', exist_ok=True)

with open('../cryptobench/whole_dataset.json', 'r') as json_file:
    dataset = json.load(json_file)

for apo_structure, holo_structures in dataset.items():
    # Residue identifiers of the apo pockets already accepted for this apo
    # structure; used to detect near-duplicate pockets below.
    apo_pockets = set()
    for holo_structure in holo_structures:

        pdb_id = holo_structure['holo_pdb_id']
        chain_id = holo_structure['holo_chain']
        apo_pocket = holo_structure['apo_pocket_selection']
        pocket = holo_structure['holo_pocket_selection']
        print(f'Processing {pdb_id}{chain_id} ...')

        # Pocket entries are split on '_' and the second field is used as the
        # residue identifier (e.g. "60" or "60A").
        new_apo_residues = [
            residue_id
            for residue_id in (residue.split('_')[1] for residue in apo_pocket)
            if residue_id not in apo_pockets
        ]
        # Skip the pocket when more than 75 % of its apo residues were already
        # seen: probably a homomer or a pocket too similar to an earlier one.
        # An empty apo pocket has no overlap to measure (and would divide by
        # zero), so it is never treated as redundant.
        if apo_pocket and (len(apo_pocket) - len(new_apo_residues)) / len(apo_pocket) > 0.75:
            continue
        apo_pockets.update(new_apo_residues)

        # auth_seq_ids (plus insertion code, if any) of the holo pocket
        binding_residues = {residue.split('_')[1] for residue in pocket}

        cif_file_path = rcsb.fetch(pdb_id, "cif", gettempdir())
        cif_file = pdbx.CIFFile.read(cif_file_path)
        auth = get_structure(cif_file, model=1)

        # filter to get correct chain; filter only for peptides
        chain_backbone = auth[
            (auth.chain_id == chain_id) &
            (biotite.structure.filter_peptide_backbone(auth))
        ]

        # Read residue number, insertion code and residue name at the first
        # atom of every residue. Using the structure's own insertion codes
        # avoids guessing the letter by counting repeated residue numbers,
        # which mislabels residues whenever the codes do not run A, B, C, ...
        # and which flagged every residue sharing a number with a binding one.
        residue_starts = biotite.structure.get_residue_starts(chain_backbone)
        res_ids = chain_backbone.res_id[residue_starts]
        ins_codes = chain_backbone.ins_code[residue_starts]
        res_names = chain_backbone.res_name[residue_starts]

        zero_based_binding_residues = []
        sequence_letters = []
        for idx, (auth_seq_id, ins_code, resname) in enumerate(zip(res_ids, ins_codes, res_names)):
            one_letter_aa = deeplife_utils.three_to_one(resname)
            # Identifier in the same form as the pocket selection: "60", "60A", ...
            residue_key = f'{auth_seq_id}{str(ins_code).strip()}'
            if residue_key in binding_residues:
                # idx is the zero-based position of the residue in `sequence`
                zero_based_binding_residues.append(one_letter_aa + str(idx))
            sequence_letters.append(one_letter_aa)
        sequence = "".join(sequence_letters)

        # NOTE(review): the file name depends only on pdb_id and chain_id, so a
        # later non-redundant pocket on the same holo chain overwrites the
        # earlier file -- confirm this is intended.
        with open(f'../data/holo-sequence-annotations/{pdb_id.lower()}{chain_id.upper()}.txt', 'w') as f:
            f.write(
                f'{pdb_id};{chain_id};UNKNOWN;{" ".join(zero_based_binding_residues)};{sequence}')


Processing 1pshA ...
Processing 1pshB ...
Processing 1pshC ...
Processing 1b14A ...
Processing 1b14B ...
Processing 1b15A ...
Processing 1b15B ...
Processing 1b16A ...
Processing 1b16B ...
Processing 1b2lA ...
Processing 1sbyA ...
Processing 1sbyB ...
Processing 3rj5A ...
Processing 3rj9A ...
Processing 3rj9B ...
Processing 3rj9C ...
Processing 3rj9D ...
Processing 3rj9E ...
Processing 3rj9F ...
Processing 6clvA ...
Processing 6clvC ...
Processing 3wa2X ...
Processing 3wa2X ...
Processing 2qz3A ...
Processing 1zesA ...
Processing 1zesB ...
Processing 1zesC ...
Processing 1g94A ...
Processing 1g9hA ...
Processing 8cqfA ...
Processing 2c14A ...
Processing 2c15A ...
Processing 2c16A ...
Processing 2c19A ...
Processing 1b9aA ...
Processing 1dbpA ...
Processing 1drkA ...
Processing 2driA ...
Processing 2gx6A ...
Processing 1a27A ...
Processing 1fdtA ...
Processing 1i5rA ...
Processing 3hb4X ...
Processing 3hb5X ...
Processing 6cgeA ...
Processing 1orkA ...
Processing 2fj1A ...
Processing 2o

In [4]:
import json
import os
import deeplife_utils
import shutil
# Start from an empty output directory so annotation files from a previous
# run cannot linger.
shutil.rmtree('../data/apo-sequence-annotations/', ignore_errors=True)
os.makedirs('../data/apo-sequence-annotations', exist_ok=True)

with open('../cryptobench/whole_dataset.json', 'r') as json_file:
    dataset = json.load(json_file)

for apo_structure, holo_structures in dataset.items():
    # Without any holo entry there is neither an apo chain nor a pocket to
    # annotate (and indexing [0] below would raise IndexError).
    if not holo_structures:
        continue

    # NOTE(review): the apo chain is taken from the first entry only; this
    # assumes every entry of one apo structure refers to the same chain.
    chain_id = holo_structures[0]['apo_chain']

    # Union of the apo pocket residues over all holo structures. Pocket entries
    # are split on '_' and the second field is used as the residue identifier
    # (e.g. "60" or "60A").
    binding_residues = set()
    for holo_structure in holo_structures:
        apo_pocket = holo_structure['apo_pocket_selection']
        binding_residues.update(residue.split('_')[1] for residue in apo_pocket)

    cif_file_path = rcsb.fetch(apo_structure, "cif", gettempdir())
    cif_file = pdbx.CIFFile.read(cif_file_path)
    auth = get_structure(cif_file, model=1)

    # filter to get correct chain; filter only for peptides
    chain_backbone = auth[
        (auth.chain_id == chain_id) &
        (biotite.structure.filter_peptide_backbone(auth))
    ]

    # Read residue number, insertion code and residue name at the first atom
    # of every residue. Using the structure's own insertion codes avoids
    # guessing the letter by counting repeated residue numbers, which
    # mislabels residues whenever the codes do not run A, B, C, ... and which
    # flagged every residue sharing a number with a binding one.
    residue_starts = biotite.structure.get_residue_starts(chain_backbone)
    res_ids = chain_backbone.res_id[residue_starts]
    ins_codes = chain_backbone.ins_code[residue_starts]
    res_names = chain_backbone.res_name[residue_starts]

    zero_based_binding_residues = []
    sequence_letters = []
    for idx, (auth_seq_id, ins_code, resname) in enumerate(zip(res_ids, ins_codes, res_names)):
        one_letter_aa = deeplife_utils.three_to_one(resname)
        # Identifier in the same form as the pocket selection: "60", "60A", ...
        residue_key = f'{auth_seq_id}{str(ins_code).strip()}'
        if residue_key in binding_residues:
            # idx is the zero-based position of the residue in `sequence`
            zero_based_binding_residues.append(one_letter_aa + str(idx))
        sequence_letters.append(one_letter_aa)
    sequence = "".join(sequence_letters)

    with open(f'../data/apo-sequence-annotations/{apo_structure.lower()}{chain_id.upper()}.txt', 'w') as f:
        f.write(
            f'{apo_structure};{chain_id};UNKNOWN;{" ".join(zero_based_binding_residues)};{sequence}')


In [5]:
import csv
OUTPUT_PATH = '../data/sequences-for-embedder'
# open(..., 'w') does not create missing directories, so make sure the target
# exists before the first sequence file is written.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Copy the bare sequence out of every apo and holo annotation file into a file
# of the same name under OUTPUT_PATH.
for path in ['../data/apo-sequence-annotations/', '../data/holo-sequence-annotations/']:
    for file in os.listdir(path):
        with open(os.path.join(path, file), 'r') as f:
            # Each annotation file holds one ';'-separated record; the
            # field at index 4 is read as the sequence.
            csv_reader = csv.reader(f, delimiter=';')
            sequence = next(csv_reader)[4]
        # NOTE(review): apo and holo files that share a name are written to
        # the same target, the holo one last -- confirm this is intended.
        with open(os.path.join(OUTPUT_PATH, file), 'w') as f:
            f.write(sequence)