In [15]:
from tempfile import gettempdir
from biotite.structure import get_residues, get_chains
from biotite.structure.io.pdbx import get_structure
import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb
import biotite

# Exploration cell: download entry 1TA6 and look at the B-factors and residue
# ids of the peptide-backbone atoms of chain A.
cif_file_path = rcsb.fetch("1TA6", "cif", gettempdir())
cif_file = pdbx.CIFFile.read(cif_file_path)

# b_factor is not loaded by default, so it is requested explicitly
whole_auth = get_structure(cif_file, model=1, extra_fields=["b_factor"])

# boolean atom mask: chain A AND part of the peptide backbone
chain_a_backbone_mask = (
    (whole_auth.chain_id == "A")
    & biotite.structure.filter_peptide_backbone(whole_auth)
)
chain_residues = whole_auth[chain_a_backbone_mask]

# number of selected atoms
print(len(chain_residues.b_factor))

# one line per selected atom: B-factor followed by the residue id
for atom in chain_residues:
    print(atom.b_factor, atom.res_id)


### GENERATE HOLO SEQUENCES WITH ANNOTATIONS

In [None]:
import json
import os
import deeplife_utils
import shutil
# Generate one annotation file per accepted holo structure. Each file holds a
# single line in the format
#   <pdb_id>;<chain_id>;UNKNOWN;<binding residues>;<sequence>
# where every binding residue is written as <one-letter code><zero-based index
# into the sequence>.

# Largest fraction of an apo pocket that may already be covered by previously
# accepted pockets of the same apo structure; above it the holo entry is skipped.
MAX_POCKET_OVERLAP = 0.75

# start from an empty output directory so files of an earlier run cannot linger
shutil.rmtree('../data/holo-sequence-annotations/', ignore_errors=True)
os.makedirs('../data/holo-sequence-annotations', exist_ok=True)

with open('../cryptobench/whole_dataset.json', 'r') as json_file:
    dataset = json.load(json_file)

for apo_structure, holo_structures in dataset.items():
    # residue labels of all apo pockets accepted so far for this apo structure
    apo_pockets = set()
    for holo_structure in holo_structures:

        pdb_id = holo_structure['holo_pdb_id']
        chain_id = holo_structure['holo_chain']
        apo_pocket = holo_structure['apo_pocket_selection']
        pocket = holo_structure['holo_pocket_selection']
        print(f'Processing {pdb_id}{chain_id} ...')

        # if the pocket is too similar to other pockets then don't worry about it
        new_apo_residues = [
            residue.split('_')[1]
            for residue in apo_pocket
            if residue.split('_')[1] not in apo_pockets
        ]
        # probably a homomer or the pocket is just too similar to others.
        # An empty apo pocket has no defined overlap; it is treated as "not
        # redundant" instead of raising ZeroDivisionError.
        if apo_pocket:
            already_seen_fraction = (
                len(apo_pocket) - len(new_apo_residues)) / len(apo_pocket)
            if already_seen_fraction > MAX_POCKET_OVERLAP:
                continue
        apo_pockets.update(new_apo_residues)

        # residue labels of the holo pocket (the part after the first '_' of
        # every selection entry) -- presumably auth_seq_id plus an optional
        # insertion code such as "60A"; verify against the dataset
        binding_residues = {residue.split('_')[1] for residue in pocket}

        cif_file_path = rcsb.fetch(pdb_id, "cif", gettempdir())
        cif_file = pdbx.CIFFile.read(cif_file_path)
        auth = get_structure(cif_file, model=1)

        # filter to get correct chain; filter only for peptides
        chain_backbone = auth[
            (auth.chain_id == chain_id) &
            (biotite.structure.filter_peptide_backbone(auth))
        ]
        # index of the first atom of every residue of the chain
        residue_starts = biotite.structure.get_residue_starts(chain_backbone)

        zero_based_binding_residues = []
        one_letter_codes = []
        for idx, start in enumerate(residue_starts):
            one_letter_aa = deeplife_utils.three_to_one(
                chain_backbone.res_name[start])
            # Residues such as 60, 60A, 60B share one res_id and differ only in
            # their insertion code. Build the label from the insertion code
            # stored in the structure instead of guessing the letter from the
            # number of repeated ids (which shifts the letters by one whenever
            # the plain "60" exists and breaks for non-alphabetical codes).
            residue_label = (
                f'{chain_backbone.res_id[start]}{chain_backbone.ins_code[start]}')
            if residue_label in binding_residues:
                zero_based_binding_residues.append(one_letter_aa + str(idx))
            one_letter_codes.append(one_letter_aa)
        sequence = ''.join(one_letter_codes)

        with open(f'../data/holo-sequence-annotations/{pdb_id.lower()}{chain_id.upper()}.txt', 'w') as f:
            f.write(
                f'{pdb_id};{chain_id};UNKNOWN;{" ".join(zero_based_binding_residues)};{sequence}')


### GENERATE APO SEQUENCES WITH ANNOTATIONS

In [None]:
import json
import os
import deeplife_utils
import shutil
# Generate one annotation file per apo structure. Each file holds a single
# line in the format
#   <pdb_id>;<chain_id>;UNKNOWN;<binding residues>;<sequence>
# where every binding residue is written as <one-letter code><zero-based index
# into the sequence>. The binding residues are the union of the apo pockets of
# all holo entries listed for the apo structure.

# start from an empty output directory so files of an earlier run cannot linger
shutil.rmtree('../data/apo-sequence-annotations/', ignore_errors=True)
os.makedirs('../data/apo-sequence-annotations', exist_ok=True)

with open('../cryptobench/whole_dataset.json', 'r') as json_file:
    dataset = json.load(json_file)

for apo_structure, holo_structures in dataset.items():
    if not holo_structures:
        # without a holo entry there is neither an apo chain nor a pocket to
        # annotate; skip instead of failing on holo_structures[0]
        continue

    # the apo chain is taken from the first holo entry
    chain_id = holo_structures[0]['apo_chain']

    # residue labels (the part after the first '_' of every selection entry)
    # of all apo pockets -- presumably auth_seq_id plus an optional insertion
    # code such as "60A"; verify against the dataset
    binding_residues = set()
    for holo_structure in holo_structures:
        binding_residues.update(
            residue.split('_')[1]
            for residue in holo_structure['apo_pocket_selection'])

    cif_file_path = rcsb.fetch(apo_structure, "cif", gettempdir())
    cif_file = pdbx.CIFFile.read(cif_file_path)
    auth = get_structure(cif_file, model=1)

    # filter to get correct chain; filter only for peptides
    chain_backbone = auth[
        (auth.chain_id == chain_id) &
        (biotite.structure.filter_peptide_backbone(auth))
    ]
    # index of the first atom of every residue of the chain
    residue_starts = biotite.structure.get_residue_starts(chain_backbone)

    zero_based_binding_residues = []
    one_letter_codes = []
    for idx, start in enumerate(residue_starts):
        one_letter_aa = deeplife_utils.three_to_one(
            chain_backbone.res_name[start])
        # Residues such as 60, 60A, 60B share one res_id and differ only in
        # their insertion code. Build the label from the insertion code stored
        # in the structure instead of guessing the letter from the number of
        # repeated ids (which shifts the letters by one whenever the plain
        # "60" exists and breaks for non-alphabetical codes).
        residue_label = (
            f'{chain_backbone.res_id[start]}{chain_backbone.ins_code[start]}')
        if residue_label in binding_residues:
            zero_based_binding_residues.append(one_letter_aa + str(idx))
        one_letter_codes.append(one_letter_aa)
    sequence = ''.join(one_letter_codes)

    with open(f'../data/apo-sequence-annotations/{apo_structure.lower()}{chain_id.upper()}.txt', 'w') as f:
        f.write(
            f'{apo_structure};{chain_id};UNKNOWN;{" ".join(zero_based_binding_residues)};{sequence}')


In [None]:
import csv
# Copy the bare amino-acid sequence (5th ';'-separated field) of every apo and
# holo annotation file into its own file of the same name for the embedder.
OUTPUT_PATH = '../data/sequences-for-embedder'
# None of the visible cells creates this directory; without it the first
# open(..., 'w') below fails with FileNotFoundError on a fresh checkout.
os.makedirs(OUTPUT_PATH, exist_ok=True)
for path in ['../data/apo-sequence-annotations/', '../data/holo-sequence-annotations/']:
    for file in os.listdir(path):
        with open(f'{path}{file}', 'r') as f:
            csv_reader = csv.reader(f, delimiter=';')
            # every annotation file consists of a single record
            sequence = next(csv_reader)[4]
        with open(f'{OUTPUT_PATH}/{file}', 'w') as f:
            f.write(sequence)

### CREATE FOUR TXT FILES
1. `holo_train.txt`: containing the holo training set
2. `holo_test.txt`: containing the holo test set
3. `apo_test.txt`: containing the apo test set
4. `apo_train.txt`: containing the apo training set

In [3]:
import os
import json

# Root of the project checkout; it ends with '/' so the sub-paths below can be
# appended to it directly.
DIR_PATH = '/home/skrhakv/deeplife/deeplife-project/'

# CryptoBench subset directories (each one is scanned for JSON files)
TEST_PATH = f'{DIR_PATH}cryptobench/test'
TRAIN_PATH = f'{DIR_PATH}cryptobench/train'
# directories holding the per-structure annotation files
APO_ANNOTATIONS_PATH = f'{DIR_PATH}data/apo-sequence-annotations'
HOLO_ANNOTATIONS_PATH = f'{DIR_PATH}data/holo-sequence-annotations'
# directory that receives the merged train/test files
OUTPUT_PATH = f'{DIR_PATH}data'

def read_cryptobench_subset(subset_path, apo_annotations_path=None, holo_annotations_path=None):
    """Collect the annotation lines of all structures listed in a CryptoBench subset.

    Every file in ``subset_path`` is parsed as JSON mapping an apo PDB id to a
    list of holo entries. For each apo structure and each holo entry the
    matching annotation file ``<pdb id, lower-case><chain id, upper-case>.txt``
    is read if it exists; missing annotation files are skipped silently.

    Args:
        subset_path: directory with the JSON files of the subset.
        apo_annotations_path: directory with the apo annotation files;
            defaults to the module-level ``APO_ANNOTATIONS_PATH``.
        holo_annotations_path: directory with the holo annotation files;
            defaults to the module-level ``HOLO_ANNOTATIONS_PATH``.

    Returns:
        Tuple ``(apo_subset, holo_subset)`` of lists with the lines read from
        the apo and holo annotation files, respectively.
    """
    if apo_annotations_path is None:
        apo_annotations_path = APO_ANNOTATIONS_PATH
    if holo_annotations_path is None:
        holo_annotations_path = HOLO_ANNOTATIONS_PATH

    apo_subset = []
    holo_subset = []
    for file in os.listdir(subset_path):
        # load the CryptoBench subset from file
        with open(f'{subset_path}/{file}', 'r') as json_file:
            dataset = json.load(json_file)

        # read the JSON
        for apo_pdb_id, holo_structures in dataset.items():
            if not holo_structures:
                # no holo entry -> no apo chain to look up
                continue

            # find and read the apo file; the generating cells name the files
            # with a lower-case PDB id and an upper-case chain id, so the
            # lookup has to normalise the ids the same way
            apo_chain_id = holo_structures[0]['apo_chain']
            apo_filename = (
                f'{apo_annotations_path}/{apo_pdb_id.lower()}{apo_chain_id.upper()}.txt')
            if os.path.isfile(apo_filename):
                with open(apo_filename, 'r') as apo_file:
                    apo_subset.extend(apo_file.readlines())

            # find and read the holo file
            for holo_structure in holo_structures:
                holo_pdb_id = holo_structure['holo_pdb_id']
                holo_chain_id = holo_structure['holo_chain']
                holo_filename = (
                    f'{holo_annotations_path}/{holo_pdb_id.lower()}{holo_chain_id.upper()}.txt')
                if os.path.isfile(holo_filename):
                    with open(holo_filename, 'r') as holo_file:
                        holo_subset.extend(holo_file.readlines())

    return apo_subset, holo_subset

# Gather the annotation lines of the train and the test subset.
train_apo, train_holo = read_cryptobench_subset(TRAIN_PATH)
test_apo, test_holo = read_cryptobench_subset(TEST_PATH)

# Write every collection into its own file below OUTPUT_PATH, one annotation
# per line.
merged_outputs = (
    ('holo_train.txt', train_holo),
    ('holo_test.txt', test_holo),
    ('apo_test.txt', test_apo),
    ('apo_train.txt', train_apo),
)
for merged_name, annotation_lines in merged_outputs:
    with open(f'{OUTPUT_PATH}/{merged_name}', 'w') as merged_file:
        merged_file.write('\n'.join(annotation_lines))
