In [None]:
from biotite.structure import get_residues
from biotite.structure.io.pdbx import get_structure
import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb
import biotite

import shutil
import os

# Name of the dataset being processed; it is interpolated into the data/ and
# datasets/ paths built in the cells below.
DATASET = 'cryptobench'
# Disabled reset of the sequence-annotations directory. Note that while these
# lines stay commented out, nothing in this cell creates that directory.
# shutil.rmtree(f'../../data/{DATASET}/sequence-annotations/', ignore_errors=True)
# os.makedirs(f'../../data/{DATASET}/sequence-annotations', exist_ok=True)


# GENERATE SEQUENCES WITH ANNOTATIONS
## CAUTION
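This notebook imports `deeplife_utils`, a module from a [separate project](https://github.com/skrhakv/deeplife-project/blob/master/src/deeplife_utils.py). Download it and update the `sys.path.append(...)` path in the cell below so that it points to the directory containing that module.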

In [None]:
import json
import sys
# Make the external `deeplife_utils` module importable. This is a
# machine-specific absolute path and must be adjusted on other machines.
sys.path.append('/home/vit/Projects/deeplife-project/src')
import deeplife_utils

# Directory passed to `rcsb.fetch` as `target_path` for the downloaded CIF files
# (machine-specific absolute path).
CIF_FILES = '/home/vit/Projects/deeplife-project/data/cif_files'

def extract_annotations(dataset_path, output_path):
    """Write one sequence-annotation file per single-chain apo structure.

    The JSON file at ``dataset_path`` maps an apo structure ID to a list of
    holo-structure records. For every apo structure the function:

    1. takes the apo chain from the first record (``apo_chain``);
    2. collects the residue identifiers listed in ``apo_pocket_selection``
       of all records (the second ``_``-separated field of each entry);
    3. fetches the apo CIF file via ``rcsb.fetch`` into ``CIF_FILES`` and keeps
       the atoms of the apo chain selected by ``filter_peptide_backbone``;
    4. writes ``{output_path}/{id.lower()}{chain.upper()}.txt`` containing the
       single line ``id;chain;UNKNOWN;<binding residues>;<sequence>``, where
       each binding residue is the one-letter code followed by its zero-based
       position in the sequence.

    Entries are skipped when the chain ID contains ``-`` (multichain), when
    the output file already exists, or when no atoms are left after filtering.

    Args:
        dataset_path: Path to the dataset JSON file.
        output_path: Directory receiving the annotation files; it is created
            if it does not exist yet.
    """
    with open(dataset_path, 'r') as json_file:
        dataset = json.load(json_file)

    # Nothing else creates the output directory, so without this the write at
    # the end of the loop fails only after the structure has been downloaded.
    os.makedirs(output_path, exist_ok=True)

    for apo_structure, holo_structures in dataset.items():

        binding_residues = set()
        chain_id = holo_structures[0]['apo_chain']

        # skip multichain structures
        if '-' in chain_id:
            continue

        # skip structures that were already processed in a previous run
        annotation_path = f'{output_path}/{apo_structure.lower()}{chain_id.upper()}.txt'
        if os.path.isfile(annotation_path):
            continue

        for holo_structure in holo_structures:
            # keep only the part after the first '_' separator of each entry
            # (presumably '<chain>_<residue id>' - verify against the dataset)
            binding_residues.update(
                residue.split('_')[1]
                for residue in holo_structure['apo_pocket_selection'])

        cif_file_path = rcsb.fetch(apo_structure, "cif", target_path=CIF_FILES)
        cif_file = pdbx.CIFFile.read(cif_file_path)

        # filter to get the correct chain; filter only for peptides
        auth = get_structure(cif_file, model=1)
        auth = auth[
            (auth.chain_id == chain_id) &
            (biotite.structure.filter_peptide_backbone(auth))]

        # skip if no residues left
        if len(auth) == 0:
            print(f'No residues left for {apo_structure} {chain_id}')
            continue

        auth_residues_only = get_residues(auth)

        zero_based_binding_residues = []
        sequence_letters = []
        # to handle cases where residue indices are named like this: 60A, 60B, 60C, ...
        # The n-th consecutive residue sharing a numeric ID is additionally
        # matched against that ID suffixed with the n-th letter ('A' first).
        # NOTE(review): a bare numeric ID in binding_residues marks every
        # residue of such a run - confirm this is intended.
        previous_seq_id = float('-inf')
        letter_counter = 0
        for idx, (auth_seq_id, resname) in enumerate(zip(auth_residues_only[0], auth_residues_only[1])):
            if previous_seq_id == auth_seq_id:
                letter_counter += 1
            elif letter_counter > 0:
                letter_counter = 0
            one_letter_aa = deeplife_utils.three_to_one(resname)
            if str(auth_seq_id) in binding_residues or (str(auth_seq_id) + chr(ord('A') + letter_counter)) in binding_residues:
                zero_based_binding_residues.append(one_letter_aa + str(idx))
            sequence_letters.append(one_letter_aa)
            previous_seq_id = auth_seq_id
        sequence = ''.join(sequence_letters)

        with open(annotation_path, 'w') as f:
            f.write(
                f'{apo_structure};{chain_id};UNKNOWN;{" ".join(zero_based_binding_residues)};{sequence}')

# Dataset JSON to read and directory to write the annotation files to for
# the dataset named by DATASET.
INPUT_PATH = f'../../datasets/{DATASET}-dataset/dataset.json'
OUTPUT_PATH = f'../../data/{DATASET}/sequence-annotations'
# The extraction call is disabled; uncomment to (re)generate the annotations.
# extract_annotations(INPUT_PATH, OUTPUT_PATH)

In [None]:
import csv

# Spell both directories out instead of rebinding INPUT_PATH = OUTPUT_PATH:
# with the rebinding, running this cell a second time pointed INPUT_PATH at
# the sequences directory, which is wiped right below.
INPUT_PATH = f'../../data/{DATASET}/sequence-annotations'
OUTPUT_PATH = f'../../data/{DATASET}/sequences'

# start from an empty output directory
shutil.rmtree(f'{OUTPUT_PATH}/', ignore_errors=True)
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Each annotation file holds one ';'-separated record; the fifth field is the
# amino-acid sequence, which is copied into a file of the same name.
for file in os.listdir(INPUT_PATH):
    # join with a path separator (the original concatenated directory and
    # file name directly, producing a non-existent path)
    with open(os.path.join(INPUT_PATH, file), 'r') as f:
        csv_reader = csv.reader(f, delimiter=';')
        sequence = next(csv_reader)[4]
    with open(os.path.join(OUTPUT_PATH, file), 'w') as f:
        f.write(sequence)

### CREATE THREE TXT FILES
1. `test.txt`: containing the test set
2. `train.txt`: containing the train set
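3. `val.txt`: containing the validation set

**Note:** the cell below currently writes only `train.txt` and `val.txt` (the latter is built from `TEST_PATH`); no `test.txt` is produced.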

In [2]:
import os
import json

# NOTE(review): DIR_PATH is not referenced by any of the visible cells.
DIR_PATH = '/home/skrhakv/deeplife/deeplife-project/'

# Fold files (machine-specific absolute paths). Fold 3 is held out and the
# remaining three folds are used for training.
TEST_PATH = [f'/home/vit/Projects/cryptic-nn/datasets/{DATASET}-dataset/folds/train-fold-3.json']
TRAIN_PATH = [f'/home/vit/Projects/cryptic-nn/datasets/{DATASET}-dataset/folds/train-fold-0.json',
              f'/home/vit/Projects/cryptic-nn/datasets/{DATASET}-dataset/folds/train-fold-1.json',
              f'/home/vit/Projects/cryptic-nn/datasets/{DATASET}-dataset/folds/train-fold-2.json'
              ]
# Directory with the per-chain annotation files read by read_cryptobench_subset.
APO_ANNOTATIONS_PATH = f'/home/vit/Projects/cryptic-nn/data/{DATASET}/sequence-annotations'
# Directory that receives the merged train/val annotation files.
OUTPUT_PATH = f'/home/vit/Projects/cryptic-nn/data/{DATASET}-with-validation-subset'

def read_cryptobench_subset(subset_paths, annotations_path=None):
    """Collect the annotation lines of every apo structure in the given subsets.

    Each file in ``subset_paths`` is a JSON mapping of apo structure ID to a
    list of holo-structure records; the apo chain is read from the first
    record (``apo_chain``). The matching annotation file is looked up first
    under the normalised name ``{id.lower()}{chain.upper()}.txt`` (the name
    used when the annotation files are written) and then under the raw
    ``{id}{chain}.txt`` name. Structures without an annotation file are
    skipped silently.

    Args:
        subset_paths: Iterable of paths to subset JSON files.
        annotations_path: Directory with the annotation files. Defaults to
            the module-level ``APO_ANNOTATIONS_PATH``.

    Returns:
        List of annotation lines without line terminators (blank lines are
        dropped), ready to be joined with a newline.
    """
    if annotations_path is None:
        annotations_path = APO_ANNOTATIONS_PATH

    apo_subset = []
    for file in subset_paths:
        # load the CryptoBench subset from file
        with open(file, 'r') as json_file:
            dataset = json.load(json_file)

        # read the JSON
        for apo_pdb_id, holo_structures in dataset.items():

            # find the apo file; prefer the casing used by the writer and
            # fall back to the raw identifiers for backward compatibility
            apo_chain_id = holo_structures[0]['apo_chain']
            candidates = (
                f'{annotations_path}/{apo_pdb_id.lower()}{apo_chain_id.upper()}.txt',
                f'{annotations_path}/{apo_pdb_id}{apo_chain_id}.txt',
            )
            apo_filename = next(
                (path for path in candidates if os.path.isfile(path)), None)
            if apo_filename is None:
                continue

            # strip terminators so that a later '\n'.join() does not emit
            # empty lines when a file ends with a newline
            with open(apo_filename, 'r') as apo_file:
                apo_subset.extend(
                    line.rstrip('\n') for line in apo_file if line.strip())
    return apo_subset

# extract the annotations
train_apo = read_cryptobench_subset(TRAIN_PATH)
test_apo = read_cryptobench_subset(TEST_PATH)

# the output directory is not created anywhere else in the notebook
os.makedirs(OUTPUT_PATH, exist_ok=True)

# merge the annotations into one file per subset; the held-out fold
# (TEST_PATH) is written to val.txt
with open(f'{OUTPUT_PATH}/val.txt', 'w') as apo_val_file:
    apo_val_file.write('\n'.join(test_apo))
with open(f'{OUTPUT_PATH}/train.txt', 'w') as apo_train_file:
    apo_train_file.write('\n'.join(train_apo))


## Extract non-cryptic binding annotation
Some structures from the CryptoBench dataset also contain binding sites that don't exhibit any flexibility. It might be a good idea to exclude those during training/evaluation. Let's explore this first by extracting that information and mapping it onto the sequence level.

In [5]:
# Rebinds the notebook-wide DATASET, INPUT_PATH and OUTPUT_PATH for the
# non-cryptic pockets; cells executed after this one see the new values.
DATASET = 'non-cryptic-cryptobench'
INPUT_PATH = f'../../datasets/cryptobench-dataset/auxiliary-data/non-cryptic-pockets/noncryptic-pockets.json'
OUTPUT_PATH = f'../../data/{DATASET}/sequence-annotations'
# Write one annotation file per apo structure listed in the non-cryptic JSON.
extract_annotations(INPUT_PATH, OUTPUT_PATH)

In [None]:
# Point read_cryptobench_subset (which reads this global) at the directory
# holding the non-cryptic annotation files.
APO_ANNOTATIONS_PATH = OUTPUT_PATH

# read the annotation lines of every structure listed in the non-cryptic JSON
noncryptic_annotations = read_cryptobench_subset([INPUT_PATH])

# merge the annotations into a single file
with open(f'../../data/{DATASET}/noncryptic-annotations.txt', 'w') as f:
    f.write('\n'.join(noncryptic_annotations))


In [2]:
import os

# Directory with the per-chain sequence files (machine-specific absolute path).
PATH = '/home/vit/Projects/cryptic-nn/data/cryptobench/sequences'

# Print the first four characters of each file's base name, i.e. the part of
# the name before the first '.' truncated to four characters.
for file in os.listdir(PATH):
    print(file.partition('.')[0][:4])

1a4u
1a8d
1ad1
1ak1
1arl
1ayl
1b0i
1bfn
1bhs
1bk2
1byi
1bzj
1c3k
1cuz
1cwq
1dc6
1dkl
1dpj
1dq2
1dqz
1dte
1e3g
1e5l
1e6k
1ecc
1efh
1esw
1evy
1ezl
1f47
1f8a
1fd9
1fdp
1ffh
1fl1
1fvr
1fwk
1g24
1g59
1gqn
1gqz
1h13
1h3g
1hav
1hbq
1hp1
1ht6
1i0r
1i7n
1iwl
1j8f
1jpm
1k0n
1k1x
1k47
1k4k
1k7k
1kg5
1kn9
1ks9
1ksg
1kx9
1kxr
1l0w
1lbe
1lju
1lug
1m1z
1m5w
1mac
1mhn
1ms4
1muf
1mwk
1mwr
1n05
1naw
1nbf
1nd7
1nko
1nn6
1nok
1nul
1nwh
1nzo
1o73
1of3
1oib
1omx
1os2
1p4o
1p4v
1p74
1p9o
1pfz
1pt7
1pta
1pu5
1px5
1py3
1q4k
1qht
1qrz
1r8j
1rf5
1rjb
1rkm
1rq2
1rtc
1rxd
1s2l
1s8c
1se8
1sh0
1sjs
1snd
1sul
1t8t
1t9r
1thv
1tpl
1tqd
1tqn
1u4p
1uiu
1uka
1un1
1urp
1ute
1vju
1vk4
1vr2
1vsn
1wam
1wjg
1wxe
1wyc
1x0m
1x2g
1xgd
1xhx
1xjf
1xqv
1xqz
1xt3
1xtc
1y6i
1yhv
1yl5
1ys0
1z7g
1z90
1zm0
2a88
2air
2aka
2akr
2b0j
2b23
2b7c
2bei
2biv
2bva
2by3
2c3v
2c6g
2cam
2ci4
2ckq
2cl3
2d05
2d0t
2dfp
2dh4
2e1c
2e2o
2e3k
2e3m
2e64
2e8y
2epl
2exx
2fem
2ffy
2fhz
2fim
2fjy
2fk7
2fn9
2fp1
2frs
2fyu
2g5l
2gcb
2gme
2gzr
2h6d
2h7g
2h7s
2h98
