In [9]:
from tempfile import gettempdir
from biotite.structure import get_residues, get_chains
from biotite.structure.io.pdbx import get_structure
import biotite.structure.io.pdbx as pdbx
import biotite.database.rcsb as rcsb
import biotite

import shutil
import os

# Start from a clean slate: drop annotation files left over from an earlier run
# (a missing directory is not an error), then recreate the empty directory.
shutil.rmtree(path='../data/apo-sequence-annotations/', ignore_errors=True)
os.makedirs(name='../data/apo-sequence-annotations', exist_ok=True)


# GENERATE APO SEQUENCES WITH ANNOTATIONS
## CAUTION
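The next cell imports `deeplife_utils`, a module from a [separate project](https://github.com/skrhakv/deeplife-project/blob/master/src/deeplife_utils.py). Clone that project and change the `sys.path.append(...)` path in the cell below so that it points to the project's `src` directory on your machine.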

In [14]:
import json
import sys
sys.path.append('/home/vit/Projects/deeplife-project/src')
import deeplife_utils

# Directory passed to rcsb.fetch() as the target for downloaded mmCIF files.
CIF_FILES = '/home/vit/Projects/deeplife-project/data/cif_files'

# The dataset JSON is a mapping that is later iterated as
# apo structure id -> list of holo structure records.
with open('../cryptobench-dataset/dataset.json', 'r') as json_file:
    dataset = json.load(json_file)

# Write one annotation file per single-chain apo structure. Each file holds a
# single line: `<apo id>;<chain>;UNKNOWN;<binding residues>;<sequence>`, where a
# binding residue is its one-letter code followed by its zero-based position in
# the sequence.
for apo_structure, holo_structures in dataset.items():

    # the apo chain is taken from the first holo record only
    chain_id = holo_structures[0]['apo_chain']

    # skip multichain structures
    if '-' in chain_id:
        continue

    # built once so the existence check and the final write cannot drift apart
    annotation_path = f'../data/apo-sequence-annotations/{apo_structure.lower()}{chain_id.upper()}.txt'
    if os.path.isfile(annotation_path):
        continue

    # union of the pocket residues over all holo structures; each selection
    # entry is split on '_' and only its second field (the residue label) is kept
    binding_residues = {
        residue.split('_')[1]
        for holo_structure in holo_structures
        for residue in holo_structure['apo_pocket_selection']
    }

    cif_file_path = rcsb.fetch(apo_structure, "cif", target_path=CIF_FILES)
    cif_file = pdbx.CIFFile.read(cif_file_path)

    # filter to get correct chain; filter only for peptides
    auth = get_structure(cif_file, model=1)
    auth = auth[
        (auth.chain_id == chain_id) &
        (biotite.structure.filter_peptide_backbone(auth))]

    # skip if no residues left
    if len(auth) == 0:
        print(f'No residues left for {apo_structure} {chain_id}')
        continue

    # Index of the first atom of every residue. Residues that share a number but
    # differ in insertion code (60, 60A, 60B, ...) are separate entries here.
    residue_starts = biotite.structure.get_residue_starts(auth)

    zero_based_binding_residues = []
    one_letter_codes = []
    for idx, start in enumerate(residue_starts):
        one_letter_aa = deeplife_utils.three_to_one(auth.res_name[start])

        # Use the insertion code stored in the structure instead of guessing the
        # letter from a running counter: the counter-based guess tested "60A" for
        # a plain residue 60 and let a bare "60" match every 60A/60B/... variant.
        # NOTE(review): assumes dataset labels are `<residue number><insertion code>`
        # (e.g. "60A") -- confirm against the dataset.
        residue_label = f'{auth.res_id[start]}{auth.ins_code[start]}'
        if residue_label in binding_residues:
            zero_based_binding_residues.append(one_letter_aa + str(idx))
        one_letter_codes.append(one_letter_aa)

    sequence = ''.join(one_letter_codes)

    with open(annotation_path, 'w') as f:
        f.write(
            f'{apo_structure};{chain_id};UNKNOWN;{" ".join(zero_based_binding_residues)};{sequence}')


In [16]:
import csv
# Copy the sequence (fifth ';'-separated field) of every annotation file into a
# file of the same name under OUTPUT_PATH.
OUTPUT_PATH = '../data/sequences'
INPUT_PATH = '../data/apo-sequence-annotations/'

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before the first write
os.makedirs(OUTPUT_PATH, exist_ok=True)

for file in os.listdir(INPUT_PATH):
    # os.path.join keeps the paths valid whether or not the constants end in '/'
    with open(os.path.join(INPUT_PATH, file), 'r') as f:
        csv_reader = csv.reader(f, delimiter=';')
        # only the first line of each annotation file is read
        sequence = next(csv_reader)[4]
    with open(os.path.join(OUTPUT_PATH, file), 'w') as f:
        f.write(sequence)

### CREATE TWO TXT FILES
1. `apo_test.txt`: containing the test set
2. `apo_train.txt`: containing the train set

In [18]:
import os
import json

DIR_PATH = '/home/skrhakv/deeplife/deeplife-project/'

# All fold definitions sit in one directory; compose the paths from it.
_FOLDS_DIR = '/home/vit/Projects/cryptic-nn/cryptobench-dataset/folds'

# JSON files describing the test split and the four training folds (0-3).
TEST_PATH = [f'{_FOLDS_DIR}/test.json']
TRAIN_PATH = [f'{_FOLDS_DIR}/train-fold-{fold}.json' for fold in range(4)]

# Directory with the per-structure annotation files, and the directory the
# merged apo_test.txt / apo_train.txt files are written to.
APO_ANNOTATIONS_PATH = '/home/vit/Projects/cryptic-nn/data/apo-sequence-annotations'
OUTPUT_PATH = '/home/vit/Projects/cryptic-nn/data'

def read_cryptobench_subset(subset_paths, annotations_path=None):
    """Collect the annotation lines of every apo structure listed in a subset.

    Args:
        subset_paths: iterable of paths to subset JSON files. Each file maps an
            apo PDB id to a list of holo records; the apo chain is read from the
            first record's 'apo_chain' key.
        annotations_path: directory holding the per-structure annotation files.
            Defaults to the module-level APO_ANNOTATIONS_PATH.

    Returns:
        A list of annotation lines without trailing newlines, in the order the
        structures appear in the subset files. Structures that have no
        annotation file are skipped silently.
    """
    if annotations_path is None:
        annotations_path = APO_ANNOTATIONS_PATH

    apo_subset = []
    for file in subset_paths:
        # load the CryptoBench subset from file
        with open(file, 'r') as json_file:
            dataset = json.load(json_file)

        # read the JSON
        for apo_pdb_id, holo_structures in dataset.items():

            # find and read the apo file
            apo_chain_id = holo_structures[0]['apo_chain']
            # Annotation files are written as <lower-case PDB id><upper-case chain>.txt;
            # normalise the same way here, otherwise entries whose casing differs
            # are silently dropped on a case-sensitive file system.
            apo_filename = os.path.join(
                annotations_path,
                f'{apo_pdb_id.lower()}{apo_chain_id.upper()}.txt')
            if os.path.isfile(apo_filename):
                with open(apo_filename, 'r') as apo_file:
                    # Strip line endings and skip blank lines so that callers
                    # joining the result with '\n' do not get empty lines.
                    apo_subset.extend(
                        line.rstrip('\n') for line in apo_file if line.strip())
    return apo_subset

# Gather the annotation lines of the training folds and of the test split.
train_apo = read_cryptobench_subset(TRAIN_PATH)
test_apo = read_cryptobench_subset(TEST_PATH)

# Write each subset to a single file, one annotation per line
# (test file first, then train file).
for merged_name, merged_lines in (('apo_test.txt', test_apo),
                                  ('apo_train.txt', train_apo)):
    with open(f'{OUTPUT_PATH}/{merged_name}', 'w') as merged_file:
        merged_file.write('\n'.join(merged_lines))
