In [1]:
import json
import biotite.structure.io.pdbx as pdbx
import biotite.structure.io.pdb as pdb
import biotite.structure as struc
from biotite.sequence import ProteinSequence
import os
import numpy as np

# Name of the dataset whose fold files are processed in this notebook.
DATASET = 'rigid-dataset'
# Directory holding the fold JSON files; relative, so it is resolved against the
# current working directory of the kernel.
DATASET_PATH = f'../../../datasets/{DATASET}/folds'

def load_apo_structures(path):
    """Collect pocket residue numbers per apo structure from a fold JSON file.

    The JSON file maps an apo structure ID to a list of holo-structure records.
    Each record is read for its 'apo_chain' and 'apo_pocket_selection' keys.
    Records whose 'apo_chain' contains '-' (multichain structures) are skipped.

    Args:
        path: Path to the fold JSON file.

    Returns:
        dict mapping ``apo_id + apo_chain`` to a set of integer residue numbers.
        Residue numbers of all records sharing the same key are merged.

    Raises:
        ValueError: if a pocket residue entry contains no digits after the
            first underscore.
    """
    with open(path) as f:
        dataset = json.load(f)

    ids = {}
    for apo_id, holo_structures in dataset.items():
        for holo_structure in holo_structures:
            # skip multichain structures
            if '-' in holo_structure['apo_chain']:
                continue
            structure_id = apo_id + holo_structure['apo_chain']
            # Entries look like '<chain>_<residue>'; keep only the digits of the
            # residue part (drops e.g. insertion-code letters).
            residue_numbers = {
                int(''.join(filter(str.isdigit, residue.split('_')[1])))
                for residue in holo_structure['apo_pocket_selection']
            }
            # Parse every occurrence the same way, so a repeated ID extends the
            # set with integers instead of the raw selection strings.
            ids.setdefault(structure_id, set()).update(residue_numbers)
    return ids

# Gather the structure IDs (and their pocket residues) of all four training folds.
# If the same ID occurs in several folds, the entry from the later fold wins.
ids = {}
for fold in (f'train-fold-{i}.json' for i in range(4)):
    subset_ids = load_apo_structures(f'{DATASET_PATH}/{fold}')
    ids.update(subset_ids)

# Directory with the source mmCIF files, one '<pdb id>.cif' per entry.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
CIF_FILES_PATH = '/home/vit/Projects/deeplife-project/data/cif_files'
# Directory (relative to the working directory) that receives the filtered PDB files.
OUTPUT_PATH = 'modified-pdbs'
# Location of the fluctuation features for DATASET.
# NOTE(review): not referenced in the visible part of this notebook.
FLUCTUATION_INPUT_PATH = f'../../../data/features/fluctuation/{DATASET}/fluctuation'
# Structure IDs (4-character entry ID + chain ID) that are excluded from processing.
# NOTE(review): the reason for each exclusion is not recorded here — presumably
# structures that failed earlier; confirm with the author.
SKIPPED_IDS = ['5x6zD', '4ok2B', '5h61B', '1fl1B', '8hynA', '1sh0A', '2yx7A', '6lgyA', '4gu8H', '8j1kA', '1xhxB', '1pfzC', '4omgA', '1g24C', '6j35B', '6ialF', '4j2pA', '4x1oA', '5ytbC',
               '1havA', '4ekfA', '8iy0B', '3l28F', '2xc1A', '7wyoB', '3cbjA', '1of3B', '7gosB', '7poqA', '6spoA', '2wetA', '6tyoB', '7d48A', '2zf3F', '1mwkA', '5mf2D', '7oapEEE', '8dufA',
               '4pvrA', '4gf1B', '4u0mB', '1xt3B', '6ro0G', '3sebA', '3rmyD', '6twoAAA', '1nokA', '7oumC']


# Residue names that are kept in the output: the 20 standard amino acids.
# Anything else (e.g. MSE, which caused errors) is dropped.
STANDARD_RESIDUE_NAMES = (
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL',
)

# Create the output directory up front so that neither the "already done" check
# nor the final write fails on a fresh checkout.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# For every apo structure: read its mmCIF file, keep the standard amino-acid
# residues of the requested chain (all atoms) and write them as '<id>.pdb'.
# `structure_id` is used instead of `id` so the builtin is not shadowed for the
# rest of the kernel session.
for structure_id in ids.keys():
    if structure_id in SKIPPED_IDS:
        continue
    output_file = os.path.join(OUTPUT_PATH, f"{structure_id}.pdb")
    # Already converted in a previous run; a direct existence check avoids
    # re-listing the whole output directory on every iteration.
    if os.path.exists(output_file):
        continue
    print(f'Processing {structure_id} ...')
    # read file: the first four characters are the entry ID, the rest is the chain ID
    mmcif_filename = f'{structure_id[:4]}.cif'
    mmcif_file = pdbx.CIFFile.read(f'{CIF_FILES_PATH}/{mmcif_filename}')
    chain_id = structure_id[4:]

    # load file to biotite object
    whole_structure = pdbx.get_structure(mmcif_file, model=1, include_bonds=True)
    protein = whole_structure[struc.filter_amino_acids(whole_structure)]

    # some errors with MSE residue
    # WARNING: Here I am keeping all the atoms (i.e. not keeping only C-alpha atoms)
    filtered_protein_structure = protein[
        (protein.chain_id == chain_id)
        & np.isin(protein.res_name, STANDARD_RESIDUE_NAMES)
    ]

    # Shorten chain_id to one character if it is longer than one character
    # (the PDB format has a single-character chain ID field)
    if len(chain_id) > 1:
        filtered_protein_structure.set_annotation(
            'chain_id',
            np.array([chain_id[0]] * len(filtered_protein_structure)),
        )

    modified_pdb_file = pdb.PDBFile()
    modified_pdb_file.set_structure(filtered_protein_structure)
    modified_pdb_file.write(output_file)


Processing 7o0aB ...
Processing 5klcA ...
Processing 1ci8B ...
Processing 4dwqA ...
Processing 2y8nA ...
Processing 6n2hA ...
Processing 6zu2CCC ...
Processing 4q71B ...
Processing 6zu3A ...
Processing 4txhD ...
Processing 1hn0A ...
Processing 6pkhA ...
Processing 1j0hB ...
Processing 5z0uA ...
Processing 1mwoA ...
Processing 3wdqA ...
Processing 7atrA ...
Processing 4dm1A ...
Processing 1tvnA ...
Processing 3b9eA ...
Processing 5e3pA ...
Processing 8vr6A ...
Processing 6etzA ...
Processing 2de6B ...
Processing 1slbB ...
Processing 6q4xA ...
Processing 7zajAAA ...
Processing 5kdnA ...
Processing 3donA ...
Processing 3jyoA ...
Processing 6k0mA ...
Processing 4ckqA ...
Processing 6ir4A ...
Processing 1iiwA ...
Processing 4ppvA ...
Processing 7xgvA ...
Processing 3ex9A ...
Processing 4crqA ...
Processing 7ofeA ...
Processing 3gbtA ...
Processing 2o70D ...
Processing 3gd0A ...
Processing 7vtkA ...
Processing 5jbdA ...
Processing 5u8eA ...
Processing 2vnsA ...
Processing 4piqA ...
Processin

## Data for CryptoBench was copied from another project
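Check that the copied frustration data are consistent with this project's fluctuation data: for each protein, the number of residue rows in the frustration file must equal the length of the fluctuation array. Inconsistent entries are moved to a separate directory.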

In [None]:
import csv, os
import numpy as np
import shutil

# Consistency check between the copied frustration data and the fluctuation
# features: for every entry the number of data rows in the frustration file must
# equal the length of the fluctuation array. Inconsistent entries are moved out
# of `path` into `inconsistency_path`.

# run for both modes: 'mutational' and 'configurational'
mode = 'mutational'
path = f'/home/vit/Projects/flexibility-analysis/data/features/frustration/cryptobench-dataset/{mode}'
inconsistency_path = f'/home/vit/Projects/flexibility-analysis/data/features/frustration/cryptobench-dataset/inconsistent-{mode}'
fluctuation_path = '/home/vit/Projects/flexibility-analysis/data/features/fluctuation/cryptobench-dataset/fluctuation'

# Ensure the destination directory exists before anything is moved into it.
os.makedirs(inconsistency_path, exist_ok=True)

for directory in os.listdir(path):
    print(f'Processing {directory} ...')
    # Directory names look like '<protein id>_<chain>.done'; the fluctuation
    # file is named after the part before the underscore.
    protein_id = directory.split('.')[0]
    sequence_length = np.load(f'{fluctuation_path}/{protein_id.split("_")[0]}.npy').shape[0]

    frustration_file = f'{path}/{directory}/FrustrationData/{protein_id}.pdb_{mode}_5adens'
    if not os.path.exists(frustration_file):
        print(f'Frustration data not found for {protein_id}')
        continue

    with open(frustration_file, newline='') as f:
        reader = csv.reader(f, delimiter=' ')
        # skip header (the default keeps an empty file from raising StopIteration)
        next(reader, None)
        residue_count = 0
        for row in reader:
            # csv.reader yields an empty list for blank lines; those are not residues
            if row:
                residue_count += 1
    if sequence_length != residue_count:
        print(f'Data is inconsistent for {directory}')
        shutil.move(f'{path}/{directory}', f'{inconsistency_path}/{directory}')

# 39 / 773 seem to be inconsistent for the CryptoBench dataset -> need to be omitted during analysis

Processing 1a4uB_B.done ...
Processing 1a8dA_A.done ...
Processing 1ad1A_A.done ...
Processing 1ak1A_A.done ...
Processing 1aylA_A.done ...
Frustration data not found for 1aylA_A
Processing 1b0iA_A.done ...
Processing 1bfnA_A.done ...
Processing 1bhsA_A.done ...
Processing 1byiA_A.done ...
Processing 1c3kA_A.done ...
Processing 1cuzA_A.done ...
Processing 1dc6A_A.done ...
Data is inconsistent for 1dc6A_A.done
Processing 1dklA_A.done ...
Processing 1dpjA_A.done ...
Data is inconsistent for 1dpjA_A.done
Processing 1dqzA_A.done ...
Processing 1dteA_A.done ...
Processing 1e3gA_A.done ...
Processing 1e5lB_B.done ...
Processing 1eccB_B.done ...
Processing 1efhB_B.done ...
Processing 1eswA_A.done ...
Processing 1ezlC_C.done ...
Processing 1f47B_B.done ...
Processing 1f8aB_B.done ...
Processing 1fd9A_A.done ...
Processing 1fdpA_A.done ...
Frustration data not found for 1fdpA_A
Processing 1ffhA_A.done ...
Processing 1fvrA_A.done ...
Processing 1fwkC_C.done ...
Processing 1g1oA_A.done ...
Proces