In [3]:
from biotite.structure import distance
import numpy as np  
import os
import sys
from biotite.structure import superimpose

# Extend the module search path so the project-local helper modules imported
# below can be found (presumably they live in ../../utils/ - relative to the
# notebook's working directory; verify if the notebook is moved).
sys.path.append('../../utils/')
import biotite_utils
import dataset_utils

In [13]:

def compute_locations(dataset):
    """Compute and store superimposed apo/holo coordinates for a dataset.

    For every apo/holo pair of the dataset's train set, the holo backbone is
    superimposed onto the apo backbone (restricted to the positions matched by
    a sequence alignment). The apo coordinates and the fitted holo coordinates
    are then written into an array of shape ``(N, 2, 3)``, where ``N`` is the
    length of the unfiltered apo backbone; axis 1 holds apo (0) and holo (1),
    axis 2 holds x, y, z. Rows without an aligned counterpart keep the fill
    value ``-1``. The array is saved to ``<output_path>/<apo>.npy``.

    Pairs whose output file already exists are skipped, so the function can be
    re-run to resume an interrupted computation.

    Args:
        dataset: Name of the dataset directory, e.g. ``'cryptobench-dataset'``.

    Raises:
        AssertionError: If the number of rows of the computed array differs
            from the length of the fluctuation array stored for the same apo.
    """
    dataset_path = f'../../../datasets/{dataset}'
    output_path = f'../../../data/features/residue-locations/{dataset}'
    fluctuation_path = f'../../../data/features/fluctuation/{dataset}/fluctuation'

    # Pairs that are known to fail inside biotite; they are skipped on purpose.
    skipped_pairs = {
        ("8j1kA", "3ouiA"),
        ("8pfpA", "7gquA"),
    }

    train_set = dataset_utils.load_train_set(dataset_path)
    apo_holo_pairs = dataset_utils.load_main_apo_holo_pairs(train_set, multichain=False)

    # np.save does not create missing directories, so make sure the target exists.
    os.makedirs(output_path, exist_ok=True)

    for apo, holo in apo_holo_pairs.items():
        print(f'Processing {apo}:{holo}')

        if (apo, holo) in skipped_pairs:
            continue

        # Already computed in a previous run.
        if os.path.exists(f'{output_path}/{apo}.npy'):
            continue

        # Identifiers are the 4-character structure id followed by the chain id.
        apo_chain = apo[4:]
        holo_chain = holo[4:]

        apo_structure = biotite_utils.load_structure(apo)
        apo_sequence = biotite_utils.get_sequence(apo_structure, apo_chain)

        holo_structure = biotite_utils.load_structure(holo)
        holo_sequence = biotite_utils.get_sequence(holo_structure, holo_chain)

        alignment = biotite_utils.align_sequences(apo_sequence, holo_sequence)

        # We need to filter out the non-matching residues, otherwise the
        # superimpose function will fail.
        apo_indices = [pair[0] for pair in alignment[0].trace]
        holo_indices = [pair[1] for pair in alignment[0].trace]

        # Get the structures with respect to the aligned indices.
        apo_backbone = biotite_utils.get_protein_backbone(apo_structure, apo_chain, indices=apo_indices)
        holo_backbone = biotite_utils.get_protein_backbone(holo_structure, holo_chain, indices=holo_indices)

        # Fit the holo backbone onto the (fixed) apo backbone.
        holo_backbone, _ = superimpose(apo_backbone, holo_backbone)

        # Rerun the alignment - the indices might have shifted because of the
        # filtering done for the sake of superimposing.
        apo_sequence = biotite_utils.get_sequence(apo_backbone, apo_chain, from_backbone=True)
        holo_sequence = biotite_utils.get_sequence(holo_backbone, holo_chain, from_backbone=True)

        alignment = biotite_utils.align_sequences(apo_sequence, holo_sequence)

        # Map each index of the filtered apo backbone to the index of the same
        # residue id in the unfiltered backbone, which is expected to match the
        # layout of the fluctuation data. A dictionary of first occurrences
        # replaces the repeated list.index() scans (same result, linear time).
        original_apo_backbone = biotite_utils.get_protein_backbone(apo_structure, apo_chain)
        first_index_by_res_id = {}
        for original_index, residue in enumerate(original_apo_backbone):
            first_index_by_res_id.setdefault(residue.res_id, original_index)
        filtered_to_original = {
            filtered_index: first_index_by_res_id[residue.res_id]
            for filtered_index, residue in enumerate(apo_backbone)
        }

        # 2 for apo and holo, 3 for x, y, z
        # NOTE(review): float16 stores coordinates with reduced precision and
        # -1 is also a valid coordinate value; both are kept as-is so that the
        # files stay compatible with previously generated outputs.
        positions = np.full((len(original_apo_backbone), 2, 3), -1, dtype=np.float16)

        for apo_index, holo_index in alignment[0].trace:
            # biotite marks gaps in the trace with -1; such positions have no
            # counterpart and must not be resolved through negative indexing.
            if apo_index < 0 or holo_index < 0:
                continue
            positions[filtered_to_original[apo_index]] = np.array(
                [apo_backbone[apo_index].coord, holo_backbone[holo_index].coord]
            )

        # The result has to line up row by row with the fluctuation feature.
        fluctuation_length = len(np.load(f'{fluctuation_path}/{apo}.npy'))
        assert fluctuation_length == positions.shape[0], f'{fluctuation_length} != {positions.shape[0]}'
        np.save(f'{output_path}/{apo}.npy', positions)


In [14]:
# Compute and save the residue locations for the 'cryptobench-dataset' dataset.
compute_locations('cryptobench-dataset')

Processing 1jpmA:1tkkB
Processing 2yx7A:2e7xA
Processing 2e1cA:2cyyA
Processing 1qhtA:5omqA
Processing 6cjfA:4ighA
Processing 2w6rA:7ac8E
Processing 5x6zD:5x6yC
Processing 1nkoA:2hrlA
Processing 5caeB:2fp4B
Processing 1macB:1u0aA
Processing 4dmzB:4dn0A
Processing 4gvrA:4gvqA
Processing 6kx4A:7d1cA
Processing 4ok2B:4ojzB
Processing 4ljpA:6sc5A
Processing 5v49A:4mwdB
Processing 7l8qA:7u4kA
Processing 4m23B:4m25A
Processing 7se6A:7secA
Processing 4brrE:4uxzA
Processing 1vjuB:3dwrA
Processing 1nwhB:1pquA
Processing 5uzvZ:5v0aZ
Processing 4jycC:4jybA
Processing 1mufA:4jdsA
Processing 6lgyA:6lgzA
Processing 3p08B:4ot5A
Processing 1l0wB:1g51A
Processing 6cy1A:6cy5A
Processing 6ksuA:6ksvB
Processing 1e5lB:1e5qB
Processing 8c3uA:5r8cA
Processing 7btcB:6bsxC
Processing 7oueA:7oliB
Processing 3vskB:3vslA
Processing 4z4lA:7d5gA
Processing 7s5gA:5ocaA
Processing 3l15A:6s6jB
Processing 8jisR:6x19R
Processing 7ciuA:7ciwA
Processing 2zcoA:4ea0A
Processing 1m1zA:1mc1B
Processing 3vjbD:3v66A
Processing 

In [15]:
# Compute and save the residue locations for the 'rigid-dataset' dataset.
compute_locations('rigid-dataset')

Processing 7o0aB:7ntgA
Processing 5klcA:5klfA
Processing 1ci8B:1ci9B
Processing 4dwqA:1uc2A
Processing 2y8nA:2yajA
Processing 6n2hA:8d2yA
Processing 6zu2CCC:6zrwD
Processing 4q71B:6bsnA
Processing 6zu3A:6ztuA
Processing 4txhD:4txlA
Processing 1hn0A:7eirA
Processing 6pkhA:6pkiA
Processing 1j0hB:1j0iA
Processing 5z0uA:1uh4A
Processing 1mwoA:1mxdA
Processing 3wdqA:3wdrA
Processing 7atrA:8fssA
Processing 4dm1A:3qhoC
Processing 1tvnA:1tvpB
Processing 3b9eA:3as0A
Processing 5e3pA:5e3rA
Processing 8vr6A:8vr7B
Processing 6etzA:6seaA
Processing 2de6B:7bugA
Processing 1slbB:1slaA
Processing 6q4xA:6q4zB
Processing 7zajAAA:7za6AAA
Processing 5kdnA:5kduA
Processing 3donA:3dooA
Processing 3jyoA:3jypA
Processing 6k0mA:6k0nA
Processing 4ckqA:5a6mA
Processing 6ir4A:6k36A
Processing 1iiwA:1iitA
Processing 4ppvA:4ppuA
Processing 7xgvA:7xgxA
Processing 3ex9A:3cnmB
Processing 4crqA:4cteA
Processing 7ofeA:7c5eA
Processing 3gbtA:3ll3A
Processing 2o70D:2o73C
Processing 3gd0A:3gd9A
Processing 7vtkA:7vtmA
Proce