# Импорт библиотек

In [3]:
import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import matplotlib.pyplot as pl
import py3Dmol
import torch
from esm.sdk import client
from esm.sdk.api import ESM3InferenceClient, ESMProtein, GenerationConfig
from esm.models.esm3 import ESM3
from esm.utils.structure.protein_chain import ProteinChain

# Выгрузка белков из базы данных PDB и подготовка

In [7]:
model: ESM3InferenceClient = ESM3.from_pretrained('esm3-open').to('cuda')

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

In [8]:
# GFP
gfp_protein = ESMProtein.from_protein_chain(
    ProteinChain.from_rcsb('1qy3', chain_id='A')
)

# mScarlet
mscarlet_protein = ESMProtein.from_protein_chain(
    ProteinChain.from_rcsb('5lk4', chain_id='A')
)

# sfGFP
sfgfp_protein = ESMProtein.from_protein_chain(
    ProteinChain.from_rcsb('2b3p', chain_id='A')
)

In [9]:
# Токенизация белков
with torch.no_grad():
    gfp_protein_tokens = model.encode(gfp_protein)
    mscarlet_protein_tokens = model.encode(mscarlet_protein)
    sfgfp_protein_tokens = model.encode(sfgfp_protein)

  with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):  # type: ignore
  with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):  # type: ignore


In [10]:
alpha_phelix = [6, 134, 10, 139, 12, 140, 14, 141, 16, 142, 143, 181, 148, 182, 25, 183, 27, 33, 34, 35, 163, 165, 38, 39, 40, 167, 42, 44, 51, 179, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 199, 203, 205, 80, 82, 83, 86, 216, 90, 218, 92, 93, 94, 220, 222, 223, 98, 104, 106, 110, 117, 119, 121, 123]

In [11]:
prompt_seq = ['_'] * len(gfp_protein_tokens.sequence)

for i in alpha_phelix:
    prompt_seq[i - 1] = gfp_protein.sequence[i - 1]

prompt_seq = ''.join(prompt_seq)

print(gfp_protein.sequence)
print(prompt_seq)

prompt = model.encode(ESMProtein(sequence=prompt_seq))

KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQEATISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGI
_____F___V_I_V_L________F_V_____GDA__GKL_L_F______L_VPWPTLVTTLTYGVQCFSRY_______D_FK__M___Y_QEA___F_____Y_T___V______L_N_I_L__________I____LEYNY____V______________F_I_H___________H_QQN_______________L___S_L__________M_L_E_VT______


In [16]:
prompt.structure = torch.full_like(prompt.sequence, 4096)

# Добавляем начальный и конечный токены
prompt.structure[0] = 4098
prompt.structure[-1] = 4097

# Заполняем токены для промпта
for i in alpha_phelix:
    prompt.structure[i] = gfp_protein_tokens.structure[i - 1]

# Выводим промпт для последовательности и для структуры
print('_' + prompt_seq + '_')
print(''.join(['A' if st < 4096 else '_' for st in prompt.structure]))

______F___V_I_V_L________F_V_____GDA__GKL_L_F______L_VPWPTLVTTLTYGVQCFSRY_______D_FK__M___Y_QEA___F_____Y_T___V______L_N_I_L__________I____LEYNY____V______________F_I_H___________H_QQN_______________L___S_L__________M_L_E_VT_______
______A___A_A_A_A________A_A_____AAA__AAA_A_A______A_AAAAAAAAAAAAAAAAAAAA_______A_AA__A___A_AAA___A_____A_A___A______A_A_A_A__________A____AAAAA____A______________A_A_A___________A_AAA_______________A___A_A__________A_A_A_AA_______


In [13]:
num_tokens_to_decode = min((prompt.structure == 4096).sum().item(), 20)


structure_generation = model.generate(
    prompt,
    GenerationConfig(
        # Generate a structure.
        track="structure",
        # Sample one token per forward pass of the model.
        num_steps=num_tokens_to_decode,
        # Sampling temperature trades perplexity with diversity.
        temperature=1.0,
    ),
)
structure_generation.

print("These are the structure tokens corresponding to our new design:")
print(
    "    ", ", ".join([str(token) for token in structure_generation.structure.tolist()])
)

# Decodes structure tokens to backbone coordinates.
structure_generation_protein = model.decode(structure_generation)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.19it/s]


These are the structure tokens corresponding to our new design:
     4098, 1333, 3715, 3235, 1185, 1034, 3019, 3759, 1362, 85, 60, 1209, 3819, 154, 2708, 3683, 2219, 1966, 1325, 2620, 1413, 1209, 1907, 2016, 2271, 178, 2676, 2166, 3967, 3721, 952, 3609, 2117, 633, 2580, 2274, 1181, 3460, 2490, 1128, 3686, 1151, 4070, 592, 2239, 2885, 1092, 194, 1327, 3275, 2359, 2583, 3781, 584, 1595, 2293, 2370, 1774, 732, 1797, 748, 3403, 2370, 2582, 3704, 2737, 3007, 1660, 499, 484, 2202, 2786, 3034, 1910, 587, 2741, 1265, 1005, 3855, 160, 2387, 46, 1203, 3279, 3290, 511, 2136, 1779, 2158, 1418, 1344, 1050, 3152, 836, 1066, 35, 1325, 3550, 2875, 3264, 3737, 793, 2827, 3152, 1216, 2791, 4030, 4032, 276, 973, 3376, 1032, 1237, 426, 1140, 1607, 3437, 647, 2879, 3824, 1790, 2415, 622, 1813, 1809, 3968, 276, 3754, 2012, 4029, 2725, 322, 903, 3803, 2196, 4004, 3804, 205, 2940, 4038, 47, 2105, 3430, 464, 948, 2540, 3640, 952, 3445, 3744, 3686, 2721, 2149, 1175, 1206, 3030, 1576, 128, 1186, 1683, 2952, 558,

In [15]:
structure_generation_protein

ESMProtein(sequence='_____F___V_I_V_L________F_V_____GDA__GKL_L_F______L_VPWPTLVTTLTYGVQCFSRY_______D_FK__M___Y_QEA___F_____Y_T___V______L_N_I_L__________I____LEYNY____V______________F_I_H___________H_QQN_______________L___S_L__________M_L_E_VT______', secondary_structure=None, sasa=None, function_annotations=None, coordinates=tensor([[[-14.3530,  -3.8865,  23.2069],
         [-15.3319,  -2.9923,  22.5974],
         [-16.0419,  -3.6690,  21.4296],
         ...,
         [     inf,      inf,      inf],
         [     inf,      inf,      inf],
         [     inf,      inf,      inf]],

        [[-16.7473,  -3.4661,  21.3963],
         [-17.6040,  -4.0105,  20.3480],
         [-17.5258,  -3.1656,  19.0807],
         ...,
         [     inf,      inf,      inf],
         [     inf,      inf,      inf],
         [     inf,      inf,      inf]],

        [[-17.1869,  -3.2747,  18.1882],
         [-17.2287,  -2.6271,  16.8813],
         [-18.6666,  -2.4094,  16.4218],
         ...,
         [

In [14]:
view = py3Dmol.view(width=1000, height=500)
view.addModel(
    structure_generation_protein.to_protein_chain().infer_oxygen().to_pdb_string(),
    'pdb',
)
view.setStyle({"cartoon": {"color": "lightgreen"}})
view.zoomTo()
view.show()

In [23]:
structure_generation_protein.to_pdb('kutoi_belok.pdb')

In [25]:
template_chain = gfp_protein.to_protein_chain()
generation_chain = structure_generation_protein.to_protein_chain()

constrained_site_rmsd = template_chain[alpha_phelix].rmsd(
    generation_chain[alpha_phelix]
)
backbone_rmsd = template_chain.rmsd(generation_chain)

c_pass = "✅" if constrained_site_rmsd < 1.5 else "❌"
b_pass = "✅" if backbone_rmsd > 1.5 else "❌"

print(f"Constrained site RMSD: {constrained_site_rmsd:.2f} Ang {c_pass}")
print(f"Backbone RMSD: {backbone_rmsd:.2f} Ang {b_pass}")

AssertionError: 

In [21]:
num_tokens_to_decode = min((prompt.sequence == 32).sum().item(), 20)

sequence_generation = model.generate(
    # Generate a sequence.
    structure_generation_protein,
    GenerationConfig(track="sequence", num_steps=num_tokens_to_decode, temperature=1.0),
)

# Decode to AA string and coordinates.
sequence_generation_protein = model.decode(sequence_generation)

  with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):  # type: ignore
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:02<00:00,  7.42it/s]


AttributeError: 'str' object has no attribute 'flatten'

In [19]:
sequence_generation_protein.to_pdb('krutoi_belok.pdb')

