In [6]:
import pandas as pd
from transformers import AutoModel, AutoConfig
from rdkit import Chem
from rdkit.Chem import AllChem

from src.latent_transport.energy.permeability.tokenizer import SMILES_SPE_Tokenizer
from src.latent_transport.energy.permeability.embed import embed_smiles

In [7]:
model = AutoModel.from_pretrained("aaronfeller/PeptideCLM-23M-all")

In [8]:
config = AutoConfig.from_pretrained("aaronfeller/PeptideCLM-23M-all")

In [9]:
config

RoFormerConfig {
  "_name_or_path": "aaronfeller/PeptideCLM-23M-all",
  "architectures": [
    "RoFormerForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 768,
  "model_type": "roformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "rotary_value": false,
  "torch_dtype": "float32",
  "transformers_version": "4.44.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 586
}

In [3]:
vocab_file = "/home/a03-sgoel/FLaT/src/latent_transport/energy/peptide_utils/new_vocab.txt"
splits_file = "/home/a03-sgoel/FLaT/src/latent_transport/energy/peptide_utils/new_splits.txt"

tokenizer = SMILES_SPE_Tokenizer(vocab_file, splits_file)



In [3]:
seqs = pd.read_csv("/home/a03-sgoel/FLaT/data/permeability/test.csv")['Sequence'].tolist()

In [4]:
seqs

['CCCCCCCC(=O)OC[C@H](NC(=O)[C@H](CO)NC(=O)CN)C(N)=O',
 'CC[C@H](C)[C@H](NC(=O)[C@@H]1Cc2ccccc2CN1)C(=O)N1Cc2ccccc2C[C@@H]1C(=O)N[C@@H](Cc1ccc(OP(=O)(O)O)cc1)C(=O)N[C@@H](CC(N)=O)C(=O)N[C@@H](CC(C)C)C(=O)NCC(=O)N[C@@H](CCC(=O)O)C(=O)O',
 'CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](C)N(C)C(=O)[C@H](Cc2ccccc2)N(C)C(=O)[C@@H](C)N(C)C(=O)C[C@@H](C(=O)N[C@H](C(=O)N2CCCCC2)[C@@H](C)CC)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]([C@@H](C)O)NC(=O)[C@H](CO)NC1=O',
 'CC(C)C[C@H](NC(=O)[C@H](Cc1ccccc1)NC(=O)CNC(=O)CNC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@H](C)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCCN)C(N)=O',
 'CC[C@H](C)[C@H](NC(=O)CNC(=O)[C@H](CCCCN)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](C)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(N)=O)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CC(C)C)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCSC)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@@H](NC(=

In [5]:
tokens = tokenizer(seqs[:3], return_tensors='pt', padding=True, truncation=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [6]:
tokens.input_ids

tensor([[  2,  29,  29,  30, 207,  67, 487, 197, 207, 487, 193, 196,  58, 207,
          28, 198,  28, 197,   8, 174,   3,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  2,  29, 487, 193,   8, 487, 197, 207, 485,  13,  28, 137,   1, 137,
          36,  13,   8, 207,  58,  13,  28, 137,   1, 137,  28, 485,  13, 207,
          58, 485, 193, 136,  93, 195,  74, 271, 195,   8, 196,  91, 165, 207,
          58, 485, 193,  28, 197,   8, 174,   8, 207,  58, 485, 193,  2

In [7]:
embed = embed_smiles(tokens, model)