In [42]:
import sentencepiece as spm

# Build an empty SentencePiece processor and load the Mistral-7B v0.1
# tokenizer model into it. The path is relative, so it resolves against the
# kernel's current working directory.
tokenizer_model_path = '../model_files/mistral-7B-v0.1/tokenizer.model'
sp = spm.SentencePieceProcessor()
sp.load(tokenizer_model_path)

True

In [43]:
# Encode one SMILES string and show the resulting pieces. out_type=str asks
# the tokenizer for the piece strings instead of integer ids.
single_smiles = "CCO"
tokenized_smiles = sp.encode(single_smiles, out_type=str)
print(tokenized_smiles)

['▁C', 'CO']


In [48]:
# Simple SMILES strings used as the first batch of tokenizer inputs.
smiles_examples = [
    "CCO",          # ethanol
    "C1=CC=CC=C1",  # benzene
    "CC(=O)O",      # acetic acid
    "C(C(=O)O)N",   # glycine
    "C#N",          # hydrogen cyanide
]

In [49]:
# Round-trip each simple SMILES string through the tokenizer: encode it to
# string pieces, decode those pieces back to text, then print all three forms
# so the input and the decoded result can be compared by eye.
for smiles_example in smiles_examples:
    tokenized_smiles = sp.encode(smiles_example, out_type=str)
    decoded_smiles = sp.decode(tokenized_smiles)

    print("\nOriginal SMILES:", smiles_example)
    print("\nTokenized SMILES:", tokenized_smiles)
    print("\nDecoded SMILES:", decoded_smiles)


Original SMILES: CCO

Tokenized SMILES: ['▁C', 'CO']

Decoded SMILES: CCO

Original SMILES: C1=CC=CC=C1

Tokenized SMILES: ['▁C', '1', '=', 'CC', '=', 'CC', '=', 'C', '1']

Decoded SMILES: C1=CC=CC=C1

Original SMILES: CC(=O)O

Tokenized SMILES: ['▁CC', '(', '=', 'O', ')', 'O']

Decoded SMILES: CC(=O)O

Original SMILES: C(C(=O)O)N

Tokenized SMILES: ['▁C', '(', 'C', '(', '=', 'O', ')', 'O', ')', 'N']

Decoded SMILES: C(C(=O)O)N

Original SMILES: C#N

Tokenized SMILES: ['▁C', '#', 'N']

Decoded SMILES: C#N


In [52]:
# SMILES strings that use bracket atoms, isotope labels, charges, a
# dot-separated pair of ions, '/' and '\' bond markers, and '@' stereo markers.
complex_examples = [
    "[12C]",                   # carbon-12 (was mislabelled as carbon-13)
    "[13C]",                   # carbon-13
    "[OH3+]",                  # hydronium
    "[Fe+3]",                  # ferric ion
    "[Na+].[Cl-]",             # sodium chloride
    "F/C=C/F",                 # trans-difluoroethene
    # The backslash is escaped explicitly: a bare "\F" is an invalid escape
    # sequence that Python warns about. The runtime string is unchanged.
    "F/C=C\\F",                # cis-difluoroethene
    "C1C[C@H]2CCCC[C@H]2CC1",  # cis-decalin
]

In [53]:
# Round-trip each of the more complex SMILES strings through the tokenizer:
# encode it to string pieces, decode those pieces back to text, then print all
# three forms so the input and the decoded result can be compared by eye.
for smiles_example in complex_examples:
    tokenized_smiles = sp.encode(smiles_example, out_type=str)
    decoded_smiles = sp.decode(tokenized_smiles)

    print("\nOriginal SMILES:", smiles_example)
    print("\nTokenized SMILES:", tokenized_smiles)
    print("\nDecoded SMILES:", decoded_smiles)


Original SMILES: [12C]

Tokenized SMILES: ['▁[', '1', '2', 'C', ']']

Decoded SMILES: [12C]

Original SMILES: [13C]

Tokenized SMILES: ['▁[', '1', '3', 'C', ']']

Decoded SMILES: [13C]

Original SMILES: [OH3+]

Tokenized SMILES: ['▁[', 'O', 'H', '3', '+', ']']

Decoded SMILES: [OH3+]

Original SMILES: [Fe+3]

Tokenized SMILES: ['▁[', 'Fe', '+', '3', ']']

Decoded SMILES: [Fe+3]

Original SMILES: [Na+].[Cl-]

Tokenized SMILES: ['▁[', 'Na', '+', '].', '[', 'Cl', '-', ']']

Decoded SMILES: [Na+].[Cl-]

Original SMILES: F/C=C/F

Tokenized SMILES: ['▁F', '/', 'C', '=', 'C', '/', 'F']

Decoded SMILES: F/C=C/F

Original SMILES: F/C=C\F

Tokenized SMILES: ['▁F', '/', 'C', '=', 'C', '\\', 'F']

Decoded SMILES: F/C=C\F

Original SMILES: C1C[C@H]2CCCC[C@H]2CC1

Tokenized SMILES: ['▁C', '1', 'C', '[', 'C', '@', 'H', ']', '2', 'CC', 'CC', '[', 'C', '@', 'H', ']', '2', 'CC', '1']

Decoded SMILES: C1C[C@H]2CCCC[C@H]2CC1
