In [1]:
import selfies as sf

In [4]:
# Encoding SMILES of Aspirin as SELFIES.
aspirin = 'O=C(C)Oc1ccccc1C(=O)O'
asp_selfies = sf.encoder(aspirin)
asp_selfies

'[O][=C][Branch1][C][C][O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Branch1][C][=O][O]'

In [5]:
# Encoding SELFIES of Aspirin as SMILES.
asp_smiles = sf.decoder(asp_selfies)
asp_smiles

'O=C(C)OC1=CC=CC=C1C(=O)O'

In [30]:
# One-hot encoding SELFIES.
dataset = ['CC(=O)NC1=CC=C(C=C1)O', 'O=C(C)Oc1ccccc1C(=O)O', 'C1CN(CCN1CCOCC(=O)O)[C@H](C2=CC=CC=C2)C3=CC=C(C=C3)Cl']
selfies_dataset = list(map(sf.encoder, dataset))
selfies_dataset

['[C][C][=Branch1][C][=O][N][C][=C][C][=C][Branch1][Branch1][C][=C][Ring1][=Branch1][O]',
 '[O][=C][Branch1][C][C][O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Branch1][C][=O][O]',
 '[C][C][N][Branch1][#C][C][C][N][Ring1][=Branch1][C][C][O][C][C][=Branch1][C][=O][O][C@H1][Branch1][=Branch2][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=C][C][=C][Branch1][Branch1][C][=C][Ring1][=Branch1][Cl]']

In [31]:
# Getting maximum length of SELFIES for One-hot encoding.
max_len = max(sf.len_selfies(s) for s in selfies_dataset)
max_len

41

In [32]:
alphabet = sf.get_alphabet_from_selfies(selfies_dataset)
alphabet.add("[nop]")
alphabet = list(sorted(alphabet))
alphabet

['[#C]',
 '[=Branch1]',
 '[=Branch2]',
 '[=C]',
 '[=O]',
 '[Branch1]',
 '[C@H1]',
 '[C]',
 '[Cl]',
 '[N]',
 '[O]',
 '[Ring1]',
 '[nop]']

In [33]:
# Creating a mapping between the SELFIES symbols and indices.
# vocbulary selfies to indices
vocab_stoi = {symbol: idx for idx, symbol in enumerate(alphabet)}
vocab_itos = {idx: symbol for symbol, idx in vocab_stoi.items()}

In [34]:
print(vocab_stoi)
print(vocab_itos)

{'[#C]': 0, '[=Branch1]': 1, '[=Branch2]': 2, '[=C]': 3, '[=O]': 4, '[Branch1]': 5, '[C@H1]': 6, '[C]': 7, '[Cl]': 8, '[N]': 9, '[O]': 10, '[Ring1]': 11, '[nop]': 12}
{0: '[#C]', 1: '[=Branch1]', 2: '[=Branch2]', 3: '[=C]', 4: '[=O]', 5: '[Branch1]', 6: '[C@H1]', 7: '[C]', 8: '[Cl]', 9: '[N]', 10: '[O]', 11: '[Ring1]', 12: '[nop]'}


In [23]:
acetamenophen_onehot = selfies_dataset[0]
label, one_hot = sf.selfies_to_encoding(acetamenophen, vocab_stoi, pad_to_len=max_len)
print(label)
print(one_hot)
print(type(one_hot))

[7, 7, 1, 7, 4, 9, 7, 3, 7, 3, 5, 5, 7, 3, 11, 1, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [22]:
acetamenophen_selfies = sf.encoding_to_selfies(one_hot, vocab_itos, enc_type='one_hot')
acetamenophen_selfies

'[C][C][=Branch1][C][=O][N][C][=C][C][=C][Branch1][Branch1][C][=C][Ring1][=Branch1][O][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop][nop]'

In [38]:
vocab_stoi = {'[#C]': 0, '[=Branch1]': 1, '[=Branch2]': 2, '[=C]': 3, '[=O]': 4, '[Branch1]': 5, '[C@H1]': 6, '[C]': 7, '[Cl]': 8, '[N]': 9, '[O]': 10, '[Ring1]': 11, '[nop]': 12}
dataset_onehot = sf.batch_selfies_to_flat_hot(dataset, vocab_stoi)

KeyError: '[C@H]'