In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
from anima.smiles import SMILES

#### Initiating SMILES class

In [3]:
sml = SMILES()

#### Defining working dirs

In [4]:
main_dir = os.getcwd()
xyz_dir = main_dir + "/xyz/"
structures = os.listdir(xyz_dir)

## Converting .xyz to SMILES
---

### Method 1 - Using the xyz2smiles function
This method is based on https://github.com/jensengroup/xyz2mol from the paper DOI: 10.1002/bkcs.10334. It is slower than the second method (based on OpenBabel), but usually yield more trustable SMILES.

#### Reading the xyz files

In [5]:
sml.xyz_to_smiles('xyz/1.xyz')

'O=CC1C2C3=NNS(=O)(=O)C(Br)=C3C1C1CC12'

In [6]:
for i in structures[:10]:
    n = int(i[:-4])
    print(sml.xyz_to_smiles(xyz_dir + i))

N#CC1=CC2C3Cc4sc(=N)c(Br)c1c4N23
N=c1[nH]cc2c(c1Br)C1CC2C2CNC(=O)N12
N#CC12CNc3nc(=O)sc(Br)c3C(C1)C2
Brc1c2c(nc3cnsc13)OCC1CC2C1
N=c1nc2oc(Br)c3c4c(cc1n4-2)C=NC3=O
[O]C1=C(Br)C2=C(N=[S]1)C1CC2N2CCCC12
O=c1nc2c(c(Br)o1)C1CC2C2NCCNC12
Brc1c2c(nc3sccc13)C1C3C=CC2N31
O=S1(=O)C=NN2C=CC3CN4CC4C3C2=C1Br
N=c1scc2c(c1Br)C1=C(CC=C2)CCC1


#### Using the standard_smiles function based on RDKIT to standardize the SMILES in Canonical format 

In [7]:
for i in structures[:10]:
    n = int(i[:-4])
    print(sml.standard_smiles(sml.xyz_to_smiles(xyz_dir + i)))

N#CC1=CC2C3Cc4sc(=N)c(Br)c1c4N23
N=c1[nH]cc2c(c1Br)C1CC2C2CNC(=O)N12
N#CC12CNc3nc(=O)sc(Br)c3C(C1)C2
Brc1c2c(nc3cnsc13)OCC1CC2C1
N=c1nc2oc(Br)c3c4c(cc1n4-2)C=NC3=O
[O]C1=C(Br)C2=C(N=[S]1)C1CC2N2CCCC12
O=c1nc2c(c(Br)o1)C1CC2C2NCCNC12
Brc1c2c(nc3sccc13)C1C3C=CC2N31
O=S1(=O)C=NN2C=CC3CN4CC4C3C2=C1Br
N=c1scc2c(c1Br)C1=C(CC=C2)CCC1


#### Calling the smiles_cleaner to get a more simplified version of the SMILES for NLP

In [8]:
for i in structures[:10]:
    n = int(i[:-4])
    print(sml.smiles_cleaner(sml.standard_smiles(sml.xyz_to_smiles(xyz_dir + i))))

N#CC1=CC2C3Cc4sc(=N)c(Br)c1c4N23
N=c1[nH]cc2c(c1Br)C1CC2C2CNC(=O)N12
N#CC12CNc3nc(=O)sc(Br)c3C(C1)C2
Brc1c2c(nc3cnsc13)OCC1CC2C1
N=c1nc2oc(Br)c3c4c(cc1n4-2)C=NC3=O
OC1=C(Br)C2=C(N=S1)C1CC2N2CCCC12
O=c1nc2c(c(Br)o1)C1CC2C2NCCNC12
Brc1c2c(nc3sccc13)C1C3C=CC2N31
O=S1(=O)C=NN2C=CC3CN4CC4C3C2=C1Br
N=c1scc2c(c1Br)C1=C(CC=C2)CCC1


#### Storing 30xSMILES in a list for later

In [9]:
smiles1 = []
for i in structures[:30]:
    n = int(i[:-4])
    smiles1.append(sml.smiles_cleaner(sml.standard_smiles(sml.xyz_to_smiles(xyz_dir + i,chiral=False,charged_fragments=True))))

In [10]:
smiles1

['N#CC1=CC2C3Cc4sc(=N)c(Br)c1c4N23',
 'N=c1[nH]cc2c(c1Br)C1CC2C2CNC(=O)N12',
 'N#CC12CNc3nc(=O)sc(Br)c3C(C1)C2',
 'Brc1c2c(nc3cnsc13)OCC1CC2C1',
 'N=c1nc2oc(Br)c3c4c(cc1n4-2)C=NC3=O',
 '[O-]c1[s+]nc2c(c1Br)C1CC2C2CCCN12',
 'O=c1nc2c(c(Br)o1)C1CC2C2NCCNC12',
 'Brc1c2c(nc3sccc13)C1C3C=CC2N31',
 'O=S1(=O)C=NN2C=CC3CN4CC4C3C2=C1Br',
 'N=c1scc2c(c1Br)C1=C(CC=C2)CCC1',
 'C#Cc1c(Br)c2cn[nH]c(Br)c2c2cccc12',
 'O=c1cc2[nH]c(Br)c3c(=O)[nH]nc4nc1c2=c43',
 'O=c1occ2c(c1Br)C1CC2C2CCCNC12',
 'N=c1scc2c(c1Br)C1CCC3NC31CN2',
 'O=C1CC2C3CC(c4cc(=O)[nH]c(Br)c43)C2O1',
 'O=S1(=O)C=NN2C(=C1Br)C1C3COC(C3)CC12',
 'N=c1nc2c(c(Br)s1)C1CC2CC12CCN2',
 'N=C1NC2CCC(C1=O)c1n[s+]c([O-])c(Br)c12',
 'N=C1SC(Br)=C2C3CC(C4CC4N3)N2S1(=O)=O',
 'O=C1OC2C3CC(c4c3c[nH]c(=O)c4Br)C2O1',
 'N=C1CC2Cc3cc(=O)[nH]c(Br)c3C2C(=N)N1',
 'O=S1([O-])=C2C(Br)=[S+]C(=C1)COCOC1CC2C1',
 'O=S1(=O)C=C2OC3CCC4C(=C1Br)N2C34',
 'N=c1nc2c(c(Br)[nH]1)C1CC2C2OCC1C2=O',
 'O=S1(=O)C=C2NC=NC3CC(C3)C2=C(Br)N1',
 'N#CC1NC(=N)CC2Cc3ccnc(Br)c3C21',
 'N#CC

### Method 2 - Using OpenBabel based xyz2smiles
This method is based on OpenBabel.

#### Reading the xyz files

In [18]:
sml.OB_xyz_to_smiles('xyz/1.xyz')

'BrC1=C2[C@H]3[C@@H]4C[C@@H]4[C@H]([C@@H]3C=O)[C]2[N]NS1(=O)=O'

In [19]:
for i in structures[:10]:
    n = int(i[:-4])
    print(sml.OB_xyz_to_smiles(xyz_dir + i))

BrC1=C2[C]3[C](C[C@@H]4[C@H]([CH][C]2[C][N])[N@]34)S[C]1[NH]
BrC1=C(N[CH]C2=C1[C@@H]1C[C@H]2[C@H]2CNC(=O)N12)[NH]
Brc1c2[C@H]3C[C@](C3)(CNc2nc(=O)s1)C#N
Brc1c2[C@@H]3C[C@@H](C3)COc2nc2c1snc2
BrC1=C2c3c(cc4n3C(=N[C]4[NH])O1)C=NC2=O
Brc1c2[C@H]3C[C@H]([C@@H]4CCC[N@]34)c2nsc1=O
Brc1c2[C@H]3C[C@H]([C@@H]4NCCN[C@@H]34)c2nc(=O)o1
Brc1c2[C@@H]3C=C[C@@H]4[C@@H]([N@]34)c2nc2c1ccs2
BrC1=C2[C@H]3[C@@H]4C[N@@]4C[C@@H]3C=CN2N=CS1(=O)=O
BrC1=C(S[CH]C2=C1[C]1[C](CCC1)CC=C2)[NH]


#### Using the standard_smiles function based on RDKIT to standardize the SMILES in Canonical format 

In [26]:
for i in structures[:10]:
    n = int(i[:-4])
    print(sml.standard_smiles(sml.OB_xyz_to_smiles(xyz_dir + i)))

[N][C][C]1[CH]C2C3C[C]4S[C]([NH])C(Br)=C1[C]4N23
[NH]C1=C(Br)C2=C([CH]N1)C1CC2N2C(=O)NCC12
N#CC12CNc3nc(=O)sc(Br)c3C(C1)C2
Brc1c2c(nc3cnsc13)OCC1CC2C1
[NH][C]1N=C2OC(Br)=C3C(=O)N=Cc4cc1n2c43
O=c1snc2c(c1Br)C1CC2C2CCCN12
O=c1nc2c(c(Br)o1)C1CC2C2NCCNC12
Brc1c2c(nc3sccc13)C1C3C=CC2N31
O=S1(=O)C=NN2C=CC3CN4CC4C3C2=C1Br
[NH]C1=C(Br)C2=C([CH]S1)C=CC[C]1CCC[C]12


#### Calling the smiles_cleaner to get a more simplified version of the SMILES for NLP

In [27]:
for i in structures[:10]:
    n = int(i[:-4])
    print(sml.smiles_cleaner(sml.standard_smiles(sml.OB_xyz_to_smiles(xyz_dir + i))))

NCC1CC2C3CC4SC(N)C(Br)=C1C4N23
NC1=C(Br)C2=C(CN1)C1CC2N2C(=O)NCC12
N#CC12CNc3nc(=O)sc(Br)c3C(C1)C2
Brc1c2c(nc3cnsc13)OCC1CC2C1
NC1N=C2OC(Br)=C3C(=O)N=Cc4cc1n2c43
O=c1snc2c(c1Br)C1CC2C2CCCN12
O=c1nc2c(c(Br)o1)C1CC2C2NCCNC12
Brc1c2c(nc3sccc13)C1C3C=CC2N31
O=S1(=O)C=NN2C=CC3CN4CC4C3C2=C1Br
NC1=C(Br)C2=C(CS1)C=CCC1CCCC12


#### Storing 30xSMILES in a list for later

In [32]:
smiles2 = []
for i in structures[:30]:
    n = int(i[:-4])
    smiles2.append(sml.smiles_cleaner(sml.OB_standard_smiles(sml.OB_xyz_to_smiles(xyz_dir + i))))

In [35]:
smiles2

['NCC1CC2C3N2C2C1=C(Br)C(SC2C3)N',
 'O=C1NCC2N1C1CC2C2=C1C(=C(NC2)N)Br',
 'N#CC12CNc3c(C(C1)C2)c(Br)sc(=O)n3',
 'Brc1c2C3CC(C3)COc2nc2c1snc2',
 'BrC1=C2C(=O)N=Cc3c2n2C(=NC(c2c3)N)O1',
 'O=c1snc2c(c1Br)C1CC2C2N1CCC2',
 'O=c1nc2C3CC(c2c(o1)Br)C1C3NCCN1',
 'Brc1c2C3C=CC4N3C4c2nc2c1ccs2',
 'BrC1=C2N(C=CC3C2C2CN2C3)N=CS1(=O)=O',
 'NC1=C(Br)C2=C(CS1)C=CCC1C2CCC1',
 'C#Cc1c(Br)c2cn[nH]c(c2c2c1ccc2)Br',
 'BrC1NC2=CC(=O)C3C2C2C1C(=O)NNC2=N3',
 'O=c1occ2c(c1Br)C1CC2C2C1NCCC2',
 'NC1=C(Br)C2=C(CS1)NCC13C2CCC3N1',
 'O=C1CC2C(O1)C1CC2c2c1cc(=O)[nH]c2Br',
 'BrC1=C2N(NCS1(=O)=O)C1C2C2COC(C1)C2',
 'NC1=NC2C(=C(S1)Br)C1CC2CC21CCN2',
 'O=C1C(N)NC2CCC1c1nsc(=O)c(c21)Br',
 'BrC1=C2C3NC4C(C(N2S(=O)(=O)C(S1)N)C3)C4',
 'O=C1OC2C(O1)C1CC2c2c1c(Br)c(=O)[nH]c2',
 'N=C1CC2Cc3c(C2C(=N)N1)c(Br)[nH]c(=O)c3',
 'BrC1SC2COCOC3CC(C1S(=O)(=O)C2)C3',
 'BrC1=C2C3CCC4C3N2C(CS1(=O)=O)O4',
 'O=C1C2OCC1C1CC2C2=C1C(Br)NC(=N2)N',
 'BrC1NS(=O)(=O)C=C2C1C1CC(C1)N=CN2',
 'NCC1NC(N)CC2C1c1c(C2)ccnc1Br',
 'NCC1OC2C3C1C1=C(Br)C(N)SN=

As one can see, this list of SMILES shows a few differences compared to the previous method. Therefore, we recommend the Method1 in general.

## NLP
---
After preparing the list of SMILES we can use the NLP tools to process them.

### Spliting the SMILES

In [23]:
smiles1[0]

'N#CC1=CC2C3Cc4sc(=N)c(Br)c1c4N23'

In [22]:
print(sml.smilesSEP(smiles1[0]))

['N', '#', 'C', 'C', '1', '=', 'C', 'C', '2', 'C', '3', 'C', 'c', '4', 's', 'c', '(', '=', 'N', ')', 'c', '(', 'Br', ')', 'c', '1', 'c', '4', 'N', '2', '3']


### Creating the Vocab

In [11]:
vocab = sml.smilesVOC(smiles1)
vocab_size = len(vocab)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.1s finished


This is how the vocab looks like:

In [13]:
print(vocab)

['#', '(', ')', '-', '1', '2', '3', '4', '=', 'Br', 'C', 'N', 'O', 'S', '[C+]', '[C-]', '[NH-]', '[O+]', '[O-]', '[S+]', '[nH]', '[s+]', 'c', 'n', 'o', 's']


We can use the vocab to create a index-based sequence following the vocab.

The original SMILES:

In [24]:
print(smiles1[0])

N#CC1=CC2C3Cc4sc(=N)c(Br)c1c4N23


THe indexed SMILES:

In [19]:
print(sml.smilesToSequence(smiles1[0],vocab))

[[11], [0], [10], [10], [4], [8], [10], [10], [5], [10], [6], [10], [22], [7], [25], [22], [1], [8], [11], [2], [22], [1], [9], [2], [22], [4], [22], [7], [11], [5], [6]]
