In [73]:
import numpy as np
import pandas as pd
from rdkit import Chem

In [48]:
def get_data(filename):
    """Return the lines of a text file, minus the first one.

    Parameters
    ----------
    filename : str or path-like
        File opened in text mode for reading.

    Returns
    -------
    list of str
        Every line after the first, with trailing whitespace (including the
        newline) stripped. Empty when the file has zero or one line.
    """
    with open(filename) as handle:
        # Consume the first (header) line; the default keeps an empty file from raising.
        next(handle, None)
        return [row.rstrip() for row in handle]

In [54]:
def parse_line(line):
    """Split one whitespace-delimited record into typed fields.

    Parameters
    ----------
    line : str
        Record with at least five whitespace-separated fields; any fields
        beyond the fifth are ignored.

    Returns
    -------
    list
        ``[index, SMILES_GDB, SMILES_B3LYP, SMILES_CORINA, distance]`` where
        the first field is converted with ``int`` and the fifth with ``float``;
        the three middle fields stay as strings.
    """
    fields = line.split()
    # Elements are evaluated left to right, so conversion errors surface in field order.
    return [int(fields[0]), fields[1], fields[2], fields[3], float(fields[4])]

In [63]:
def construct_data_lists(filename):
    """Parse every data line of ``filename`` into a list of typed fields.

    Lines come from ``get_data`` and each one is converted by ``parse_line``.

    Returns
    -------
    list of list
        One parsed record per line, in file order.
    """
    return [parse_line(record) for record in get_data(filename)]

In [64]:
def construct_dataframe(filename):
    """Build a DataFrame from the records parsed out of ``filename``.

    Returns
    -------
    pandas.DataFrame
        Columns are ``Index``, ``SMILES_GDB``, ``SMILES_B3LYP``,
        ``SMILES_CORINA`` and ``distance``, one row per parsed record.
    """
    records = construct_data_lists(filename)
    return pd.DataFrame(
        records,
        columns=["Index", "SMILES_GDB", "SMILES_B3LYP", "SMILES_CORINA", "distance"],
    )

In [67]:
# Build the DataFrame from ../data/failed.txt and display it as the cell output.
filename = "../data/failed.txt"
df = construct_dataframe(filename)
df

Unnamed: 0,Index,SMILES_GDB,SMILES_B3LYP,SMILES_CORINA,distance
0,58,NC(=N)C#N,NC(=N)C#N,[NH]C(=[NH2])C#N,6.036217
1,61,NC(=N)C=O,NC(=N)C=O,[NH]C(=[NH2])C=O,5.631463
2,80,[NH3+]CC([O-])=O,NCC(=O)O,[NH3]CC(=O)[O],10.093600
3,185,N=C1NC=CO1,[NH][C@@H]1NC=CO1,N=c1occ[nH]1,1.588400
4,186,N=C1OC=CO1,[NH][C@@H]1OC=CO1,N=c1occo1,1.785040
...,...,...,...,...,...
3049,133858,CC1N2C3C4=CCC13C24,C[C@H]1[C]2C[CH][C]3[C@@H]2N1[CH]3,C[C@@H]1N2[C@H]3[C@@]41[C@@H]2[C]3[CH]C4,36.267950
3050,133860,OC1C2C3C4=CCC13C24,O[C@@H]1[C@H]2[C@]31C[CH][C]1[C@H]2[C@H],O[C@@H]1[C@H]2[C@]31C[CH][C][C@@H]3[CH]2,27.088470
3051,133864,C1C2CC3OC4CC1C234,[CH]1C[C@@H]2C[C@H]3[C]2[C@@H](O1)C3,C1[C@@H]2C[C@H]3[C@@]42[C@@H]1C[C@H]4O3,44.102030
3052,133865,C1C2CN3CC4OC1C234,[CH]1CN2C[C@H]3[C]2[C@@H](O1)C3,C1[C@H]2CN3[C@]42[C@H]1O[C@@H]4C3,34.235130


In [77]:
# need to canonicalise SMILES
def canonical_SMILES(SMILES):
    """Return the RDKit canonical form of a SMILES string.

    Parameters
    ----------
    SMILES : str
        SMILES string to canonicalise.

    Returns
    -------
    str
        The canonical SMILES, or the placeholder ``"NoSMILES"`` when RDKit
        cannot build a molecule from the input.
    """
    mol = Chem.MolFromSmiles(SMILES)
    # MolFromSmiles signals a parse/sanitisation failure by returning None.
    # Test for that directly rather than letting MolToSmiles(None) raise and
    # catching everything (a bare except also swallows KeyboardInterrupt).
    if mol is None:
        return "NoSMILES"
    return Chem.MolToSmiles(mol, canonical=True)

In [88]:
# change columns to canonical versions
def canon_df(df):
    """Return a copy of ``df`` with its three SMILES columns canonicalised.

    ``canonical_SMILES`` is applied element-wise to ``SMILES_GDB``,
    ``SMILES_B3LYP`` and ``SMILES_CORINA``. The input frame is not modified.
    """
    c_df = df.copy()
    for column in ("SMILES_GDB", "SMILES_B3LYP", "SMILES_CORINA"):
        c_df[column] = c_df[column].apply(canonical_SMILES)
    return c_df

In [108]:
# Canonicalise the three SMILES columns of df; entries RDKit cannot parse
# come back as the placeholder "NoSMILES".
c_df = canon_df(df)
c_df

RDKit ERROR: [14:58:59] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [14:58:59] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [14:58:59] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [14:58:59] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [14:58:59] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [14:58:59] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [14:58:59] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [14:58:59] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 2 

RDKit ERROR: [14:59:00] Explicit valence for atom # 2 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 8 

RDKit ERROR: [14:59:00] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 8 

RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [14:59:00] Explicit valence for atom # 1 

Unnamed: 0,Index,SMILES_GDB,SMILES_B3LYP,SMILES_CORINA,distance
0,58,N#CC(=N)N,N#CC(=N)N,NoSMILES,6.036217
1,61,N=C(N)C=O,N=C(N)C=O,NoSMILES,5.631463
2,80,[NH3+]CC(=O)[O-],NCC(=O)O,NoSMILES,10.093600
3,185,N=c1[nH]cco1,[NH][C@@H]1NC=CO1,N=c1[nH]cco1,1.588400
4,186,N=c1occo1,[NH]C1OC=CO1,N=c1occo1,1.785040
...,...,...,...,...,...
3049,133858,CC1N2C3C4=CCC13C42,C[C@H]1[C]2C[CH][C]3[CH]N1[C@@H]32,C[C@@H]1N2[C@H]3[C]4[CH]C[C@]13[C@@H]42,36.267950
3050,133860,OC1C2C3C4=CCC13C42,NoSMILES,O[C@@H]1[C@@H]2[CH][C@H]3[C][CH]C[C@@]312,27.088470
3051,133864,C1C2CC3OC4CC1C234,[CH]1C[C@@H]2C[C@@H]3C[C@H](O1)[C]23,C1[C@H]2C[C@H]3O[C@H]4C[C@@H]1[C@@]243,44.102030
3052,133865,C1C2CN3CC4OC1C243,[CH]1CN2C[C@@H]3C[C@H](O1)[C]32,C1[C@H]2CN3C[C@H]4O[C@@H]1[C@]243,34.235130


In [133]:
# check if SMILES_GDB and SMILES_B3LYP are the same and whether distance has 
# anything to do with that
# "NoSMILES" is the placeholder stored when a SMILES could not be parsed, so
# two placeholders comparing equal must not be counted as the same molecule.
# If the strings are equal and the GDB one is not the placeholder, neither is
# the B3LYP one, so a single sentinel test is enough.
c_df["Boolean"] = (c_df["SMILES_GDB"] == c_df["SMILES_B3LYP"]) & (
    c_df["SMILES_GDB"] != "NoSMILES"
)
c_df

Unnamed: 0,Index,SMILES_GDB,SMILES_B3LYP,SMILES_CORINA,distance,Boolean
0,58,N#CC(=N)N,N#CC(=N)N,NoSMILES,6.036217,True
1,61,N=C(N)C=O,N=C(N)C=O,NoSMILES,5.631463,True
2,80,[NH3+]CC(=O)[O-],NCC(=O)O,NoSMILES,10.093600,False
3,185,N=c1[nH]cco1,[NH][C@@H]1NC=CO1,N=c1[nH]cco1,1.588400,False
4,186,N=c1occo1,[NH]C1OC=CO1,N=c1occo1,1.785040,False
...,...,...,...,...,...,...
3049,133858,CC1N2C3C4=CCC13C42,C[C@H]1[C]2C[CH][C]3[CH]N1[C@@H]32,C[C@@H]1N2[C@H]3[C]4[CH]C[C@]13[C@@H]42,36.267950,False
3050,133860,OC1C2C3C4=CCC13C42,NoSMILES,O[C@@H]1[C@@H]2[CH][C@H]3[C][CH]C[C@@]312,27.088470,False
3051,133864,C1C2CC3OC4CC1C234,[CH]1C[C@@H]2C[C@@H]3C[C@H](O1)[C]23,C1[C@H]2C[C@H]3O[C@H]4C[C@@H]1[C@@]243,44.102030,False
3052,133865,C1C2CN3CC4OC1C243,[CH]1CN2C[C@@H]3C[C@H](O1)[C]32,C1[C@H]2CN3C[C@H]4O[C@@H]1[C@]243,34.235130,False


In [134]:
# Number of rows flagged True in the "Boolean" column (True counts as 1 when summed).
n_true = int(c_df["Boolean"].sum())
n_true

257

In [140]:
def bad_molecule_df(c_df):
    """Return the rows of ``c_df`` whose ``Boolean`` flag is False.

    The ``Boolean`` column is dropped from the result and the rows are
    re-indexed from 0. The input frame is left untouched.
    """
    flagged_false = c_df["Boolean"].eq(False)
    return (
        c_df[flagged_false]
        .drop(columns=["Boolean"])
        .reset_index(drop=True)
    )

In [141]:
# Keep only the rows of c_df whose "Boolean" flag is False, re-indexed from 0.
b_df = bad_molecule_df(c_df)
b_df

Unnamed: 0,Index,SMILES_GDB,SMILES_B3LYP,SMILES_CORINA,distance
0,80,[NH3+]CC(=O)[O-],NCC(=O)O,NoSMILES,10.093600
1,185,N=c1[nH]cco1,[NH][C@@H]1NC=CO1,N=c1[nH]cco1,1.588400
2,186,N=c1occo1,[NH]C1OC=CO1,N=c1occo1,1.785040
3,267,C[NH2+]CC(=O)[O-],CNCC(=O)O,NoSMILES,8.290945
4,286,CC([NH3+])C(=O)[O-],C[C@@H](N)C(=O)O,NoSMILES,12.344910
...,...,...,...,...,...
2792,133858,CC1N2C3C4=CCC13C42,C[C@H]1[C]2C[CH][C]3[CH]N1[C@@H]32,C[C@@H]1N2[C@H]3[C]4[CH]C[C@]13[C@@H]42,36.267950
2793,133860,OC1C2C3C4=CCC13C42,NoSMILES,O[C@@H]1[C@@H]2[CH][C@H]3[C][CH]C[C@@]312,27.088470
2794,133864,C1C2CC3OC4CC1C234,[CH]1C[C@@H]2C[C@@H]3C[C@H](O1)[C]23,C1[C@H]2C[C@H]3O[C@H]4C[C@@H]1[C@@]243,44.102030
2795,133865,C1C2CN3CC4OC1C243,[CH]1CN2C[C@@H]3C[C@H](O1)[C]32,C1[C@H]2CN3C[C@H]4O[C@@H]1[C@]243,34.235130


In [144]:
# only need to export indices of bad molecules 
# Pull the "Index" column out as a plain Python list.
indices = b_df["Index"].tolist()
indices

[80,
 185,
 186,
 267,
 286,
 752,
 753,
 754,
 755,
 881,
 884,
 885,
 888,
 891,
 894,
 897,
 898,
 906,
 907,
 908,
 914,
 1003,
 1102,
 1117,
 1126,
 1129,
 1232,
 1259,
 1265,
 1281,
 1284,
 1290,
 1596,
 1600,
 1675,
 1680,
 1684,
 1685,
 1686,
 1806,
 1994,
 2356,
 2357,
 2386,
 3704,
 3751,
 3763,
 3771,
 3773,
 3831,
 3838,
 3842,
 3885,
 3886,
 3891,
 3895,
 3899,
 3941,
 3945,
 4010,
 4239,
 4240,
 4266,
 4267,
 4276,
 4277,
 4278,
 4290,
 4291,
 4292,
 4293,
 4303,
 4329,
 4392,
 4395,
 4396,
 4422,
 4423,
 4427,
 4429,
 4430,
 4435,
 4436,
 4439,
 4441,
 4444,
 4445,
 4450,
 4451,
 4555,
 4558,
 4561,
 4607,
 4613,
 4623,
 4625,
 4626,
 4627,
 4628,
 4629,
 4635,
 4636,
 4807,
 4871,
 4872,
 4876,
 4879,
 4880,
 4883,
 4885,
 4888,
 4891,
 4892,
 4895,
 4898,
 4899,
 4939,
 4940,
 4997,
 5003,
 5006,
 5011,
 5013,
 5014,
 5016,
 5020,
 5023,
 5026,
 5032,
 5034,
 5040,
 5100,
 5101,
 5367,
 5370,
 5373,
 5374,
 5418,
 5419,
 5557,
 5598,
 5599,
 5631,
 5642,
 5730,
 6006,


In [146]:
# write to file 
filename = "../data/bad_molecules.txt"
with open(filename, "w") as f:
    # One index per line; print appends the newline after each value.
    for idx in indices:
        print(idx, file=f)