In [1]:
import joblib, argparse, uuid, sigopt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from sklearn import preprocessing
from utils.sklearn_utils import *
import matplotlib.pyplot as plt
import seaborn as sns

import selfies as sf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers,regularizers

from utils.selfies_util import selfies,smile_to_hot, \
multiple_smile_to_hot, selfies_to_hot, multiple_selfies_to_hot,\
get_selfie_and_smiles_encodings_for_dataset, compare_equality, tanimoto_dist



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:

names, ret, homo, homo1, diff = selfies()
print(len(names))

selfies_list, selfies_alphabet, largest_selfies_len,\
smiles_list, smiles_alphabet, largest_smiles_len\
= get_selfie_and_smiles_encodings_for_dataset(names)


data = multiple_selfies_to_hot(selfies_list, largest_selfies_len,\
                                       selfies_alphabet)


max_mol_len = data.shape[1]
alpha_len = data.shape[2]
len_alphabet_mol = alpha_len * max_mol_len

..........converting xyz to smiles.......
 3007 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 9763 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 18172 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 25109 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 31997 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 39207 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 46327 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 53463 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 58294 /61492

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 1526 /636782

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 5579 /63678

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 9496 /63678

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 13292 /63678

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 17121 /63678

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 2760 / 61182

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 26702 / 61182

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 56232 / 61182

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 61181 / 61182

smiles length: 61180


 61178 /6118058183
58183
58183
58183
58183
58183
--> Translating SMILES to SELFIES...
Finished translating SMILES to SELFIES.


In [6]:
print(len(names))
print(len(data))

58183
58183


In [4]:

# removes functional groups from quinone backbone, this works for IDing what functional 
# groups are in a given file, also reconstructs
# TODO: generate structure from molecules
arom = Chem.MolFromSmiles("c1(ccc(cc1))")
quinone = Chem.MolFromSmiles("c1(ccc(cc1)[O])[O]")
quinone2 = Chem.MolFromSmiles("C1=CC(=O)C=CC1=O")



count1 = 0
count2 = 0
count3 = 0

count_fail_no_match = 0
count_fail = 0 
pattern = "(*)"
pattern_double = "[*]"
functional_list = []

for i, can_smi in enumerate(names):
    
    try:
        temp = Chem.MolFromSmiles(can_smi)
        rm = Chem.DeleteSubstructs(temp, quinone)
        rm2 = Chem.DeleteSubstructs(temp, arom)
        rm3 = Chem.DeleteSubstructs(temp, quinone2)

        #print(can_smi)
        if (len(Chem.MolToSmiles(rm3).split(".")) > 1):
            count1 = count1 + 1
            [functional_list.append(i) for i in Chem.MolToSmiles(rm3).split(".")]
        else: 
            if(len(Chem.MolToSmiles(rm2).split(".")) > 1):
                count2 = count2 + 1
                [functional_list.append(i) for i in Chem.MolToSmiles(rm2).split(".")]

            else:
                if(len(Chem.MolToSmiles(rm).split(".")) > 1):
                    count3 = count3 + 1
                    [functional_list.append(i) for i in Chem.MolToSmiles(rm).split(".")]
                    
                else:
                    pieces_smi = Chem.BRICS.BRICSDecompose(temp)
                    pieces = [Chem.MolFromSmiles(x) for x in BRICS.BRICSDecompose(temp)]
                    count_fail_no_match += 1
                    print(can_smi)

                    
    except:
        count_fail += 1
print(list(set(functional_list))) #retrieve only the found functional groups
print(len(list(set(functional_list))))
print(count1, count2, count3)
print("total processed: "+ str(count1+count2+count3))
print("no substructured: "+ str(count_fail_no_match))
print("fail processed: "+ str(count_fail))

['FC(F)F', 'CC(N)=O', 'C#N', '[O]', 'CO', 'Nc1ccccc1', 'N', 'c1ccccc1', 'Br', 'CC', 'Cl', 'COC=O', 'O', 'CNC', 'O=[NH+][O-]', 'Oc1ccccc1', 'Cc1ccccc1', 'C', 'CN', 'CC([O])(O)O', 'F', 'CC(=O)O', 'CC(C)C']
23
9251 25398 0
total processed: 34649
no substructured: 0
fail processed: 23534


In [5]:
from rdkit.Chem import BRICS
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import Recap
from rdkit.Chem import rdRGroupDecomposition as rdRGD

#print(len(list(set(functional_list))))
#print(Chem.MolFromSmiles(names[10]))

can_smi = names[100]
mol_set = [Chem.MolFromSmiles(can_smi) for can_smi in names]
quinone = Chem.MolFromSmiles("c1(ccc(cc1)[O])[O]")
quinone_2 = Chem.MolFromSmiles("C1=CC(=O)C=CC1=O")
quinone_3 = Chem.MolFromSmiles("C1=CC(=O)CCC1=O")
quinone_4 = Chem.MolFromSmiles("C1CC(=O)CCC1=O")
quinone_5 = Chem.MolFromSmiles("C1=CC(=O)C=CC1[O]")
quinone_6 = Chem.MolFromSmiles("C1CC(=O)CC=C1[O]")
quinone_7 = Chem.MolFromSmiles("C1CC(=O)CCC1[O]")
quinone_8 = Chem.MolFromSmiles("[CH]=1[CH]C(=O)C=CC1[O]")


test = Chem.MolFromSmiles("[CH]=1[CH]C(=O)C=CC1[O]")
test2 = Chem.MolFromSmiles("C1(=C[CH]C(=O)C(=C1OC)[O])")
test3 = Chem.MolFromSmiles("C1(=O)[C]=[C]C(=O)[C]=[C]1")

temp = Chem.MolFromSmiles("c1(c(c(c(cc1)[O])C(C)(C)C)N)[O]")
temp_fail = Chem.MolFromSmiles("C1(=C[CH]C(=O)C(=C1OC)C(F)(F)F)[O]")
#temp_fail_trial = Chem.MolFromSmiles("C1(=C[CH]C(=O)C(=C1OC))[O]")

#res, unmatched = rdRGD.RGroupDecompose([quinone], [temp_fail], asSmiles=True)
#print(len(unmatched))
#print(len(res))
#res, unmatched = rdRGD.RGroupDecompose([quinone2], [temp_fail], asSmiles=True)
#print(len(unmatched))
#print(len(res))
#res, unmatched = rdRGD.RGroupDecompose([test], [temp_fail], asSmiles=True)
#print(len(unmatched))
print(len(res))
#res, unmatched = rdRGD.RGroupDecompose([test2], [temp_fail], asSmiles=True)
#print(len(unmatched))
#print(len(res))
#res, unmatched = rdRGD.RGroupDecompose([test3], [temp_fail_trial], asSmiles=True)
#print(len(unmatched))
#print(len(res))



RDKit ERROR: [15:24:01] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [15:24:01] Explicit valence for a

RDKit ERROR: [15:24:05] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [15:24:05] Explicit valence for ato

RDKit ERROR: [15:24:06] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:24:06] Explicit valence for atom # 6 

NameError: name 'res' is not defined

In [21]:
mol_set = [Chem.MolFromSmiles(can_smi) for can_smi in names]

res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = [], [], [], [], [], [], [], []
fail_list = []
frag_list = []
homo_frag = []
homo1_frag = []
diff_frag = []
success_name = []

for ind,i in enumerate(mol_set):
        fail = 0 
        try:
            res1, unmatched = rdRGD.RGroupDecompose([quinone], [i], asSmiles=True)
        except:pass
        try:
            res2, unmatched = rdRGD.RGroupDecompose([quinone_2], [i], asSmiles=True)
        except:pass
        try:
            res3, unmatched = rdRGD.RGroupDecompose([quinone_3], [i], asSmiles=True)
        except:pass
        try:
            res4, unmatched = rdRGD.RGroupDecompose([quinone_4], [i], asSmiles=True)
        except:pass
        try:
            res5, unmatched = rdRGD.RGroupDecompose([quinone_5], [i], asSmiles=True)
        except:pass
        try:
            res6, unmatched = rdRGD.RGroupDecompose([quinone_6], [i], asSmiles=True)
        except:pass
        try:
            res7, unmatched = rdRGD.RGroupDecompose([quinone_7], [i], asSmiles=True)
        except:pass
        try:
            res8, unmatched = rdRGD.RGroupDecompose([quinone_8], [i], asSmiles=True)
        except:pass
     
        if(len(res8) > 1 or len(res7) > 1 or len(res6) > 1 or len(res5) > 1 or len(res4) > 1 or len(res3) > 1 or len(res2) > 1 or len(res1) > 1):
            print("longer than 1, shit's wrong")
            
        if(len(res1) != 0):
            frag_list.append(res1)
        else: 
            if(len(res2) != 0):
                frag_list.append(res2)
            else:
                if(len(res3) != 0):
                    frag_list.append(res3)
                else:
                    if(len(res4) != 0):
                        frag_list.append(res4)
                    else:
                        if(len(res5) != 0):
                            frag_list.append(res5)
                        else:
                            if(len(res6) != 0):
                                frag_list.append(res6)
                            else:
                                if(len(res7) != 0):
                                    frag_list.append(res7)
                                else:
                                    if(len(res8) != 0):
                                        frag_list.append(res8)
                                    else:
                                        fail_list.append(i)
                                        fail = 1
        if(fail == 0):
            homo_frag.append(homo[ind])
            homo1_frag.append(homo1[ind])
            diff_frag.append(diff[ind])
            success_name.append(names[ind])
            fail = 0
                    
print(len(frag_list))
print(len(homo_frag))
print(len(homo1_frag))
print(len(diff_frag))

RDKit ERROR: [16:11:07] Explicit valence for atom # 9 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [16:11:07] Explicit valence for a

RDKit ERROR: [16:11:11] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 8 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for ato

RDKit ERROR: [16:11:11] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 11 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [16:11:11] Explicit valence fo

58152
58152
58152
58152


In [22]:
print(names[-100])
print(frag_list[-100])

print(frag_list[-100][0].get("R3"))
print(frag_list[-100][0].get("R2"))
print(frag_list[-100][0].get("R1"))
print(frag_list[-100][0].get("R2").split("[")[0] )

C1(=O)C(=C(C(=C([N](C)(C)C)[CH]1)[O])OC(=O)C)F
[{'Core': 'O=C1C=C([*:1])C(=O)C([*:2])=C1[*:3]', 'R1': 'N#C[*:1]', 'R2': 'N#C[*:2]', 'R3': 'N#C[*:3]'}]
N#C[*:3]
N#C[*:2]
N#C[*:1]
N#C


In [23]:
r1_list = []
r2_list = []
r3_list = []
r4_list = []

for i in frag_list:
    temp_r1, temp_r2, temp_r3, temp_r4 = '', '', '', ''
    
    try:
        temp_r1 = i[0].get("R1")
        if(len(split_list) == 2):
            r1_list.append(split_list[0])
        else:
            pass
            #r1_list.append(split_list[0] + split_list[-1])

    except:pass
    
    try:
        temp_r2 = i[0].get("R2") 
        if(len(split_list) == 2):
            r2_list.append(split_list[0])
        else:
            pass
            #r2_list.append(split_list[0] + split_list[-1])

    except:pass

    try:
        temp_r3 = i[0].get("R3")
        if(len(split_list) == 2):
            r3_list.append(split_list[0])
        else:
            pass
            #r3_list.append(split_list[0] + split_list[-1])

    except:pass

    try:
        temp_r4 = i[0].get("R4")
        split_list = temp_r4.split("[*:4]")
        if(len(split_list) == 2):
            r4_list.append(split_list[0])
        else:
            pass
            #r4_list.append(split_list[0] + split_list[-1])
    except:pass
    
    #print(temp_r1, temp_r2, temp_r3, temp_r4)


In [28]:
print(len(r1_list), len(r2_list), len(r3_list), len(r4_list), len(success_name))
print(r1_list[0], r2_list[0], r3_list[0], r4_list[0])
print(names[0])


58152 58152 58152 51739 58152
[H]N(C(=O)C([H])([H])[H]) [H]N(C(=O)C([H])([H])[H]) [H]N(C(=O)C([H])([H])[H]) Br
c1(ccc(cc1)[O])[O]


In [None]:
selfies_list, selfies_alphabet, largest_selfies_len,\
smiles_list, smiles_alphabet, largest_smiles_len\
= get_selfie_and_smiles_encodings_for_dataset(r4_list)

In [None]:
quinone_8 = Chem.MolFromSmiles("[CH]=1[CH]C(=O)C=CC1[O]")
quinone_3 = Chem.MolFromSmiles("C1=CC(=O)CCC1=O")
quinone_4 = Chem.MolFromSmiles("C1CC(=O)CCC1=O")
quinone_5 = Chem.MolFromSmiles("C1=CC(=O)C=CC1[O]")
quinone_6 = Chem.MolFromSmiles("C1CC(=O)CC=C1[O]")
quinone_7 = Chem.MolFromSmiles("C1CC(=O)CCC1[O]")
quinone = Chem.MolFromSmiles("c1(ccc(cc1)[O])[O]")
quinone2 = Chem.MolFromSmiles("C1=CC(=O)C=CC1=O")

Draw.MolsToGridImage([quinone, quinone2, quinone_3,
                     quinone_4, quinone_5,
                     quinone_6, quinone_7, quinone_8])

In [None]:
print(len(fail_list))
Draw.MolsToGridImage(fail_list[20:40],molsPerRow=4)

In [None]:
rm = Chem.DeleteSubstructs(temp, quinone)
Draw.MolsToImage([rm])
quinone2 = Chem.MolFromSmiles("C1=CC(=O)C=CC1=O")
quinone = Chem.MolFromSmiles("c1(ccc(cc1)[O])[O]")
Draw.MolsToImage([quinone])


In [None]:

rm2 = Chem.DeleteSubstructs(temp, arom)
Draw.MolsToImage([rm2])


In [None]:

rm3 = Chem.DeleteSubstructs(temp, quinone2)
Draw.MolsToImage([rm3])

In [None]:
count = 0;
for i in list(set(functional_list)):
    if(len(i)< 15):
        count += 1
print(count)

In [None]:
import os 
files = os.listdir("../data/sdf/DB3")
files = [file.split(".")[0] for file in files]
for split_str in files: 
    split_str.split("_")[0]
    if(split_str.split("_")[0] == "tetra"):
        groups = split_str.split("_")[1:5]
    elif(split_str.split("_")[0] == "tris"):
        groups = [split_str.split("_")[1:4], "H"]
    elif(split_str.split("_")[0] == "bis-23"):
        groups = [split_str.split("_")[1], split_str.split("_")[2], "H", "H"]
    elif(split_str.split("_")[0] == "bis-25"):
        groups = [split_str.split("_")[1], "H", split_str.split("_")[2], "H"]
    elif(split_str.split("_")[0] == "bis-26"):
        groups = [split_str.split("_")[1], "H", "H", split_str.split("_")[2]]
    elif(split_str.split("_")[0] == "mono"):
        groups = [split_str.split("_")[1], "H", "H", "H"]
    else: 
        groups = ["H", "H", "H", "H"]
    
    #print(split_str.split("_")[0])
    

In [None]:
from rdkit.Chem import Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole

mol_list = []
files = os.listdir("../data/sdf/DB3/")
quinone_neg = Chem.MolFromSmiles("c1(ccc(cc1)[O-])[O-]")
quinone_neg
for i in files:
    if(i[0:4] == "mono"):

        #print(i)
        mol =  Chem.SDMolSupplier("../data/sdf/DB3/" + i)[0]
        rm = Chem.DeleteSubstructs(mol, quinone2)
        #print(Chem.MolToSmiles(rm))
        if( Descriptors.MolWt(rm) > 120):
            rm = Chem.DeleteSubstructs(mol, quinone_neg)
        mol_list.append(rm)
            
img = Draw.MolsToGridImage(mol_list, molsPerRow=5, subImgSize=(150, 150), 
                           legends=None, useSVG=True)

In [None]:
img # should print only the sustituent fragments