# Description

This notebook is used to process and evaluate the initial universe of generated SMILES, and then to retrain the network, applying techniques and principles from both transfer learning and genetic algorithms, to progressively improve molecule generation for the specific task of binding with the coronavirus protease.

## First process initial generated smiles for PyRx analysis

In [1]:
import pandas as pd
from rdkit import Chem, DataStructs
import random
import numpy as np
import rdkit.Chem.PropertyMol
from rdkit.Chem import Descriptors

In [None]:
# Read the generation-0 SMILES file (no header row) and keep at most the
# first 10,000 entries of its first column.
gen0_table = pd.read_csv('./generations/gen0.smi', header=None, sep=',')
gen0 = list(gen0_table[0].iloc[:10000])
len(gen0)

In [2]:
def validate_mols(list_of_smiles):
    """Parse SMILES strings with RDKit and keep only those that parse.

    Each entry is passed to ``Chem.MolFromSmiles``; entries for which it
    returns ``None`` are dropped. Input order is preserved, so the result may
    be shorter than the input.
    """
    parsed = (Chem.MolFromSmiles(smi) for smi in list_of_smiles)
    return [mol for mol in parsed if mol is not None]

def convert_mols_to_smiles(list_of_mols):
    """Return the ``Chem.MolToSmiles`` string for each molecule, in input order."""
    valid_smiles = []
    for mol in list_of_mols:
        valid_smiles.append(Chem.MolToSmiles(mol))
    return valid_smiles

In [None]:
# Parse the generation-0 SMILES with RDKit; entries that fail to parse are dropped.
gen0_mols = validate_mols(gen0)
len(gen0_mols)

In [3]:
def initialize_generation_from_mols(list_of_mols,desired_length):
    """Pick a structurally diverse subset of molecules for a new generation.

    The input list is shuffled in place and its first thirty molecules are
    taken as a seed. Further molecules are admitted only while their maximum
    Tanimoto similarity (on RDKit fingerprints) to everything already selected
    is at or below a threshold. The threshold starts at 0.05 and is raised by
    0.05 after every pass over the remaining candidates. A candidate whose
    maximum similarity reaches 1 is never admitted.

    Selection stops as soon as ``desired_length`` molecules have been chosen,
    or when no candidates are left, whichever comes first. The result can
    therefore be shorter than ``desired_length`` when the input cannot supply
    enough distinct molecules (previously this case looped forever).

    Args:
        list_of_mols: list of RDKit molecules; shuffled in place.
        desired_length: number of molecules wanted; must be greater than 30.

    Returns:
        The list of selected molecules.
    """
    assert desired_length > 30, "desired_length must be greater than 30"
    random.shuffle(list_of_mols)
    random.shuffle(list_of_mols)

    # Prepare fingerprints for similarity calcs
    mol_fingerprints = [Chem.RDKFingerprint(mol) for mol in list_of_mols]

    selected_mols = list_of_mols[0:30]
    selected_fingerprints = mol_fingerprints[0:30]
    # Candidates still waiting for the threshold to rise far enough.
    candidates = list(zip(mol_fingerprints[30:], list_of_mols[30:]))

    similarity_threshold = .05
    while candidates and len(selected_mols) < desired_length:
        deferred = []
        for fingerprint, mol in candidates:
            if len(selected_mols) >= desired_length:
                break
            max_similarity = np.max(DataStructs.BulkTanimotoSimilarity(fingerprint,selected_fingerprints))
            if max_similarity >= 1:
                # Similarity to the selected set can only grow as the set
                # grows, so this candidate can never qualify; drop it.
                continue
            if max_similarity <= similarity_threshold:
                selected_fingerprints.append(fingerprint)
                selected_mols.append(mol)
            else:
                deferred.append((fingerprint, mol))
        # Only undecided candidates are rescanned on the next pass, which also
        # guarantees termination once the threshold reaches 1.
        candidates = deferred
        print("Completed loop with threshold at: ", similarity_threshold, ". Length is currently: ", len(selected_mols))
        similarity_threshold += .05
    return selected_mols

In [None]:
# Reduce generation 0 to a diverse subset, asking the selector for 1000 molecules.
gen0_mols = initialize_generation_from_mols(gen0_mols,1000)
print(len(gen0_mols))

In [None]:
# Load the master tracking table and show how many rows it currently holds.
master_table = pd.read_csv('./generations/master_results_table.csv',sep=',')
master_table.shape[0]

In [4]:
def iterate_alpha(alpha_code):
    """Return the four-letter tracking code that follows ``alpha_code``.

    Molecule names sorted oddly in the PyRx GUI when numbered numerically, so
    molecules are ordered by a four-letter code instead. The code advances
    like an odometer over 'A'..'Z': the fourth letter is incremented, and a
    position that would pass 'Z' wraps to 'A' and carries into the position
    to its left.

    Raises:
        ValueError: if all four positions would pass 'Z' (e.g. 'ZZZZ').
    """
    codes = [ord(letter) for letter in alpha_code]

    # Work leftwards from the fourth letter, carrying while positions wrap.
    for position in (3, 2, 1, 0):
        if codes[position] + 1 > 90:  # 90 == ord('Z')
            codes[position] = 65  # 65 == ord('A')
        else:
            codes[position] += 1
            break
    else:
        # Every position wrapped: there is no successor within four letters.
        raise ValueError('Too long for alpha code')

    return "".join(chr(code) for code in codes)
iterate_alpha('AAAA')

'AAAB'

In [5]:
def append_to_tracking_table(master_table,mols_to_append, source, generation):
    """Assign tracking IDs to molecules and build their tracking-table rows.

    IDs continue after the highest existing ``id`` among ``master_table`` rows
    whose ``gen`` equals ``generation``, or start at 'AAAA' when there are
    none. Each molecule is wrapped in a ``PropertyMol`` whose 'Title' property
    is ``id<CODE>gen<generation>``, so it can be identified in the PyRx GUI
    and in PyRx result exports.

    A row's ``source`` is recorded as 'training' when its SMILES occurs in
    ./datasets/dataset_cleansed.smi, unless ``source`` is 'hiv', 'manual' or
    'baseline'. Every new row gets the placeholder score 99.9.

    Returns:
        (df, mols_to_export): a DataFrame of the new rows (``master_table``
        itself is not modified) and the list of titled ``PropertyMol`` objects.
    """
    # Work out where ID numbering resumes for this generation.
    rows_for_generation = master_table[master_table['gen'] == generation]
    if rows_for_generation.shape[0] == 0:
        id_code = 'AAAA'
    else:
        ids_ascending = rows_for_generation.sort_values('id', ascending=True)['id']
        id_code = iterate_alpha(str(ids_ascending.iloc[-1]))

    training_data = pd.read_csv('./datasets/dataset_cleansed.smi', header=None)
    training_set = set(training_data[0])

    # Sources that are never relabelled as 'training'.
    exempt_sources = ('hiv', 'manual', 'baseline')

    mols_to_export = []
    rows_list = []
    for mol in mols_to_append:
        pm = Chem.PropertyMol.PropertyMol(mol)
        title = 'id' + str(id_code) + 'gen' + str(generation)
        print(title)
        # The title is what identifies the molecule in PyRx and its exports.
        pm.SetProp('Title', title)
        mols_to_export.append(pm)

        smile = Chem.MolToSmiles(mol)
        assert type(smile) is str

        if source not in exempt_sources and smile in training_set:
            row_source = 'training'
        else:
            row_source = source

        # Mirror the molecule in the pandas tracking rows.
        rows_list.append({
            'id': id_code,
            'gen': generation,
            'smile': smile,
            'source': row_source,
            'score': 99.9,
        })
        id_code = iterate_alpha(id_code)

    return pd.DataFrame(rows_list), mols_to_export

In [None]:
# Register the selected generation-0 molecules in the tracking table.
new_mols_to_test = append_to_tracking_table(master_table,gen0_mols, 'generated', 0)
mols_for_pd = new_mols_to_test[0]
mols_for_export = new_mols_to_test[1]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# and works on older pandas versions too.
master_table = pd.concat([master_table, mols_for_pd])
len(mols_for_export)

In [None]:
# Renumber the rows from 0 and save the table without writing the index column.
master_table = master_table.reset_index(drop=True)
master_table.to_csv(r'./generations/master_results_table.csv', index=False)

In [None]:
# Add HIV inhibitors manually into the table
hiv_smiles = pd.read_csv('./datasets/possible_inhibitors.smi',sep=',', header=None)
hiv_smiles = list(hiv_smiles[0])
hiv_mols = validate_mols(hiv_smiles)

# Reload the saved table so ID numbering continues from what is on disk.
master_table = pd.read_csv('./generations/master_results_table.csv',sep=',')
new_mols_to_test = append_to_tracking_table(master_table,hiv_mols, 'hiv', 0)
mols_for_pd = new_mols_to_test[0]
mols_for_export = mols_for_export + new_mols_to_test[1]

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# and works on older pandas versions too.
master_table = pd.concat([master_table, mols_for_pd])
master_table = master_table.reset_index(drop=True)
master_table.to_csv(r'./generations/master_results_table.csv', index=False)

In [None]:
# Add a few other smiles manually into the table ("control group" of training smiles)
manual_smiles = pd.read_csv('./datasets/manual_testing_cleaned.smi',sep=',', header=None)
manual_smiles = list(manual_smiles[0])
# Validate the manual SMILES themselves. This previously re-validated
# hiv_smiles, so the control group was never added and the HIV set was
# added a second time under the 'manual' source.
manual_mols = validate_mols(manual_smiles)

# Reload the saved table so ID numbering continues from what is on disk.
master_table = pd.read_csv('./generations/master_results_table.csv',sep=',')
new_mols_to_test = append_to_tracking_table(master_table,manual_mols, 'manual', 0)
mols_for_pd = new_mols_to_test[0]
mols_for_export = mols_for_export + new_mols_to_test[1]

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# and works on older pandas versions too.
master_table = pd.concat([master_table, mols_for_pd])
master_table = master_table.reset_index(drop=True)
master_table.to_csv(r'./generations/master_results_table.csv', index=False)

In [6]:
def _write_mols_to_sdf(path, mols):
    """Write ``mols`` to one SDF file at ``path`` and always close the writer."""
    writer = Chem.SDWriter(path)
    try:
        for mol in mols:
            writer.write(mol)
    finally:
        # close() flushes buffered output. Leaving writers open is what caused
        # the last record of a file to appear incomplete.
        writer.close()


def write_gen_to_sdf(mols_for_export, generation, batch_size):
    """Write a generation's molecules to SDF file(s) under ./generations/.

    When there are more molecules than ``batch_size``, they are split into
    consecutive batches of at most ``batch_size`` and written to
    ``gen<generation>_batch_<k>.sdf`` (k starting at 1). Otherwise everything
    goes into a single ``gen<generation>.sdf``.

    The number of batches is derived from ``batch_size``. It was previously
    derived from a hard-coded 1000, which dropped molecules for smaller batch
    sizes and produced empty files for larger ones. Every writer is closed
    after use, so the old workaround of writing a throw-away
    ./generations/junk/test.sdf file is no longer needed.

    Args:
        mols_for_export: sequence of RDKit molecules to write.
        generation: generation number used in the file name.
        batch_size: maximum number of molecules per file; must be positive.

    Returns:
        ``mols_for_export``, unchanged.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    total = len(mols_for_export)
    if total > batch_size:
        batches = -(-total // batch_size)  # ceiling division
        for i in range(batches):
            batch_to_export = mols_for_export[i*batch_size:(i+1)*batch_size]
            _write_mols_to_sdf(
                './generations/gen' + str(generation) + '_batch_' + str(i+1) + '.sdf',
                batch_to_export,
            )
    else:
        _write_mols_to_sdf('./generations/gen' + str(generation) + '.sdf', mols_for_export)

    return mols_for_export

In [None]:
# Export the generation-0 molecules to SDF, passing a batch size of 2000.
write_gen_to_sdf(mols_for_export, 0, 2000)
print('ok')

## NOW GO TO PyRx: Analyze the SDF file and create a csv of binding score results


## Afterwards, process binding simulation results to 'evolve' the molecules

In [7]:
# This number must be MANUALLY incremented each generation. The whole process
# has not been wrapped into a single function or loop yet; that would be the
# next step.
GLOBAL_GENERATION = 7

In [8]:
# Reload the tracking table saved for the previous generation (GLOBAL_GENERATION-1).
master_table = pd.read_csv('./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) + '.csv',sep=',')
master_table.tail()

Unnamed: 0,id,gen,smile,source,weight,score
3304,AARM,6,O=C(NC1CC2CC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=O...,generated,,99.9
3305,AARN,6,CC1(CC(CC(O)C2(O)CCCC2)C(O)CO)CCCCCCCCCC1(O)C(...,generated,,99.9
3306,AARO,6,O=C1CC2CCC(N1)N2CCc1ccccc1,generated,,99.9
3307,AARP,6,CC1(C)CCCC2(C)C(CCC3CCCC3)CCCC12,generated,,99.9
3308,AARQ,6,CC1(CC(CC(O)C2(O)CCCC2)C(O)CO)CC1CCC12CC3CC(CC...,generated,,99.9


In [9]:
# Load the binding-score results CSV for the previous generation (GLOBAL_GENERATION-1).
new_scores = pd.read_csv('./generations/results/results_gen' + str(GLOBAL_GENERATION-1) + '.csv',sep=',')
new_scores.head()

Unnamed: 0,Ligand,Binding Affinity,rmsd/ub,rmsd/lb
0,6lu7_idAALEgen6_uff_E=7109711649.77,-18.3,0.0,0.0
1,6lu7_idAALEgen6_uff_E=7109711649.77,-17.9,4.841,2.076
2,6lu7_idAAPQgen6_uff_E=7075445396.28,-18.1,0.0,0.0
3,6lu7_idAALEgen6_uff_E=7109711649.77,-11.0,3.872,2.341
4,6lu7_idAAPQgen6_uff_E=7075445396.28,-10.9,2.803,1.242


In [10]:
# Keep one score per ligand: the minimum binding affinity over its result rows.
new_scores = new_scores.groupby("Ligand")["Binding Affinity"].min().reset_index()

# The second '_'-separated field of the ligand name has the form
# 'id<CODE>gen<N>'; split it into the tracking id and the generation.
ligand_tag = new_scores['Ligand'].str.split("_").str[1]
new_scores['id'] = ligand_tag.str.split("gen").str[0].str.split("id").str[1]
new_scores['gen'] = ligand_tag.str.split("gen").str[1]
new_scores['score'] = new_scores["Binding Affinity"]
new_scores = new_scores[['id','gen','score']]
new_scores.head()

Unnamed: 0,id,gen,score
0,AAGO,6,-4.3
1,AAGP,6,-5.8
2,AAGQ,6,-6.8
3,AAGR,6,-6.3
4,AAGS,6,-6.3


In [11]:
# Give both tables matching key dtypes so the merge keys compare equal.
new_scores['id'] = new_scores['id'].astype(str)
new_scores['gen'] = new_scores['gen'].astype(int)
master_table['id'] = master_table['id'].astype(str)
master_table['gen'] = master_table['gen'].astype(int)

# Left-merge so every tracked molecule is kept; where a new score exists it
# replaces the old one, otherwise the old score is carried over.
new_table = master_table.merge(new_scores, how='left', on=['id','gen'], suffixes=('_old','_new'))
new_table['score'] = new_table['score_new'].fillna(new_table['score_old'])
new_table = new_table.drop(columns=['score_old','score_new'])

# Recompute the molecular weight of every row from its SMILES.
new_table['weight'] = new_table['smile'].apply(lambda smi: Chem.Descriptors.MolWt(Chem.MolFromSmiles(smi)))
new_table = new_table.sort_values('score')
new_table.head()

Unnamed: 0,id,gen,smile,source,weight,score
3140,AALE,6,O=C(NC(Cc1ccccc1)C(=O)O)C1CC2CCC(C1)C2C(=O)NC1...,generated,776.931,-18.3
3256,AAPQ,6,O=C(NC1CC2CC(C1)N2CCc1ccccc1)C1CC2CCC(C1)N2CCc...,generated,457.662,-18.1
0,AABL,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,786.926,-17.9
2,AACA,10,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
1,AADG,9,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7


In [12]:
# Overwrite the previous generation's table file with the merged scores and recomputed weights.
new_table.to_csv(r'./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) + '.csv', index=False)

In [13]:
# Start the fine-tuning set for the molecule generator RNN with the distinct
# SMILES found among the first 35 rows of the score-sorted table.
training_smiles = list(set(new_table.head(35)['smile']))
len(training_smiles)

22

In [14]:
# Fingerprint every SMILES already chosen for training.
training_fingerprints = [
    Chem.RDKFingerprint(Chem.MolFromSmiles(smile)) for smile in training_smiles
]

def calc_similarity_score(row):
    """Scale a row's score by how dissimilar it is from the training picks.

    The score is multiplied by ``(1 / s) ** .333``, where ``s`` is the row's
    highest Tanimoto similarity to any fingerprint in the module-level
    ``training_fingerprints`` list.
    """
    row_fingerprint = Chem.RDKFingerprint(Chem.MolFromSmiles(row['smile']))
    closest = np.max(DataStructs.BulkTanimotoSimilarity(row_fingerprint, training_fingerprints))
    return row['score'] * (1 / closest) ** .333

# Only rows with weight below 900 are ranked by the adjusted score.
under_weight_cap = new_table['weight'] < 900
similarity_adjusted = new_table.copy(deep=True)[under_weight_cap]
similarity_adjusted['similarity_adj_score'] = similarity_adjusted.apply(calc_similarity_score, axis=1)
similarity_adjusted = similarity_adjusted.sort_values('similarity_adj_score')
similarity_adjusted.head()

Unnamed: 0,id,gen,smile,source,weight,score,similarity_adj_score
411,AAGJ,6,CC(C)(C)C1CCC(C2CCCCCC2(C)C)CCCC2(C)C(CCC3CCCC...,generated,661.2,-11.5,-24.697105
204,AABP,5,CC1(CC(CC(O)C2(O)CCCC2)C(O)CO)CCCCCCCCCCCCCCCC...,generated,763.286,-13.0,-23.467864
642,AAAE,7,CC1(C)CCCC2(C)C(CCC3CCCC3)CCCCCC(CCC3CCCC3)CCC...,generated,661.2,-10.3,-22.120016
649,AAAI,8,CC(C)(C)C1CC2(C(C)(C)C)CCCC2(C)C1CCC12CC3CC(CC...,generated,398.719,-10.3,-21.819331
643,AAAL,7,CC(C)(C)C1CC2(C(C)(C)C)CCCC2(C)C1CCC12CC3CC(CC...,generated,398.719,-10.3,-21.819331


In [15]:
# Select top X ranked by similarity-adjusted score for training data to refine the molecule generator RNN (ensures diversity)
training_smiles += list(similarity_adjusted.head(5)['smile'])
len(training_smiles)

27

In [16]:
def calc_weight_score(row):
    """Scale a row's score by ``(900 / weight) ** .333``.

    When that factor is below 1 (weight above 900) the adjusted score is 0
    instead of a scaled value.
    """
    weight_factor = (900 / row['weight']) ** .333
    return 0 if weight_factor < 1 else row['score'] * weight_factor

# Apply the weight adjustment to every row of a copy of the table and sort
# ascending by the adjusted score.
weight_adjusted = new_table.copy(deep=True)
weight_adjusted['weight_adj_score'] = weight_adjusted.apply(calc_weight_score, axis=1)
weight_adjusted = weight_adjusted.sort_values('weight_adj_score', ascending=True)
weight_adjusted.head()

Unnamed: 0,id,gen,smile,source,weight,score,weight_adj_score
0,AABL,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,786.926,-17.9,-18.718446
1,AADG,9,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7,-18.125294
2,AACA,10,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7,-18.125294
15,AABX,9,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,772.899,-17.2,-18.094489
13,AACC,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,772.899,-17.2,-18.094489


In [17]:
# Select top X ranked by weight-adjusted score for training data to refine the molecule generator RNN
training_smiles += list(weight_adjusted.head(5)['smile'])
len(training_smiles)

32

In [18]:
import tensorflow
# tensorflow.test.is_gpu_available() is deprecated in favour of
# tf.config.list_physical_devices('GPU'); a non-empty list means a GPU is visible.
len(tensorflow.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [19]:
import numpy as np
from copy import copy

import keras

from lstm_chem.utils.config import process_config
from lstm_chem.model import LSTMChem
from lstm_chem.generator import LSTMChemGenerator
from lstm_chem.trainer import LSTMChemTrainer
from lstm_chem.data_loader import DataLoader

In [20]:
# Generate some with the base original model: build a generator from the
# model described by ./config/config.json.
CONFIG_FILE = './config/config.json'
config = process_config(CONFIG_FILE)
modeler = LSTMChem(config, session='generate')
generator = LSTMChemGenerator(modeler)

Loading model architecture from ./config/model_arch.json ...
Loading model checkpoint from ./checkpoints/LSTM_Chem-baseline-model-full.hdf5 ...
Loaded the Model.


In [21]:
# Number of SMILES to request from the base-model generator.
sample_number = 20

In [22]:
# Sample sample_number SMILES from the base-model generator.
base_generated = generator.sample(num=sample_number)

  0%|          | 0/20 [00:00<?, ?it/s]







  5%|▌         | 1/20 [00:14<04:29, 14.16s/it]



 10%|█         | 2/20 [00:20<03:33, 11.83s/it]









 15%|█▌        | 3/20 [00:45<04:26, 15.67s/it]





100%|██████████| 20/20 [01:41<00:00,  5.10s/it]


In [23]:
# Keep only the base-model samples that RDKit can parse, convert them back to
# SMILES via RDKit, and shuffle them (twice) before picking.
base_generated_mols = validate_mols(base_generated)
base_generated_smiles = convert_mols_to_smiles(base_generated_mols)
random.shuffle(base_generated_smiles)
random.shuffle(base_generated_smiles)
# Select X for training data to refine the molecule generator RNN (ensures diversity)
training_smiles += base_generated_smiles[0:5]
len(training_smiles)

RDKit ERROR: [02:14:29] SMILES Parse Error: extra open parentheses for input: 'CC(C)C1(NC(=O)C(Cc2ccccc2)CC(CC2CCCCCC2)C(=O)NC(Cc2ccccc1)C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc'
RDKit ERROR: [02:14:29] SMILES Parse Error: extra open parentheses for input: 'CC(C)(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(NC(=O)C(Cc'
RDKit ERROR: [02:14:29] SMILES Parse Error: extra open parentheses for input: 'CC(C)(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)OC1COC2OCCC12)S(=O)(=O)c1ccc(NC(=O)C(O)C(O)C(O)C(O)C(O)CO'
RDKit ERROR: [02:14:29] SMILES Parse Error: extra open parentheses for input: 'CC(C)(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)C(CC(C)(C)C)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)OC1'
RDKit ERROR: [02:14:29] SMILES Parse Error: extra open parentheses for input: 'CC1(C)CCCC2(C)C(CCC1CCCCCCCCC(N)=O)CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

37

In [24]:
# Reload the previous generation's table file from disk.
master_table = pd.read_csv('./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) + '.csv',sep=',')
master_table.head()

Unnamed: 0,id,gen,smile,source,weight,score
0,AABL,10,O=C(NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O)C1CC...,generated,786.926,-17.9
1,AADG,9,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
2,AACA,10,O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=...,generated,838.062,-17.7
3,AABO,9,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6
4,AABK,10,O=C(NC1CC2CCC(C1)N2CCc1ccccn1)C1CC2CCC(C1)C2C(...,generated,853.077,-17.6


In [26]:
# Write the selected training SMILES for this generation, one per line.
with open('./generations/training/gen' + str(GLOBAL_GENERATION) + '_training.smi', 'w') as f:
    f.writelines("%s\n" % item for item in training_smiles)


## Retrain the network to create molecules more like those selected above

In [27]:
from lstm_chem.finetuner import LSTMChemFinetuner

In [28]:
# Point the config at the previous generation's fine-tuned weights and at the
# training SMILES file written for the current generation.
config = process_config('config/config.json')
config['model_weight_filename'] = './checkpoints/finetuned_gen' + str(GLOBAL_GENERATION-1) + '.hdf5'
config['finetune_data_filename'] = './generations/training/gen' + str(GLOBAL_GENERATION) + '_training.smi'
print(config)

batch_size: 512
checkpoint_dir: experiments/2020-10-13/LSTM_Chem/checkpoints/
checkpoint_mode: min
checkpoint_monitor: val_loss
checkpoint_save_best_only: false
checkpoint_save_weights_only: true
checkpoint_verbose: 1
config_file: config/config.json
data_filename: ./datasets/dataset_cleansed.smi
data_length: 0
exp_dir: experiments/2020-10-13/LSTM_Chem
exp_name: LSTM_Chem
finetune_batch_size: 1
finetune_data_filename: ./generations/training/gen6_training.smi
finetune_epochs: 5
model_arch_filename: ./config/model_arch.json
model_weight_filename: ./checkpoints/finetuned_gen5.hdf5
num_epochs: 42
optimizer: adam
sampling_temp: 0.75
seed: 71
smiles_max_length: 128
tensorboard_log_dir: experiments/2020-10-13/LSTM_Chem/logs/
tensorboard_write_graph: true
train_smi_max_len: 128
units: 256
validation_split: 0.1
verbose_training: true



In [30]:
# Build the model and data loader in fine-tune mode from the config, then run fine-tuning.
modeler = LSTMChem(config, session='finetune')
finetune_dl = DataLoader(config, data_type='finetune')

finetuner = LSTMChemFinetuner(modeler, finetune_dl)
finetuner.finetune()

Loading model architecture from ./config/model_arch.json ...


100%|██████████| 37/37 [00:00<00:00, 1986.70it/s]

Loading model checkpoint from ./checkpoints/finetuned_gen5.hdf5 ...
Loaded the Model.
loading SMILES...
done.
tokenizing SMILES...
done.
Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/5





Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe95c695790>

In [31]:
# Save the fine-tuned weights under the current generation's checkpoint name.
finetuner.model.save_weights('./checkpoints/finetuned_gen' + str(GLOBAL_GENERATION) + '.hdf5')

In [32]:
# Rebuild the generator from the weights file just saved for the current generation.
config['model_weight_filename'] = './checkpoints/finetuned_gen' + str(GLOBAL_GENERATION) + '.hdf5'
modeler = LSTMChem(config, session='generate')
generator = LSTMChemGenerator(modeler)
print(config)

Loading model architecture from ./config/model_arch.json ...
Loading model checkpoint from ./checkpoints/finetuned_gen6.hdf5 ...
Loaded the Model.
batch_size: 512
checkpoint_dir: experiments/2020-10-13/LSTM_Chem/checkpoints/
checkpoint_mode: min
checkpoint_monitor: val_loss
checkpoint_save_best_only: false
checkpoint_save_weights_only: true
checkpoint_verbose: 1
config_file: config/config.json
data_filename: ./datasets/dataset_cleansed.smi
data_length: 0
exp_dir: experiments/2020-10-13/LSTM_Chem
exp_name: LSTM_Chem
finetune_batch_size: 1
finetune_data_filename: ./generations/training/gen6_training.smi
finetune_epochs: 5
model_arch_filename: ./config/model_arch.json
model_weight_filename: ./checkpoints/finetuned_gen6.hdf5
num_epochs: 42
optimizer: adam
sampling_temp: 0.75
seed: 71
smiles_max_length: 128
tensorboard_log_dir: experiments/2020-10-13/LSTM_Chem/logs/
tensorboard_write_graph: true
train_smi_max_len: 128
units: 256
validation_split: 0.1
verbose_training: true



In [33]:
# Sample 5000 SMILES from the fine-tuned generator. sample_number is also the
# denominator of the validity percentage printed by the metrics cell below.
sample_number = 5000
sampled_smiles = generator.sample(num=sample_number)

  0%|          | 0/5000 [00:00<?, ?it/s]















  0%|          | 1/5000 [00:44<61:32:31, 44.32s/it]



  2%|▏         | 75/5000 [03:18<3:04:20,  2.25s/it]



  4%|▍         | 198/5000 [07:16<1:59:39,  1.50s/it]



 10%|▉         | 499/5000 [17:19<1:54:12,  1.52s/it]



100%|██████████| 5000/5000 [2:44:20<00:00,  1.97s/it]  


In [35]:
# Validity: share of the sampled SMILES that RDKit can parse.
parsed = (Chem.MolFromSmiles(smi) for smi in sampled_smiles)
valid_mols = [mol for mol in parsed if mol is not None]
print('Validity: ', f'{len(valid_mols) / sample_number:.2%}')

# Uniqueness: share of distinct strings among the valid SMILES written back out by RDKit.
valid_smiles = []
for mol in valid_mols:
    valid_smiles.append(Chem.MolToSmiles(mol))
print('Uniqueness: ', f'{len(set(valid_smiles)) / len(valid_smiles):.2%}')

# Originality: of the distinct valid SMILES, how many do not occur in the training data
import pandas as pd
training_data = pd.read_csv('./datasets/dataset_cleansed.smi', header=None)
training_set = set(training_data[0])
original = [smile for smile in set(valid_smiles) if smile not in training_set]
print('Originality: ', f'{len(set(original)) / len(set(valid_smiles)):.2%}')

RDKit ERROR: [05:16:00] SMILES Parse Error: extra close parentheses while parsing: O=C(NCC1(c2ccccc2)CC1)NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O
RDKit ERROR: [05:16:00] SMILES Parse Error: Failed parsing SMILES 'O=C(NCC1(c2ccccc2)CC1)NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O' for input: 'O=C(NCC1(c2ccccc2)CC1)NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CC(C1)C2C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O'
RDKit ERROR: [05:16:00] SMILES Parse Error: extra close parentheses while parsing: CC(C)(C)C1CCC(C2CCCCCC2(C)C)CC(C)(C)C)CCCC1(C)C
RDKit ERROR: [05:16:00] SMILES Parse Error: Failed parsing SMILES 'CC(C)(C)C1CCC(C2CCCCCC2(C)C)CC(C)(C)C)CCCC1(C)C' for input: 'CC(C)(C)C1CCC(C2CCCCCC2(C)C)CC(C)(C)C)CCCC1(C)C'
RDKit ERROR: [05:16:00] Explicit valence for atom # 22 O, 3, is greater than permitted
RDKit ERROR: [05:16:00] Explicit valence for atom # 22 O, 3, is greater than permitted
RDKit ERROR: [05:16:00] SMILES Parse Error

RDKit ERROR: [05:16:00] SMILES Parse Error: extra open parentheses for input: 'CC1(CC(CC(O)C(CO)O)CCCC2(C)CCCCCCCC12'
RDKit ERROR: [05:16:00] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:00] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:00] SMILES Parse Error: extra open parentheses for input: 'CC1(CC(C(C)(C)C)CCCC1(C)C'
RDKit ERROR: [05:16:00] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:00] SMILES Parse Error: extra open parentheses for input: 'CC(C)(C)C1CC(CCC2CCCCCCC(CCC3CCCC3)CCCCC2(C)C1CCC12CC3CC(CC(C3)C1)C2'
RDKit ERROR: [05:16:00] SMILES Parse Error: extra open parentheses for input: 'CC1(CC(CC(O)C(Cc2ccccc2)C(=O)O)CC1CCC12CC3CC(CC(C3)C1)C2'
RDKit ERROR: [05:16:00] Explicit valence for atom # 22 O, 3, is greater than permitted
RDKit ERROR: [05:16:00] SMILES Parse Error: extra close parentheses while parsing: O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CC2CCC(C1)N2CCc1ccccc1)C(=O

RDKit ERROR: [05:16:01] SMILES Parse Error: extra open parentheses for input: 'CC(C)(C)C1CCC(CCC2(C)C(CCC3CCCC3)CCCC12'
RDKit ERROR: [05:16:01] SMILES Parse Error: extra open parentheses for input: 'O=C(NC1CC2CCC(O1)C2C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O'
RDKit ERROR: [05:16:01] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:01] SMILES Parse Error: extra close parentheses while parsing: O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1COc2ccccc2)c(=O)c(CC(=O)NC(Cc2ccccc2)C(=O)O)cc1
RDKit ERROR: [05:16:01] SMILES Parse Error: Failed parsing SMILES 'O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1COc2ccccc2)c(=O)c(CC(=O)NC(Cc2ccccc2)C(=O)O)cc1' for input: 'O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1COc2ccccc2)c(=O)c(CC(=O)NC(Cc2ccccc2)C(=O)O)cc1'
RDKit ERROR: [05:16:01] SMILES Parse Error: extra open parentheses for input: 'CC1(CC(CC(O)CC(O)C(O)CO)CCCCC1'
RDKit ERROR: [05:16:01] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:01] Explici

RDKit ERROR: [05:16:01] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:01] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:01] Explicit valence for atom # 22 O, 3, is greater than permitted
RDKit ERROR: [05:16:01] SMILES Parse Error: extra open parentheses for input: 'CC(C)(C)C1CCC(C2CC(CCC3CCCC3)CCCC2(C)C1CCC12CC3CC(CC(C3)C1)C2'
RDKit ERROR: [05:16:01] SMILES Parse Error: extra open parentheses for input: 'CC1(CC(CC(O)C2(O)CCCC2)C(O)CO)CCCCCCCCCCCCCCCC1CC1(C(C)(C)C)CC(O)CC(C2CCCC3)CCCC2(C)C1CC(O)C(Cc1ccccc1)NC(=O)C1C2CC(NC(=O)C(Cc3c'
RDKit ERROR: [05:16:01] SMILES Parse Error: extra close parentheses while parsing: O=C(NC1CC2CCCCC2)CC1CC1(c2ccccc2)CC1)NC1CC2CCC(C1)N2CCc1ccccc1
RDKit ERROR: [05:16:01] SMILES Parse Error: Failed parsing SMILES 'O=C(NC1CC2CCCCC2)CC1CC1(c2ccccc2)CC1)NC1CC2CCC(C1)N2CCc1ccccc1' for input: 'O=C(NC1CC2CCCCC2)CC1CC1(c2ccccc2)CC1)NC1CC2CCC(C1)N2CCc1ccccc1'
RDKit ERROR: [05:16:01] Explicit va

RDKit ERROR: [05:16:02] SMILES Parse Error: extra close parentheses while parsing: CC(C)(C)C1CCC(C)C2CCC3CCCC3)CCCC12
RDKit ERROR: [05:16:02] SMILES Parse Error: Failed parsing SMILES 'CC(C)(C)C1CCC(C)C2CCC3CCCC3)CCCC12' for input: 'CC(C)(C)C1CCC(C)C2CCC3CCCC3)CCCC12'
RDKit ERROR: [05:16:02] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:02] Explicit valence for atom # 21 O, 3, is greater than permitted
RDKit ERROR: [05:16:02] SMILES Parse Error: ring closure 1 duplicates bond between atom 25 and atom 26 for input: 'O=C(NC1CC2CCC(C1)N2CCc1ccccc1)C1CO2CCC1C1(C1)C2C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)NC(Cc1ccccc1)C(=O)O'
RDKit ERROR: [05:16:02] SMILES Parse Error: ring closure 2 duplicates bond between atom 9 and atom 25 for input: 'CC(C)(C)NC(=O)C1CC2(CC3CCC(C1)N3CCc1ccccc1)C2'
RDKit ERROR: [05:16:02] SMILES Parse Error: extra open parentheses for input: 'CC(c1ccccc1)N(CC(O)C(Cc1ccccc1)NC(=O)C(Cc1ccccc1)NC(=O)C1C2CC(NC(=O)C(Cc3ccccc3)C(=O)NC(Cc3cc

Validity:  95.40%
Uniqueness:  6.06%
Originality:  100.00%


In [36]:
# Drop duplicate SMILES; only distinct molecules go forward to the next generation.
valid_smiles = list(set(valid_smiles))
len(valid_smiles)

289

In [37]:
# Take the valid smiles from above and run them through the process to add them
# to the tracking table and to generate the next PyRx testing data.
mols_for_next_generation = validate_mols(valid_smiles)

master_table = pd.read_csv('./generations/master_results_table_gen' + str(GLOBAL_GENERATION-1) +'.csv',sep=',')
new_mols_to_test = append_to_tracking_table(master_table,mols_for_next_generation, 'generated', GLOBAL_GENERATION)
mols_for_pd = new_mols_to_test[0]
mols_for_export = new_mols_to_test[1]

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# and works on older pandas versions too.
master_table = pd.concat([master_table, mols_for_pd])
master_table = master_table.reset_index(drop=True)
master_table.to_csv(r'./generations/master_results_table_gen' + str(GLOBAL_GENERATION) + '.csv', index=False)

idAAGOgen6
idAAGPgen6
idAAGQgen6
idAAGRgen6
idAAGSgen6
idAAGTgen6
idAAGUgen6
idAAGVgen6
idAAGWgen6
idAAGXgen6
idAAGYgen6
idAAGZgen6
idAAHAgen6
idAAHBgen6
idAAHCgen6
idAAHDgen6
idAAHEgen6
idAAHFgen6
idAAHGgen6
idAAHHgen6
idAAHIgen6
idAAHJgen6
idAAHKgen6
idAAHLgen6
idAAHMgen6
idAAHNgen6
idAAHOgen6
idAAHPgen6
idAAHQgen6
idAAHRgen6
idAAHSgen6
idAAHTgen6
idAAHUgen6
idAAHVgen6
idAAHWgen6
idAAHXgen6
idAAHYgen6
idAAHZgen6
idAAIAgen6
idAAIBgen6
idAAICgen6
idAAIDgen6
idAAIEgen6
idAAIFgen6
idAAIGgen6
idAAIHgen6
idAAIIgen6
idAAIJgen6
idAAIKgen6
idAAILgen6
idAAIMgen6
idAAINgen6
idAAIOgen6
idAAIPgen6
idAAIQgen6
idAAIRgen6
idAAISgen6
idAAITgen6
idAAIUgen6
idAAIVgen6
idAAIWgen6
idAAIXgen6
idAAIYgen6
idAAIZgen6
idAAJAgen6
idAAJBgen6
idAAJCgen6
idAAJDgen6
idAAJEgen6
idAAJFgen6
idAAJGgen6
idAAJHgen6
idAAJIgen6
idAAJJgen6
idAAJKgen6
idAAJLgen6
idAAJMgen6
idAAJNgen6
idAAJOgen6
idAAJPgen6
idAAJQgen6
idAAJRgen6
idAAJSgen6
idAAJTgen6
idAAJUgen6
idAAJVgen6
idAAJWgen6
idAAJXgen6
idAAJYgen6
idAAJZgen6
idAAKAgen6

In [38]:
# Show how many molecules are queued for export.
len(mols_for_export)

289

In [39]:
# Export the current generation's molecules to SDF, passing a batch size of 2000.
write_gen_to_sdf(mols_for_export, GLOBAL_GENERATION, 2000)
print('ok')

ok
