In [None]:
import sqlite3
import numpy as np
import pandas as pd
import mols2grid
import ipywidgets as widgets
from IPython.display import display, HTML

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw, QED, Descriptors, Lipinski

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Database selection.
# `db_file` is the path of the `common.sqlite3` file that every query in this
# notebook reads. Paths used for earlier runs are kept below (commented out)
# for reference; exactly one `db_file` assignment should be active.
# db_file = '/raid/drugdiscovery/cheminformatics/vs_screening/9a70a0d6-8ece-11ec-aa14-c7359b90650f/common.sqlite3'
# db_file = '/raid/drugdiscovery/cheminformatics/vs_screening/519f4546-8f51-11ec-aa14-c7359b90650f/common.sqlite3'
# db_file = '/raid/drugdiscovery/cheminformatics/vs_screening/7693847c-8fa4-11ec-aa14-c7359b90650f/common.sqlite3'
# db_file = '/content/50f1f46c-915f-11ec-aa14-c7359b90650f/common.sqlite3'

# Regression includes data from all generations.
# db_file = '/content/152ca056-9238-11ec-aa14-c7359b90650f/common.sqlite3'

# Regression includes data from the previous three generations.
# db_file = '/content/59ce114a-92ce-11ec-aa14-c7359b90650f/common.sqlite3'

# db_file = '/content/1e988488-934c-11ec-aa14-c7359b90650f/common.sqlite3'
# db_file = '/content/1d108578-9e84-11ec-83ca-7de881115940/common.sqlite3'
# db_file = '/content/6e6e2b68-9ef8-11ec-83ca-7de881115940/common.sqlite3'

# RBF runs (presumably an RBF-based regression model -- TODO confirm).
# db_file = '/content/74077332-a825-11ec-aa14-c7359b90650f/common.sqlite3'
db_file = '/content/7251bc2e-a8ba-11ec-aa14-c7359b90650f/common.sqlite3'
# `uri=True` makes sqlite3 interpret `db_file` as a URI; a plain filesystem path
# (as used here) is still accepted.
# NOTE(review): sqlite3.connect creates an empty database when the file does not
# exist, so a wrong path only surfaces later as a "no such table" error.
conn = sqlite3.connect(db_file, uri=True)

In [None]:
# Load every scored generated molecule together with the input molecule
# (and its generation) it was generated from.
df = pd.read_sql('''
                 SELECT generated_smiles.id as id,
                        generated_smiles.input_id as input_id,
                        generated_smiles.smiles as smiles,
                        smiles.smiles as input_smiles,
                        smiles.generation as generation,
                        generated_smiles.score as score
                 FROM smiles, generated_smiles
                 WHERE smiles.id = generated_smiles.input_id
                    AND score is not null
                 ''',
                 con=conn)

# Sorted so that generations[0] is the earliest and generations[-1] the latest
# generation, independent of the row order SQLite happens to return.
generations = np.sort(df['generation'].unique())

# The reference molecule and its score are read from the first generation-0 row.
# NOTE(review): this presumes that row describes the original input molecule -- confirm.
seed_rows = df[df['generation'] == 0]
if seed_rows.empty:
    raise ValueError('No scored generation-0 rows found; cannot determine the input molecule')
input_smiles = seed_rows['input_smiles'].values[0]
input_score = seed_rows['score'].values[0]

# Descriptor columns added to df, in the order they are appended.
PROPERTY_COLUMNS = ['Molecular Weight', 'LogP', 'H-Bond Donors',
                    'H-Bond Acceptors', 'Rotatable Bonds', 'QED']


def _rdkit_properties(mol):
    """Return the RDKit descriptors of `mol` keyed by DataFrame column name.

    `mol` is an RDKit molecule or None; for None every descriptor is NaN so a
    single unparsable SMILES does not abort the whole notebook.
    """
    if mol is None:
        return dict.fromkeys(PROPERTY_COLUMNS, np.nan)
    return {
        'Molecular Weight': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'H-Bond Donors': Lipinski.NumHDonors(mol),
        'H-Bond Acceptors': Lipinski.NumHAcceptors(mol),
        'Rotatable Bonds': Lipinski.NumRotatableBonds(mol),
        'QED': QED.qed(mol),
    }


# Compute properties using RDKit (column looked up by name, not by position).
property_rows = []
for smiles in df['smiles']:
    m = Chem.MolFromSmiles(smiles)  # None when RDKit cannot parse the SMILES
    property_rows.append(_rdkit_properties(m))

properties = pd.DataFrame(property_rows, index=df.index, columns=PROPERTY_COLUMNS)
for column in PROPERTY_COLUMNS:
    df[column] = properties[column]

# Copy of the 'smiles' column, kept so existing consumers of df['test'] keep working.
df['test'] = df['smiles']

n_invalid = int(properties['QED'].isna().sum())
if n_invalid:
    print(f'Warning: {n_invalid} generated SMILES could not be parsed by RDKit')

print('Input Smile:', input_smiles)
print('Score:', input_score)

In [None]:
# Descriptors of the input molecule; these are drawn as the blue reference
# lines in the per-generation plots.
mol = Chem.MolFromSmiles(input_smiles)
if mol is None:
    raise ValueError(f'RDKit could not parse the input SMILES: {input_smiles!r}')

i_mol_logp = Descriptors.MolLogP(mol)
i_mol_wt = Descriptors.MolWt(mol)
i_hdonors = Lipinski.NumHDonors(mol)
i_hacceptors = Lipinski.NumHAcceptors(mol)
i_rotatable_bonds = Lipinski.NumRotatableBonds(mol)
# QED of the input molecule itself (previously computed on `m`, the last
# generated molecule left over from the descriptor loop).
i_qeds = QED.qed(mol)

print('Input Molecule')
Draw.MolToImage(mol, includeAtomNumbers=True)

In [None]:
# zinc15_smiles = Chem.MolFromSmiles('O=C(/C=C(/Cc1cc(F)c(F)cc1F)N[C@@H](CC(=O)N1CCn2c(nnc2C(F)(F)F)C1)Cc1cc(F)c(F)cc1F)N1CCn2c(nnc2C(F)(F)F)C1')
# Draw.MolToImage(zinc15_smiles, includeAtomNumbers=True)

In [None]:
# One row of subplots per generation, one column per property. Within a row the
# molecules are ordered by ascending score.
plot_cols = ['score', 'LogP', 'Molecular Weight', 'H-Bond Donors', 'H-Bond Acceptors', 'Rotatable Bonds', 'QED']

# Reference ceilings (red dashed lines), aligned element-wise with plot_cols.
# Lipinski's rule of five: LogP <= 5, MW <= 500, H-bond donors <= 5,
# H-bond acceptors <= 10 (the donor/acceptor limits were previously swapped).
# NOTE(review): the rotatable-bond limit of 3 and the QED ceiling of 1 are not
# part of Lipinski's rule -- confirm these thresholds are the intended ones.
lipinski_rule_max = [input_score, 5, 500, 5, 10, 3, 1]

# Values of the input molecule (blue dashed lines), aligned with plot_cols.
input_mol_values = [input_score, i_mol_logp, i_mol_wt, i_hdonors, i_hacceptors, i_rotatable_bonds, i_qeds]

cols = len(plot_cols)
rows = len(generations)

fig = plt.figure(figsize=(28, 4 * rows))
# squeeze=False keeps axs two-dimensional even when there is a single generation.
axs = fig.subplots(rows, cols, squeeze=False)

# Rows are addressed by position, not by generation value, so non-contiguous
# generation numbers do not index outside the grid.
for row, gen in enumerate(sorted(generations)):
    df_gen = (df[df['generation'] == gen]
              .sort_values('score', ascending=True)
              .reset_index(drop=True))
    for k, col_name in enumerate(plot_cols):
        ax = axs[row, k]
        ax.plot(df_gen[col_name])
        ax.set_title(col_name)
        ax.axhline(y=lipinski_rule_max[k], color='red', linestyle='--')
        ax.axhline(y=input_mol_values[k], color='blue', linestyle='--')
    axs[row, 0].set_ylabel(f'Generation {gen}')
plt.show()

In [None]:
 def show_molgrid(generation):
    """Show a molecule grid and property plots for one generation.

    Parameters
    ----------
    generation : int or 'Select'
        Generation to display. The placeholder value 'Select' (the dropdown's
        initial entry) renders nothing.

    Displays a mols2grid grid of the generation's molecules sorted by score,
    followed by one plot per property in ``plot_cols[1:-1]`` with the rule
    ceiling in red and the input molecule's value in blue.
    """
    if generation == 'Select':
        return

    # Skip the first ('score') and last ('QED') entries. All three lists are
    # sliced identically so index k refers to the same property in each of
    # them (the input values were previously read unsliced, i.e. shifted by one).
    pcols = plot_cols[1:-1]
    rule_max = lipinski_rule_max[1:-1]
    input_values = input_mol_values[1:-1]

    df_gen = df[df['generation'] == generation]

    grid = mols2grid.MolGrid(df_gen,
                             smiles_col='smiles',
                             size=(250, 200))
    grid = grid.display(n_cols=2, n_rows=2,
                        subset=['score', 'img', 'smiles'],
                        tooltip=['smiles'],
                        sort_by='score')
    display(grid)

    # Plot the properties against the molecules ordered by ascending score.
    df_sorted = df_gen.sort_values('score', ascending=True).reset_index(drop=True)

    fig = plt.figure(figsize=(28, 4))
    axs = fig.subplots(1, len(pcols))
    for k, col_name in enumerate(pcols):
        ax = axs[k]
        ax.plot(df_sorted[col_name])
        ax.set_title(col_name)
        ax.axhline(y=rule_max[k], color='red', linestyle='--')
        ax.axhline(y=input_values[k], color='blue', linestyle='--')
    plt.show()
    
def selection():
    """Build a generation picker wired to `show_molgrid`.

    Returns
    -------
    tuple
        ``(box, dropdown)``: a VBox (50% width) holding the children of the
        interactive widget, and the Dropdown that drives it. The dropdown
        starts on the placeholder entry 'Select'.
    """
    options = ['Select'] + generations.tolist()
    dropdown = widgets.Dropdown(options=options,
                                value='Select',
                                description='Select Generation')

    # `interactive` re-runs show_molgrid whenever the dropdown value changes.
    interactive_grid = widgets.interactive(show_molgrid, generation=dropdown)
    box = widgets.VBox(children=interactive_grid.children,
                       layout=widgets.Layout(width='50%'))
    return box, dropdown
    
# Display two grids side by side
first = selection() 
second = selection()
second[1].value = generations[-1]
first[1].value = 0
display(widgets.HBox([first[0], second[0]]))

In [None]:
# Per-generation score statistics over generated molecules that differ from
# the molecule they were generated from.
df_stats = pd.read_sql('''
                       SELECT min(score) min_score, max(score) max_score, avg(score) avg_score, smiles.generation as generation
                       FROM smiles, generated_smiles
                       WHERE smiles.id = generated_smiles.input_id
                           AND smiles.smiles <> generated_smiles.smiles
                           AND score is not null
                       GROUP BY smiles.generation
                       ORDER BY smiles.generation
                       ''',
                       con=conn)

# Add the input SMILES as pseudo-generation -1 so it appears as the left-most point.
df_stats.loc[-1] = [input_score, input_score, input_score, -1]
df_stats = df_stats.sort_values('generation')

# Plot against the actual generation numbers rather than the row index, so the
# x-axis stays correct when generations are missing or not returned in order.
generation_axis = df_stats['generation'].astype(float).to_numpy()
min_scores = df_stats['min_score'].astype(float).to_numpy()

fig = plt.figure()
sub_plt = fig.add_subplot(111)
sub_plt.set_xlabel('Generation')
sub_plt.set_ylabel('Docking Score')

# Min-Score line
sub_plt.plot(generation_axis, min_scores)

# Trend line (a degree-1 fit needs at least two points)
if len(generation_axis) >= 2:
    z = np.polyfit(generation_axis, min_scores, 1)
    p = np.poly1d(z)
    sub_plt.plot(generation_axis, p(generation_axis), "--")

plt.savefig('score.png', dpi=150)
plt.show()

df_stats

In [None]:
# The five scored molecules with the lowest score values, with the generation
# of the molecule each was generated from.
top_scores_sql = '''
    SELECT gs.id, gs.score, s.generation
    FROM smiles s, generated_smiles gs
    WHERE s.id = gs.input_id
        AND gs.score is not null
    ORDER BY score
    LIMIT 5
    '''
pd.read_sql(sql=top_scores_sql, con=conn)

In [None]:
# Number of generated molecules without a score, per generation.
# Generations in which every molecule is scored do not appear in the result.
unscored_sql = '''
    SELECT s.generation, count(*)
    FROM smiles s, generated_smiles gs
    WHERE  s.id = gs.input_id
        AND gs.score is null
    GROUP BY s.generation
    '''
unscored_counts = pd.read_sql(sql=unscored_sql, con=conn)
unscored_counts.replace(np.nan, 0)

In [None]:
# Total and unscored molecule counts per generation.
# The LEFT JOIN keeps generations that have no unscored molecules; COALESCE
# reports them with unscored = 0. (The `unscored` column of the sub-query was
# previously joined but never selected, so only the totals were shown.)
pd.read_sql(
    '''
    SELECT s.generation AS generation,
           count(*) AS total,
           COALESCE(unscored_query.unscored, 0) AS unscored
    FROM smiles s JOIN generated_smiles gs ON s.id = gs.input_id
        LEFT JOIN (SELECT s.generation, count(*) unscored
                   FROM smiles s, generated_smiles gs
                   WHERE s.id = gs.input_id
                       AND gs.score is null
                   GROUP BY s.generation) AS unscored_query
        ON unscored_query.generation = s.generation
    GROUP BY s.generation, unscored_query.unscored
    ORDER BY s.generation
    ''', con=conn)

In [None]:
# Number of generated molecules (scored or not) per generation.
per_generation_sql = '''
    SELECT s.generation, count(*)
    FROM smiles s join generated_smiles gs 
        ON s.id = gs.input_id
    GROUP BY s.generation
    '''
per_generation_counts = pd.read_sql(sql=per_generation_sql, con=conn)
per_generation_counts.replace(np.nan, 0)

In [None]:
# Rows of the `stats` table with stat_type 'r2', shown as (gen, r2).
r2_sql = '''
    SELECT stat_key as gen, value as r2
    FROM stats
    WHERE stat_type = 'r2'
    '''
pd.read_sql(sql=r2_sql, con=conn)

In [None]:
# Rows of the `stats` table with stat_type 'mse', shown as (gen, mse).
mse_sql = '''
    SELECT stat_key as gen, value as mse
    FROM stats
    WHERE stat_type = 'mse'
    '''
pd.read_sql(sql=mse_sql, con=conn)

In [None]:
# Full contents of the `smiles` table, with missing values shown as 0.
smiles_table_sql = '''
    SELECT *
    FROM smiles
    '''
smiles_table = pd.read_sql(sql=smiles_table_sql, con=conn)
smiles_table.replace(np.nan, 0)