In [2]:
import pandas as pd
from synnet.config import DELIM
import pickle
import rdkit.Chem as Chem
from synnet.encoding.distances import _tanimoto_similarity, mol_fp
import numpy as np
from rdkit.Chem import AllChem, DataStructs
from rdkit.Contrib.SA_Score import sascorer
import rdkit.Chem as Chem

17:06:46 rdkit INFO: Enabling RDKit 2023.09.5 jupyter extensions


In [14]:


# Number of leading rows kept from each block of baseline results; also reused
# below as the per-target cap for our own method.
NUM_BEAMS = 5

# Load the baseline decoding results, keep only rows whose position within each
# consecutive block of 100 rows is below NUM_BEAMS, then drop rows that have no
# decoded molecule.
# NOTE(review): the block size of 100 is hard-coded while the file name says
# beam_width=30 -- confirm that rows really are laid out 100 per target.
df_baseline = pd.read_csv('SynthesisNet/results/baseline/chembl/decoded_results_analogs_beam_width=30.csv')
mask = (np.arange(len(df_baseline)) % 100) < NUM_BEAMS
df_baseline = df_baseline.iloc[mask, :].dropna(subset=['decoded'])

# df_ours = pd.read_csv('SynthesisNet/results/viz/reconstruct-ablation-train.csv')
# df_ours['decoded'] = df_ours['smiles']

# Parse our method's analog-search output. Each non-empty line holds three
# whitespace-separated fields: an integer index, a DELIM-joined record
# (target, best decoded SMILES, a third field that is not used here), and a
# similarity score.
with open('SynthesisNet/output_analog_top_k=3_max_num_rxns=4_max_rxns=-1_top_k_rxn=3_strategy=topological_progress.txt') as fh:
    # Context manager so the file handle is closed deterministically.
    lines = fh.readlines()
data = []
inds = []
for line in lines:
    if not line.strip():
        # Tolerate blank / trailing lines instead of failing on tuple unpacking.
        continue
    ind, r, sim = line.split()
    target, best_smi, _index = r.split(DELIM)
    data.append({'target': target, 'sim': float(sim), 'decoded': best_smi})
    inds.append(int(ind))
# Restore index order. A stable sort keeps the file order among records that
# share an index, which matters because only the first NUM_BEAMS rows per target
# are kept below.
data = [data[pos] for pos in np.argsort(inds, kind='stable')]
# Keep at most NUM_BEAMS rows per target, preserving first-seen target order.
df_ours = (
    pd.DataFrame(data)
    .groupby('target', sort=False)
    .apply(lambda x: x.iloc[:NUM_BEAMS, :])
    .reset_index(drop=True)
)

# Parse the MCMC analog-search output. Each non-empty line holds at least five
# whitespace-separated fields: a run index, a DELIM-joined (target SMILES, id)
# pair, and three comma-separated histories (scores, SMILES, integer ids).
# Any further fields are ignored.
with open('SynthesisNet/output_mcmc_analog_top_k=3_top_k_rxn=3_max_rxns=-1_max_num_rxns=3_strategy=topological_progress.txt') as fh:
    # Context manager so the file handle is closed deterministically.
    lines = fh.readlines()
data = []
inds = []
for line in lines:
    if not line.strip():
        # Tolerate blank / trailing lines instead of failing on tuple unpacking.
        continue
    index, res, scores, smis, ins, *pargs = line.split()
    smiles, ind = res.split(DELIM)
    # These three names are left at notebook scope on purpose: a later cell
    # displays `score_history` from the last parsed line.
    score_history = list(map(float, scores.split(',')))
    smiles_history = list(map(str, smis.split(',')))
    inds_history = list(map(int, ins.split(',')))  # parsed (validates ints) but not used below
    # One row per visited molecule; zip stops at the shorter of the two histories.
    for smi, score in zip(smiles_history, score_history):
        data.append({'decoded': smi, 'targets': smiles, 'sim': score})
        inds.append(int(index))
# Every history entry of a run shares that run's index, so ties are the norm
# here: a stable sort is required to keep each run's history in file order.
data = [data[pos] for pos in np.argsort(inds, kind='stable')]
# df_mcmc = pd.DataFrame(data).groupby('targets', sort=False).apply(lambda x: x.iloc[:NUM_BEAMS,:]).reset_index(drop=True)
df_mcmc = pd.DataFrame(data)

# lines = open('SynthesisNet/output_analog_top_k=3_max_num_rxns=4_max_rxns=-1_top_k_rxn=3_strategy=bottom_up_topological_train.txt').readlines()
# data = []
# for line in lines:
#     _, r, sim = line.split()
#     (target, best_smi, index) = r.split(DELIM)
#     data.append({'target': target, 'sim': float(sim), 'decoded': best_smi})
# df_ours = pd.DataFrame(data)

# Load the Uni-RXN baseline: decoded molecules come from an SDF file and are
# paired positionally with the first len(decoded) target SMILES.
# NOTE(review): this assumes the SDF records are in the same order as the
# targets file and that every record parses -- confirm. A record that fails to
# parse would make MolToSmiles raise.
mols = Chem.SDMolSupplier('SynthesisNet/results/baseline/chembl/results_fix_1.sdf')
with open('SynthesisNet/data/assets/molecules/chembl_34_1000.txt') as fh:
    # Strip the trailing newline that readlines() would keep, so the `targets`
    # column holds clean SMILES like the other result frames. Blank lines are
    # kept (as '') so positional alignment with the SDF is unchanged.
    targets = [line.strip() for line in fh]
decoded = [Chem.MolToSmiles(mol) for mol in mols]
targets = targets[:len(decoded)]
df_unirxn = pd.DataFrame({'decoded': decoded, 'targets': targets})
# Similarity between each target and its decoded molecule.
# NOTE(review): the positional arguments 2 and 4096 are presumably fingerprint
# radius and bit length -- confirm against synnet's mol_fp.
sims = [_tanimoto_similarity(mol_fp(target, 2, 4096), mol_fp(decode, 2, 4096)) for decode, target in zip(decoded, targets)]
df_unirxn['sim'] = sims



In [4]:
# Quick inspection: (rows, columns) of the flattened MCMC history table.
df_mcmc.shape

(692, 3)

In [6]:


def compute_diversity(mol_list):
    """Return the internal diversity of a collection of SMILES strings.

    Diversity is ``1 - (mean pairwise Tanimoto similarity)`` taken over all
    unordered pairs, using Morgan bit-vector fingerprints built with radius 3
    and 2048 bits.

    Args:
        mol_list: Iterable of SMILES strings (a list, or the pandas Series that
            ``groupby(...).agg`` passes in).

    Returns:
        float: ``0.0`` when fewer than two molecules are given (there are no
        pairs to compare), otherwise the diversity value.

    Note:
        Every SMILES is expected to parse; an entry that RDKit cannot turn into
        a molecule will make the fingerprint call fail.
    """
    smiles = list(mol_list)
    n = len(smiles)
    # With zero or one molecule there are no pairs; the previous len == 1 check
    # let empty input fall through to a division by zero.
    if n < 2:
        return 0.
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 3, 2048) for mol in mols]
    # Sum similarity over each unordered pair exactly once: molecule i is
    # compared against all molecules before it.
    similarity = 0
    for i in range(1, n):
        similarity += sum(DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i]))
    n_pairs = n * (n - 1) / 2
    diversity = 1 - similarity / n_pairs
    return diversity

# first step is group all analogs by target
# take the top K via similarity



# For each top-K size, keep the K most similar distinct analogs per target for
# every method, score them, aggregate per target, and print the mean of each
# per-target metric.
# NOTE(review): `result_baseline_` (trailing underscore) is read below but is not
# assigned in any visible cell, so this loop raises NameError on a fresh kernel
# unless it is defined elsewhere -- confirm where it comes from.
for NUM_ANALOGS in [1,3,5]:
    # Per target: drop duplicate decoded SMILES, then keep the NUM_ANALOGS rows
    # with the highest similarity. The baseline frame uses the column name
    # 'similarity' and ours groups by 'target' (singular); the others use
    # 'sim' / 'targets'.
    result_baseline = df_baseline.groupby('targets').apply(lambda x: x.drop_duplicates(subset='decoded').nlargest(NUM_ANALOGS, 'similarity')).reset_index(drop=True)
    result_ours = df_ours.groupby('target').apply(lambda x: x.drop_duplicates(subset='decoded').nlargest(NUM_ANALOGS, 'sim')).reset_index(drop=True)
    result_unirxn = df_unirxn.groupby('targets').apply(lambda x: x.drop_duplicates(subset='decoded').nlargest(NUM_ANALOGS, 'sim')).reset_index(drop=True)
    result_mcmc = df_mcmc.groupby('targets').apply(lambda x: x.drop_duplicates(subset='decoded').nlargest(NUM_ANALOGS, 'sim')).reset_index(drop=True)

    # result_baseline = result_baseline.groupby('targets', sort=False).apply(lambda x: compute_diversity(x)).reset_index(drop=True)

    # Synthetic-accessibility score of every kept decoded molecule.
    result_baseline['sa_score'] = [sascorer.calculateScore(Chem.MolFromSmiles(smi)) for smi in result_baseline['decoded']]
    result_baseline_['sa_score'] = [sascorer.calculateScore(Chem.MolFromSmiles(smi)) for smi in result_baseline_['decoded']]
    result_ours['sa_score'] = [sascorer.calculateScore(Chem.MolFromSmiles(smi)) for smi in result_ours['decoded']]
    result_unirxn['sa_score'] = [sascorer.calculateScore(Chem.MolFromSmiles(smi)) for smi in result_unirxn['decoded']]
    result_mcmc['sa_score'] = [sascorer.calculateScore(Chem.MolFromSmiles(smi)) for smi in result_mcmc['decoded']]

    # Mean similarity per target.
    aggr_baseline_sim = result_baseline.groupby('targets', sort=False).agg(avg_sim=('similarity', 'mean'))
    aggr_baseline__sim = result_baseline_.groupby('targets', sort=False).agg(avg_sim=('similarity', 'mean'))
    aggr_ours_sim = result_ours.groupby('target', sort=False).agg(avg_sim=('sim', 'mean'))
    aggr_unirxn_sim = result_unirxn.groupby('targets', sort=False).agg(avg_sim=('sim', 'mean'))
    aggr_mcmc_sim = result_mcmc.groupby('targets', sort=False).agg(avg_sim=('sim', 'mean'))

    # Best (max) similarity per target. The output column is still named
    # 'avg_sim' even though it holds the maximum.
    aggr_baseline_sim_max = result_baseline.groupby('targets', sort=False).agg(avg_sim=('similarity', 'max'))
    aggr_baseline__sim_max = result_baseline_.groupby('targets', sort=False).agg(avg_sim=('similarity', 'max'))
    aggr_ours_sim_max = result_ours.groupby('target', sort=False).agg(avg_sim=('sim', 'max'))
    aggr_unirxn_sim_max = result_unirxn.groupby('targets', sort=False).agg(avg_sim=('sim', 'max'))
    aggr_mcmc_sim_max = result_mcmc.groupby('targets', sort=False).agg(avg_sim=('sim', 'max'))

    # Mean SA score per target.
    aggr_baseline_sa = result_baseline.groupby('targets', sort=False).agg(sa_score=('sa_score', 'mean'))
    aggr_baseline__sa = result_baseline_.groupby('targets', sort=False).agg(sa_score=('sa_score', 'mean'))
    aggr_ours_sa = result_ours.groupby('target', sort=False).agg(sa_score=('sa_score', 'mean'))
    aggr_unirxn_sa = result_unirxn.groupby('targets', sort=False).agg(sa_score=('sa_score', 'mean'))
    aggr_mcmc_sa = result_mcmc.groupby('targets', sort=False).agg(sa_score=('sa_score', 'mean'))

    # Diversity of the kept analogs per target (compute_diversity receives the
    # group's 'decoded' column). No diversity is computed for unirxn here.
    aggr_baseline_diversity = result_baseline.groupby('targets', sort=False).agg(diversity=('decoded', compute_diversity))
    aggr_baseline__diversity = result_baseline_.groupby('targets', sort=False).agg(diversity=('decoded', compute_diversity))
    aggr_ours_diversity = result_ours.groupby('target', sort=False).agg(diversity=('decoded', compute_diversity))
    aggr_mcmc_diversity = result_mcmc.groupby('targets', sort=False).agg(diversity=('decoded', compute_diversity))

    # A target counts as recovered when its best similarity is exactly 1.
    aggr_baseline_recover = aggr_baseline_sim_max == 1.
    aggr_baseline__recover = aggr_baseline__sim_max == 1.
    aggr_ours_recover = aggr_ours_sim_max == 1.
    aggr_unirxn_recover = aggr_unirxn_sim_max == 1.
    aggr_mcmc_recover = aggr_mcmc_sim_max == 1.

    # `res` and `metrics` are matched purely by position; the assert below only
    # checks that their lengths agree, not that the order lines up.
    res = aggr_ours_recover.mean(), aggr_baseline_recover.mean(), aggr_baseline__recover.mean(), aggr_unirxn_recover.mean(), aggr_mcmc_recover.mean(), aggr_ours_sim.mean(), aggr_baseline_sim.mean(), aggr_baseline__sim.mean(), aggr_unirxn_sim.mean(), aggr_mcmc_sim.mean(), aggr_ours_diversity.mean(), aggr_baseline_diversity.mean(), aggr_baseline__diversity.mean(), aggr_mcmc_diversity.mean(), aggr_ours_sa.mean(), aggr_baseline_sa.mean(), aggr_baseline__sa.mean(), aggr_unirxn_sa.mean(), aggr_mcmc_sa.mean()
    metrics = ['recover_ours', 'recover_baseline',  'recover_baseline_', 'recover_unirxn', 'recover_mcmc',
                'sim_ours', 'sim_baseline', 'sim_baseline_', 'sim_unirxn', 'sim_mcmc',
                'diversity_ours', 'diversity_baseline', 'diversity_baseline_', 'diversity_mcmc',
                'sa_ours', 'sa_baseline', 'sa_baseline_', 'sa_unirxn', 'sa_mcmc']
    assert len(metrics) == len(res)
    # Each `r` is the mean of a one-column frame, i.e. a one-element Series, so
    # the printed text includes the column name and a dtype line.
    for metric, r in zip(metrics, res):
        print(f"{NUM_ANALOGS} {metric} {r}")

1 recover_ours avg_sim    0.929577
dtype: float64
1 recover_baseline avg_sim    0.049
dtype: float64
1 recover_baseline_ avg_sim    0.46339
dtype: float64
1 recover_unirxn avg_sim    0.001227
dtype: float64
1 recover_mcmc avg_sim    0.092308
dtype: float64
1 sim_ours avg_sim    0.7988
dtype: float64
1 sim_ours avg_sim    0.978279
dtype: float64
1 sim_baseline_ avg_sim    0.765932
dtype: float64
1 sim_unirxn avg_sim    0.103975
dtype: float64
1 sim_mcmc avg_sim    0.531612
dtype: float64
1 diversity_ours diversity    0.0
dtype: float64
1 diversity_baseline diversity    0.0
dtype: float64
1 diversity_mcmc diversity    0.0
dtype: float64
1 sa_ours sa_score    3.030975
dtype: float64
1 sa_baseline sa_score    2.690997
dtype: float64
1 sa_unirxn sa_score    3.018992
dtype: float64
1 sa_mcmc sa_score    2.509575
dtype: float64
3 recover_ours avg_sim    0.929577
dtype: float64
3 recover_baseline avg_sim    0.072
dtype: float64
1 sa_ours sa_score    3.07491
dtype: float64
1 sa_baseline sa_scor

In [102]:
# Debug probe: print the first decoded entry in `result_baseline_` for which SA
# scoring raises, then stop. The recorded output of this cell is `nan`, i.e. a
# missing decoded value.
# NOTE(review): `result_baseline_` is not assigned in any visible cell -- this
# relies on kernel state defined elsewhere.
for smi in list(result_baseline_['decoded']):
    try:
        score = sascorer.calculateScore(Chem.MolFromSmiles(smi))
    except Exception:
        # Catch ordinary errors only, so Ctrl-C / kernel interrupt still works
        # (a bare `except:` would swallow KeyboardInterrupt too).
        print(smi)
        break
    # print(score)

nan


In [6]:
# Quick inspection: (rows, columns) of `result_ours` as left by the last
# iteration of the metrics loop.
result_ours.shape

(7270, 4)

In [136]:

# Export the building-block SMILES as a pickle for Uni-RXN; the same list is
# stored under both the 'reactant' and the 'reagent' key.
bb_file = 'SynthesisNet/data/assets/building-blocks/enamine_us_matched.csv'
reactants = pd.read_csv(bb_file)['SMILES']
dic = {'reactant': list(reactants), 'reagent': list(reactants)}
# Context manager so the pickle is flushed and the handle closed before the
# cell finishes, rather than whenever the file object is garbage collected.
with open('Uni-RXN-official/dataset/data/enamine_react_lib_smi.pkl', 'wb') as fh:
    pickle.dump(dic, fh)


In [30]:
# Display `score_history`. This name is only assigned inside the MCMC parsing
# loop, so what is shown is the score list of the last line that loop parsed.
score_history

[0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.38461538461538464,
 0.27941176470588236,
 0.38461538461538464,
 0.38461538461538464,
 0.509090909090909,
 0.38461538461538464,
 0.509090909090909,
 0.509090909090909,
 0.35294117647058826,
 0.2602739726027397,
 0.509090909090909,
 0.2602739726027397,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.509090909090909,
 0.21333333333333335,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.1917808219178082,
 0.26666666666666666,
 0.26666666666666666,
 0.26666666666666666,
 0.26666666666666666,
 0.509090909090909,
 0.26666666666666666,
 0.26666666666666666,
 0.26666666666666666,
 0.509090909090909,
 0.26666666666666666,
 0.2372881355932203

In [12]:

# Export the building-block SMILES as a pickle for Uni-RXN; the same list is
# stored under both the 'reactant' and the 'reagent' key.
# NOTE(review): this cell repeats an identical export cell found earlier in the
# notebook; one of the two is probably redundant.
bb_file = 'SynthesisNet/data/assets/building-blocks/enamine_us_matched.csv'
reactants = pd.read_csv(bb_file)['SMILES']
dic = {'reactant': list(reactants), 'reagent': list(reactants)}
# Context manager so the pickle is flushed and the handle closed before the
# cell finishes, rather than whenever the file object is garbage collected.
with open('Uni-RXN-official/dataset/data/enamine_react_lib_smi.pkl', 'wb') as fh:
    pickle.dump(dic, fh)


In [6]:
import pickle


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    Only use this on files you produced yourself: unpickling can execute
    arbitrary code.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Skeleton objects: the full set plus the train / valid / test splits.
skeletons = _load_pickle('SynthesisNet/results/viz/skeletons.pkl')
train = _load_pickle('SynthesisNet/results/viz/skeletons-train.pkl')
valid = _load_pickle('SynthesisNet/results/viz/skeletons-valid.pkl')
test = _load_pickle('SynthesisNet/results/viz/skeletons-test.pkl')