In [2]:
import pandas as pd
from synnet.config import DELIM
import pickle
import rdkit.Chem as Chem
from synnet.encoding.distances import _tanimoto_similarity, mol_fp
import numpy as np
from rdkit.Chem import AllChem, DataStructs
from rdkit.Contrib.SA_Score import sascorer
import rdkit.Chem as Chem

23:08:38 rdkit INFO: Enabling RDKit 2023.09.5 jupyter extensions


In [3]:


# --- Baseline results --------------------------------------------------------
df_baseline = pd.read_csv('/home/msun415/SynTreeNet/results/baseline/chembl/decoded_results_analogs_beam_width=30.csv')
# Keep only rows whose position inside each consecutive block of 100 rows is < 30.
# NOTE(review): presumably the CSV stores 100 candidates per target and only the
# first 30 are wanted -- confirm against the script that produced the file.
mask = np.arange(len(df_baseline))
mask = (mask % 100 < 30)
df_baseline = df_baseline.iloc[mask, :]
# Drop rows that have no decoded molecule.
df_baseline = df_baseline.dropna(subset=['decoded'])

# --- Our results --------------------------------------------------------------
# Every line holds three whitespace-separated fields; the middle one packs
# "<target><DELIM><best decoded SMILES><DELIM><index>" and the last one is the
# similarity score.
with open('/home/msun415/SynTreeNet/output_analog_top_k=3_max_num_rxns=3_max_rxns=-1_top_k_rxn=3_strategy=topological.txt') as f:
    lines = f.readlines()
data = []
for line in lines:
    _, r, sim = line.split()
    (target, best_smi, index) = r.split(DELIM)
    data.append({'target': target, 'sim': float(sim), 'decoded': best_smi})
df_ours = pd.DataFrame(data)

# --- Uni-RXN results ----------------------------------------------------------
mols = Chem.SDMolSupplier('/home/msun415/SynTreeNet/results/baseline/chembl/results_fix_1.sdf')
with open('/home/msun415/SynTreeNet/data/assets/molecules/chembl_34_1000.txt') as f:
    # Strip the trailing newline so the stored target SMILES are clean strings.
    # Lines are NOT filtered, so positions stay aligned with the SDF records.
    targets = [line.strip() for line in f]
records = list(mols)
if len(records) > len(targets):
    raise ValueError(
        f"SDF has {len(records)} records but only {len(targets)} targets were read"
    )
# The i-th SDF record is paired with the i-th target (extra targets are ignored).
# SDMolSupplier yields None for records it cannot parse; such records are skipped
# together with their target so the remaining pairs stay aligned.
pairs = [
    (Chem.MolToSmiles(mol), tgt)
    for mol, tgt in zip(records, targets)
    if mol is not None
]
decoded = [smi for smi, _ in pairs]
targets = [tgt for _, tgt in pairs]
df_unirxn = pd.DataFrame({'decoded': decoded, 'targets': targets})
# Tanimoto similarity between target and decoded fingerprints, both built with
# mol_fp(smiles, 2, 4096).
sims = [_tanimoto_similarity(mol_fp(target, 2, 4096), mol_fp(decode, 2, 4096)) for decode, target in zip(decoded, targets)]
df_unirxn['sim'] = sims



In [None]:
# Quick sanity check: (n_rows, n_columns) of the frame parsed from our results file.
df_ours.shape

In [4]:
# Parse the MCMC analog-search log into one row per (target, visited molecule).
# Each non-empty line holds whitespace-separated fields:
#   index, "<target SMILES><DELIM><ind>", comma-separated scores,
#   comma-separated SMILES, comma-separated integer indices, [extra fields...]
with open('/home/msun415/SynTreeNet/output_mcmc_analog_top_k=3_top_k_rxn=3_max_rxns=-1_max_num_rxns=3_strategy=conf.txt') as f:
    lines = f.readlines()
data = []
for line in lines:
    if not line.strip():
        # Tolerate blank lines instead of failing on tuple unpacking.
        continue
    index, res, scores, smis, inds, *pargs = line.split()
    smiles, ind = res.split(DELIM)
    score_history = [float(s) for s in scores.split(',')]
    smiles_history = smis.split(',')
    # Not used for the frame below; parsing it still validates the field.
    inds_history = [int(i) for i in inds.split(',')]
    # zip() pairs molecules with scores positionally and stops at the shorter list.
    for smi, score in zip(smiles_history, score_history):
        data.append({'decoded': smi, 'targets': smiles, 'sim': score})
df_mcmc = pd.DataFrame(data)

In [5]:


def compute_diversity(mol_list):
    """Return the internal diversity of a collection of SMILES strings.

    Diversity is ``1 - (mean pairwise Tanimoto similarity)``, computed on
    Morgan bit-vector fingerprints (radius 3, 2048 bits).

    Args:
        mol_list: Sized iterable of SMILES strings (for example a list or a
            pandas Series, as passed by ``groupby(...).agg``).

    Returns:
        float: ``0.0`` when fewer than two molecules are given (no pair
        exists, so dividing by the pair count would fail), otherwise one
        minus the average similarity over all unordered pairs.

    Note:
        SMILES that RDKit cannot parse are not handled here; the fingerprint
        call will raise for them.
    """
    # Guard both the single-molecule and the empty case: with n < 2 there are
    # zero pairs and the division below would raise ZeroDivisionError.
    if len(mol_list) < 2:
        return 0.
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 3, 2048)
        for smi in mol_list
    ]
    similarity = 0
    # Compare each fingerprint with all earlier ones so every unordered pair
    # is counted exactly once.
    for i in range(1, len(fps)):
        similarity += sum(DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i]))
    n = len(fps)
    n_pairs = n * (n - 1) / 2
    return 1 - similarity / n_pairs

# First, group all analogs by target;
# then keep the top K per target, ranked by similarity.



def _top_analogs(df, target_col, sim_col, num_analogs):
    """Per target, keep the `num_analogs` unique decoded molecules with the highest similarity.

    Args:
        df: Frame with a 'decoded' column plus the two columns named below.
        target_col: Name of the column identifying the target molecule.
        sim_col: Name of the similarity column used for ranking.
        num_analogs: Maximum number of rows kept per target.

    Returns:
        A frame with a fresh 0..n-1 index holding the selected rows.
    """
    return (
        df.groupby(target_col)
        .apply(lambda grp: grp.drop_duplicates(subset='decoded').nlargest(num_analogs, sim_col))
        .reset_index(drop=True)
    )


def _sa_scores(smiles_list):
    """Return the `sascorer.calculateScore` value for every SMILES in `smiles_list`."""
    return [sascorer.calculateScore(Chem.MolFromSmiles(smi)) for smi in smiles_list]


for NUM_ANALOGS in [1,3,5,30]:
    # Top-K unique analogs per target for every method.
    # Note the differing column names: baseline uses 'targets'/'similarity',
    # ours uses 'target'/'sim', unirxn and mcmc use 'targets'/'sim'.
    result_baseline = _top_analogs(df_baseline, 'targets', 'similarity', NUM_ANALOGS)
    result_ours = _top_analogs(df_ours, 'target', 'sim', NUM_ANALOGS)
    result_unirxn = _top_analogs(df_unirxn, 'targets', 'sim', NUM_ANALOGS)
    result_mcmc = _top_analogs(df_mcmc, 'targets', 'sim', NUM_ANALOGS)

    # Synthetic-accessibility score of every selected analog.
    result_baseline['sa_score'] = _sa_scores(result_baseline['decoded'])
    result_ours['sa_score'] = _sa_scores(result_ours['decoded'])
    result_unirxn['sa_score'] = _sa_scores(result_unirxn['decoded'])
    result_mcmc['sa_score'] = _sa_scores(result_mcmc['decoded'])

    # Per-target mean similarity of the selected analogs.
    aggr_baseline_sim = result_baseline.groupby('targets').agg(avg_sim=('similarity', 'mean'))
    aggr_ours_sim = result_ours.groupby('target').agg(avg_sim=('sim', 'mean'))
    aggr_unirxn_sim = result_unirxn.groupby('targets').agg(avg_sim=('sim', 'mean'))
    aggr_mcmc_sim = result_mcmc.groupby('targets').agg(avg_sim=('sim', 'mean'))

    # Per-target best similarity. The output column keeps the name 'avg_sim'
    # even though it holds the maximum.
    aggr_baseline_sim_max = result_baseline.groupby('targets').agg(avg_sim=('similarity', 'max'))
    aggr_ours_sim_max = result_ours.groupby('target').agg(avg_sim=('sim', 'max'))
    aggr_unirxn_sim_max = result_unirxn.groupby('targets').agg(avg_sim=('sim', 'max'))
    aggr_mcmc_sim_max = result_mcmc.groupby('targets').agg(avg_sim=('sim', 'max'))

    # Per-target mean SA score.
    aggr_baseline_sa = result_baseline.groupby('targets').agg(sa_score=('sa_score', 'mean'))
    aggr_ours_sa = result_ours.groupby('target').agg(sa_score=('sa_score', 'mean'))
    aggr_unirxn_sa = result_unirxn.groupby('targets').agg(sa_score=('sa_score', 'mean'))
    aggr_mcmc_sa = result_mcmc.groupby('targets').agg(sa_score=('sa_score', 'mean'))

    # Per-target diversity of the selected analogs (not computed for unirxn).
    aggr_baseline_diversity = result_baseline.groupby('targets').agg(diversity=('decoded', compute_diversity))
    aggr_ours_diversity = result_ours.groupby('target').agg(diversity=('decoded', compute_diversity))
    aggr_mcmc_diversity = result_mcmc.groupby('targets').agg(diversity=('decoded', compute_diversity))

    # A target counts as recovered when its best analog has similarity exactly 1.
    aggr_baseline_recover = aggr_baseline_sim_max == 1.
    aggr_ours_recover = aggr_ours_sim_max == 1.
    aggr_unirxn_recover = aggr_unirxn_sim_max == 1.
    aggr_mcmc_recover = aggr_mcmc_sim_max == 1.

    # Average every per-target metric over targets; each entry is a one-element Series.
    res = (
        aggr_ours_recover.mean(), aggr_baseline_recover.mean(), aggr_unirxn_recover.mean(), aggr_mcmc_recover.mean(),
        aggr_ours_sim.mean(), aggr_baseline_sim.mean(), aggr_unirxn_sim.mean(), aggr_mcmc_sim.mean(),
        aggr_ours_diversity.mean(), aggr_baseline_diversity.mean(), aggr_mcmc_diversity.mean(),
        aggr_ours_sa.mean(), aggr_baseline_sa.mean(), aggr_unirxn_sa.mean(), aggr_mcmc_sa.mean(),
    )
    metrics = ['recover_ours', 'recover_baseline',  'recover_unirxn', 'recover_mcmc',
                'sim_ours', 'sim_baseline', 'sim_unirxn', 'sim_mcmc',
                'diversity_ours', 'diversity_baseline', 'diversity_mcmc',
                'sa_ours', 'sa_baseline', 'sa_unirxn', 'sa_mcmc']
    assert len(metrics) == len(res)
    for metric, r in zip(metrics, res):
        # Print the scalar instead of the Series repr, which added a column
        # label and a "dtype: float64" line to every metric.
        print(f"{NUM_ANALOGS} {metric} {r.iloc[0]:.6f}")

1 recover_ours avg_sim    0.078652
dtype: float64
1 recover_baseline avg_sim    0.072
dtype: float64
1 recover_unirxn avg_sim    0.001227
dtype: float64
1 recover_mcmc avg_sim    0.103448
dtype: float64
1 sim_ours avg_sim    0.545772
dtype: float64
1 sim_baseline avg_sim    0.551169
dtype: float64
1 sim_unirxn avg_sim    0.103975
dtype: float64
1 sim_mcmc avg_sim    0.499317
dtype: float64
1 diversity_ours diversity    0.0
dtype: float64
1 diversity_baseline diversity    0.0
dtype: float64
1 diversity_mcmc diversity    0.0
dtype: float64
1 sa_ours sa_score    2.552345
dtype: float64
1 sa_baseline sa_score    2.690997
dtype: float64
1 sa_unirxn sa_score    3.018992
dtype: float64
1 sa_mcmc sa_score    2.509575
dtype: float64
3 recover_ours avg_sim    0.078652
dtype: float64
3 recover_baseline avg_sim    0.072
dtype: float64
3 recover_unirxn avg_sim    0.001227
dtype: float64
3 recover_mcmc avg_sim    0.103448
dtype: float64
3 sim_ours avg_sim    0.475911
dtype: float64
3 sim_baseline av

(avg_sim    0.103975
 dtype: float64,
 sa_score    3.018992
 dtype: float64)

In [166]:
# Element-wise test of which per-target mean similarities equal exactly 1.
# Relies on `aggr_baseline_sim` as left in the kernel by the last iteration of
# the NUM_ANALOGS analysis loop.
aggr_baseline_sim==1

Unnamed: 0_level_0,avg_sim
targets,Unnamed: 1_level_1
Br.C[C@@]1(O)CS[C@@H](n2ccc(N)nc2=O)[C@@H]1O,0.397519
Br.Cc1ccc2n(c1)cc(COc1ccc(/C=N/NC(=N)NO)cc1)[n+]2C.[Br-],0.271178
Brc1ccc(OC(c2ccccc2)C2CCNCC2)cc1,0.765883
C#CC(=O)N(c1cccc(NS(C)(=O)=O)c1)C(C(=O)NCc1ccccc1)c1cccc(N)c1,0.531385
C#CC(C)(C)N=C(S)NC1CC2C=CC1C2,0.295130
...,...
c1cc2c(cc1CN[C@H]1C3C4CC5C6C4CC3C6C51)OCO2,0.739114
c1ccc(C(n2ccnc2)n2ccnc2)cc1,0.380998
c1ccc(CC2CCN(c3ncnc4sc5c(c34)CCCC5)CC2)cc1,0.815158
c1ccc(Nc2ncnc3c2oc2cc(-c4ccc5c(c4)OCO5)cnc23)cc1,0.448492


In [136]:

# Export the building-block SMILES as a pickle with the same list stored under
# both the 'reactant' and the 'reagent' key.
bb_file = '/home/msun415/SynTreeNet/data/assets/building-blocks/enamine_us_matched.csv'
reactants = pd.read_csv(bb_file)['SMILES']
dic = {'reactant': list(reactants), 'reagent': list(reactants)}
# Use a context manager so the file is flushed and closed once the dump finishes.
with open('/home/msun415/Uni-RXN-official/dataset/data/enamine_react_lib_smi.pkl', 'wb') as f:
    pickle.dump(dic, f)


In [30]:
# Inspect the score trajectory of the last line parsed from the MCMC log;
# `score_history` is the loop variable left over from the parsing cell.
score_history

[0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.38461538461538464,
 0.27941176470588236,
 0.38461538461538464,
 0.38461538461538464,
 0.509090909090909,
 0.38461538461538464,
 0.509090909090909,
 0.509090909090909,
 0.35294117647058826,
 0.2602739726027397,
 0.509090909090909,
 0.2602739726027397,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.509090909090909,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.509090909090909,
 0.21333333333333335,
 0.509090909090909,
 0.509090909090909,
 0.28169014084507044,
 0.1917808219178082,
 0.26666666666666666,
 0.26666666666666666,
 0.26666666666666666,
 0.26666666666666666,
 0.509090909090909,
 0.26666666666666666,
 0.26666666666666666,
 0.26666666666666666,
 0.509090909090909,
 0.26666666666666666,
 0.2372881355932203

NameError: name 'df_baseline' is not defined