In [1]:
import os
import gc
import random
import itertools
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import tokenizers
import transformers
from transformers import AutoTokenizer, EncoderDecoderModel, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import datasets
from datasets import load_dataset, load_metric
import sentencepiece
import argparse
from datasets.utils.logging import disable_progress_bar
from rdkit import Chem
import rdkit
disable_progress_bar()

class CFG:
    """Notebook-wide configuration constants."""
    # Short tag for the model family (not read by the visible cells).
    model = 't5'
    # CSV file holding the model predictions to evaluate.
    dataset_path = 'multiinput_prediction_output.csv'
    # Model id / path handed to AutoTokenizer.from_pretrained.
    model_name_or_path = 'sagawa/ReactionT5-product-prediction'
    # Presumably beam-search settings used when the predictions were generated;
    # no visible cell reads them -- TODO confirm.
    num_beams = 5
    num_return_sequences = 5
    debug = True
    # Seed passed to seed_everything().
    seed = 42
    # Read by the data-preparation cell that optionally appends the
    # reconstructed reactions; it was missing, which raised AttributeError there.
    use_reconstructed_data = False
    

# Device identifier string.
# NOTE(review): no visible cell reads `device`; presumably kept for model inference -- confirm.
device = 'cpu'

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch (CPU and
            all CUDA devices).
    """
    random.seed(seed)
    # Exported for child processes; the running interpreter fixed its own
    # hash seed at start-up, so this does not change hashing in this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current one; this is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
seed_everything(seed=CFG.seed)

# dataset = pd.read_csv(CFG.dataset_path)

# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed when loading the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path)

In [2]:
# Beam-search predictions, one row per reaction.
df = pd.read_csv('forward_reaction_prediction_output_beam10.csv')
seed_everything(seed=CFG.seed)
# Attach the ground-truth products; rows are matched on the default index.
df = df.assign(target=pd.read_csv('sampled.csv')['PRODUCT'])
df

Unnamed: 0,input,0th,1th,2th,3th,4th,5th,6th,7th,8th,...,1th score,2th score,3th score,4th score,5th score,6th score,7th score,8th score,9th score,target
0,REACTANT:C#CCO.C1CCOC1.CCN(C(C)C)C(C)C.ClC(Cl)...,Cc1cc(C)cc(C#CCO)c1,Cc1cc(C)cc(C#CCO)c1.Cc1cc(C)cc(C#CCO)c1,C#CCOc1cc(C)cc(C)c1,Cc1cc(C)cc(C#CCO)c1.OCC#Cc1cc(C)cc(I)c1,Cc1cc(C)cc(C#CCO)c1.Cc1ccc(P(c2ccccc2)c2ccccc2)c1,Cc1cc(C)cc(C#CCO)c1.Cc1cc(C)cc(I)c1,Cc1cc(C)cc(C#CCO)c1.Cc1ccc(S(=O)(=O)[O-])cc1,C#CCOc1cc(C)cc(C)c1.OCC#Cc1cc(C)cc(I)c1,Cc1cc(C)cc(C#CCO)c1)c1cc(C)cc(I)c1,...,-0.222811,-0.262950,-0.269789,-0.282104,-0.299953,-0.329690,-0.330433,-0.358373,-0.360476,Cc1cc(C)cc(C#CCO)c1
1,REACTANT:C1CCOC1.C=CC(=O)O.CCN=C=NCCCN(C)C.Cl....,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,...,-0.060352,-0.074450,-0.088970,-0.089133,-0.094103,-0.097159,-0.097267,-0.100273,-0.101312,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...
2,REACTANT:COc1cc(Br)ccc1C=O.C[Si](C)(C)Cl.NC(=O...,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].O=S(=O)([O...,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,Cn1nnnc1-c1cc(N)cc(Br)c1,c1ccc2sc(-c3nccc4ccccc34)cc2c1,CCOC(=O)c1cccc(-c2cn(-c3ccc(F)c(F)c3)nc2C(=O)O...,COc1ccc(C(OCC(O)CN(C)C)(c2ccccc2)c2ccc(OC)cc2)cc1,Cc1cc(Cc2cc(C)c(O)c(C=O)c2C)c(C)c(C=O)c1O,O=C(NNC(=O)C(F)(F)F)C(F)(F)F,...,-0.029513,-0.120316,-0.121527,-0.138246,-0.141353,-0.163921,-0.202375,-0.231343,-0.233597,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1
3,REACTANT:CCOC(=O)CCCN(C)C1CCCCC1.[Na+].[OH-]RE...,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,O=C(O)c1cc(-n2c(=O)cc(C(F)(F)F)[nH]c2=O)ccc1Cl,COc1nc(N2CCCC2)ccc1[N+](=O)[O-],C[C@H](CO)CO[Si](c1ccccc1)(c1ccccc1)C(C)(C)C,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,Cc1cc(Cc2cc(C)c(O)c(C=O)c2C)c(C)c(C=O)c1O,c1ccc2sc(-c3nccc4ccccc34)cc2c1,COC(=O)c1nc(Br)cc(Br)c1OCc1ccccc1,Clc1ccc2cnc3ccccc3c2c1,...,-0.070244,-0.072933,-0.078933,-0.085600,-0.090770,-0.105308,-0.134063,-0.187424,-0.198361,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O
4,REACTANT:CC(=O)[O-].COc1cc(C(=O)CBr)cc([N+](=O...,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,Oc1cc2cc[nH]c2cc1O,O=C(O)c1cc(-n2c(=O)cc(C(F)(F)F)[nH]c2=O)ccc1Cl,O=C(O)CC(O)(CC(=O)O)C(=O)O.O=C[O-].O=C[O-].[Cu+2],Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,Cc1cc(Cc2cc(C)c(O)c(C=O)c2C)c(C)c(C=O)c1O,...,-0.025438,-0.073633,-0.106168,-0.109289,-0.141472,-0.169183,-0.176353,-0.176743,-0.178401,O=C(O)c1cc(-n2c(=O)cc(C(F)(F)F)[nH]c2=O)ccc1Cl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,REACTANT:ClCCCCCBr.O.OCC1CCCCC1.[H-].[Na+]REAG...,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,CC#CN1C(=O)C(C)Oc2ccc(-n3c(=O)cc(C(F)(F)F)[nH]...,C[C@H](CO)CO[Si](c1ccccc1)(c1ccccc1)C(C)(C)C,O=[N+]([O-])c1cc([N+](=O)[O-])c(OCCO)c(C(F)(F)...,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,O=C(O)c1cc(-n2c(=O)cc(C(F)(F)F)[nH]c2=O)ccc1Cl,Cc1cc(Cc2cc(C)c(O)c(C=O)c2C)c(C)c(C=O)c1O,COC(=O)c1nc(Br)cc(Br)c1OCc1ccccc1,Sc1nc(SCc2ccccc2)n[nH]1,...,-0.056146,-0.066046,-0.072189,-0.089448,-0.092988,-0.102559,-0.102608,-0.138105,-0.160780,NN=C(C=Cc1ccccc1)c1ccccc1
19996,REACTANT:CON1CCC(C#N)(NO)CC1.Cc1ccc(C)c(CC(=O)...,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,c1ccc2sc(-c3nccc4ccccc34)cc2c1,Cn1nnnc1-c1cc(N)cc(Br)c1,O=C(NNC(=O)C(F)(F)F)C(F)(F)F,CCOC(=O)c1cccc(-c2cn(-c3ccc(F)c(F)c3)nc2C(=O)O...,O=C(O)c1cccc(Cl)c1,COC(=O)c1nc(Br)cc(Br)c1OCc1ccccc1,Clc1ccc2cnc3ccccc3c2c1,NN=C(C=Cc1ccccc1)c1ccccc1,...,-0.067891,-0.097094,-0.106638,-0.109478,-0.113609,-0.160995,-0.167301,-0.177450,-0.263991,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1
19997,REACTANT:CCN(CC)CC.CCOC(C)=O.NCC1=NN=C(N)C1=NN...,CCCCCCCC/C=C\CCCCCCCC(=O)O,O=C(O)c1cccc(Cl)c1,O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].O=S(=O)([O...,C[C@H](CO)CO[Si](c1ccccc1)(c1ccccc1)C(C)(C)C,COC(=O)c1nc(Br)cc(Br)c1OCc1ccccc1,CCOC(=O)c1cccc(-c2cn(-c3ccc(F)c(F)c3)nc2C(=O)O...,c1ccc2sc(-c3nccc4ccccc34)cc2c1,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,...,-0.059629,-0.078654,-0.081071,-0.125459,-0.126357,-0.130624,-0.163021,-0.170716,-0.179579,O=C(O)c1cccc(Cl)c1
19998,REACTANT:BrCC1CO1.CN(C)C=O.[K+]REAGENT:Brc1ccc...,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC1CO1,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC1CO1.Brc1ccc2[nH]...,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC1CO1.Brc1ccc2c(c1...,Brc1ccc2nn(CC3CO3)c3ccc(Br)cc3c2c1,OC(CBr)Cn1c2ccc(Br)cc2c2cc(Br)ccc21,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC1CO1.Ic1ccc2[nH]c...,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC(O)CBr,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC1CO1.[K]c1ccc2[nH...,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC1CO1.Brc1ccc2nc3c...,...,-0.232044,-0.274885,-0.275088,-0.299018,-0.300001,-0.323155,-0.330111,-0.342570,-0.351696,Brc1ccc2c(c1)c1cc(Br)ccc1n2CC1CO1


In [3]:
from rdkit import Chem
def canonicalize(mol):
    """Return the RDKit canonical (isomeric) SMILES for a SMILES string.

    Args:
        mol: SMILES string to normalise.

    Returns:
        The canonical SMILES string produced by RDKit.

    Raises:
        ValueError: If RDKit cannot parse ``mol``.
    """
    parsed = Chem.MolFromSmiles(mol)
    if parsed is None:
        # MolFromSmiles reports a parse failure by returning None; passing that
        # on to MolToSmiles fails with an opaque argument-type error instead.
        raise ValueError(f'Invalid SMILES: {mol!r}')
    return Chem.MolToSmiles(parsed, isomericSmiles=True)
# Sanity check: the local helper and RDKit's own CanonSmiles should agree
# on an arbitrary target row.
i = 40
sample_target = df['target'][i]
(canonicalize(sample_target), Chem.CanonSmiles(sample_target))

('c1ccc2sc(-c3nccc4ccccc34)cc2c1', 'c1ccc2sc(-c3nccc4ccccc34)cc2c1')

In [4]:
def remove_space(row, top_k=5):
    """Strip spaces from the first ``top_k`` prediction columns of one row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with columns named
            ``'0th'``, ``'1th'``, ... holding predicted SMILES strings.
        top_k: Number of leading prediction columns to clean. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        The same ``row`` object, modified in place.
    """
    for rank in range(top_k):
        column = f'{rank}th'
        value = row[column]
        # An empty prediction comes back from read_csv as NaN (a float), which
        # has no .replace(); leave such non-string cells untouched.
        if isinstance(value, str):
            row[column] = value.replace(' ', '')
    return row
# axis=1 hands each row to remove_space; the cleaned rows form the new frame.
df = df.apply(remove_space, axis=1)

In [5]:
def canonicalize2(mol):
    """Canonicalise ``mol``, returning ``None`` instead of raising on failure.

    Args:
        mol: Candidate SMILES string (possibly invalid).

    Returns:
        The result of ``canonicalize(mol)``, or ``None`` if it raised.
    """
    try:
        return canonicalize(mol)
    except Exception:
        # Deliberately best-effort: a prediction that cannot be canonicalised
        # simply never matches a target. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt and SystemExit propagate.
        return None

In [6]:
# Number of leading candidates inspected when counting invalid SMILES.
top_k_invalidity = 5

# Per-row 0/1 hit flags for each reported cut-off, plus per-row invalid counts.
accuracy_cutoffs = (1, 2, 3, 5)
hit_flags = {cutoff: [] for cutoff in accuracy_cutoffs}
invalidity = []

for _, row in df.iterrows():
    target = canonicalize(row['target'])

    # Walk the five best candidates in rank order and stop at the first one
    # whose canonical SMILES equals the target (None when none of them does).
    first_hit = next(
        (rank for rank in range(5) if canonicalize2(row[f'{rank}th']) == target),
        None,
    )
    # A hit at rank r counts for every cut-off k with r < k.
    for cutoff in accuracy_cutoffs:
        hit_flags[cutoff].append(1 if first_hit is not None and first_hit < cutoff else 0)

    # A candidate is invalid when RDKit cannot build a molecule from it
    # (trailing '.' separators are stripped before parsing).
    candidates = [row[f'{rank}th'] for rank in range(top_k_invalidity)]
    invalidity.append(
        sum(1 for smiles in candidates if Chem.MolFromSmiles(smiles.rstrip('.')) is None)
    )

top1, top2, top3, top5 = (hit_flags[cutoff] for cutoff in accuracy_cutoffs)
df['top1_accuracy'] = top1
df['top2_accuracy'] = top2
df['top3_accuracy'] = top3
df['top5_accuracy'] = top5
df['invalidity'] = invalidity

[11:28:16] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 9
[11:28:16] Explicit valence for atom # 15 N, 4, is greater than permitted
[11:28:16] Can't kekulize mol.  Unkekulized atoms: 2 12 13 14 15 16 17
[11:28:16] Can't kekulize mol.  Unkekulized atoms: 2 12 13 14 15 16 17
[11:28:16] SMILES Parse Error: extra open parentheses for input: 'COC(=O)C1=CO[C@@H](O[C@@H]2O[C@H](CO)[C@H](O)[C@H](O)[C@H]12'
[11:28:16] SMILES Parse Error: extra open parentheses for input: 'CSC1=CC(CCCCOc2ccccc2)(O[Si](C)(C)C)C(C(O)C=CC(CCCO[Si](C)(C)C(C)(C)C)CC1=O'
[11:28:16] Explicit valence for atom # 8 N, 4, is greater than permitted
[11:28:16] Can't kekulize mol.  Unkekulized atoms: 7 8 9 10 11 12 13 14 15
[11:28:17] SMILES Parse Error: extra open parentheses for input: 'O=C(OC1CCCCC1C(C(=O)N1CCNCC1)C(c1cccc(Cl)c1)C1CCCCC1O'
[11:28:17] SMILES Parse Error: extra open parentheses for input: 'CC(C)C(NC(=O)OCc1ccccc1)C(=O)OCC(COC(=O)CCC(=O)OCC1OC(n2cnc3c(=O)[nH]c(N)nc32)CC1F'
[11:28:17] SMILES Parse Error: ex

[11:28:24] SMILES Parse Error: extra open parentheses for input: 'COc1nc2ccc([N+](=O)[O-])cc2c1NC(=O)N[C@H]1CC[C@@]2(C)C(=CC[C@@H]3[C@@H]2CC[C@@]2(C)[C@H]3CC[C@@H]2[C@H](C)CCCC(C)C'
[11:28:24] SMILES Parse Error: extra close parentheses while parsing: COc1nc2ccc([N+](=O)[O-])cc2c1NC(=O)N[C@H]1CC[C@H]2C(CO)=C[C@H]3[C@@H]2CC[C@@]2(C)[C@H]3CC[C@@H]2[C@H](C)CCCC(C)C)C
[11:28:24] SMILES Parse Error: Failed parsing SMILES 'COc1nc2ccc([N+](=O)[O-])cc2c1NC(=O)N[C@H]1CC[C@H]2C(CO)=C[C@H]3[C@@H]2CC[C@@]2(C)[C@H]3CC[C@@H]2[C@H](C)CCCC(C)C)C' for input: 'COc1nc2ccc([N+](=O)[O-])cc2c1NC(=O)N[C@H]1CC[C@H]2C(CO)=C[C@H]3[C@@H]2CC[C@@]2(C)[C@H]3CC[C@@H]2[C@H](C)CCCC(C)C)C'
[11:28:24] SMILES Parse Error: extra close parentheses while parsing: COc1nc2ccc([N+](=O)[O-])cc2c1NC(=O)N[C@H]1CC[C@H]2C(CO)=CC[C@H]3[C@@H]2CC[C@@]2(C)[C@H]3CC[C@@H]2[C@H](C)CCCC(C)C)
[11:28:24] SMILES Parse Error: Failed parsing SMILES 'COc1nc2ccc([N+](=O)[O-])cc2c1NC(=O)N[C@H]1CC[C@H]2C(CO)=CC[C@H]3[C@@H]2CC[C@@]2(C)[C@H]3CC[C@@H]

[11:28:28] SMILES Parse Error: unclosed ring for input: 'CC(O[Si](C)(C)C(C)(C)C)C1C(=O)NC1C(C)C(=O)C(C)(C)OC1CCO'
[11:28:28] SMILES Parse Error: extra close parentheses while parsing: CC(O[Si](C)(C)C(C)(C)C)C1C(=O)NC1C(C)C(=O)C(C)(C)O)C(C)=O
[11:28:28] SMILES Parse Error: Failed parsing SMILES 'CC(O[Si](C)(C)C(C)(C)C)C1C(=O)NC1C(C)C(=O)C(C)(C)O)C(C)=O' for input: 'CC(O[Si](C)(C)C(C)(C)C)C1C(=O)NC1C(C)C(=O)C(C)(C)O)C(C)=O'
[11:28:28] SMILES Parse Error: unclosed ring for input: 'CN(C)C1(c2ccccc2)C(F)CC(F)N1CCC(O)(c1ccc(Cl)cc1)CC1'
[11:28:28] Can't kekulize mol.  Unkekulized atoms: 14 15 16 17 18 19 22 23 24
[11:28:28] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 9 10 25 26 33 34 35
[11:28:28] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10
[11:28:28] Explicit valence for atom # 4 C, 5, is greater than permitted
[11:28:28] Can't kekulize mol.  Unkekulized atoms: 5 6 13 23 25
[11:28:28] Can't kekulize mol.  Unkekulized atoms: 1 2 3 5 10 11 26
[11:28:28] SMILES Parse Error: u

[11:28:31] SMILES Parse Error: syntax error while parsing: CC(C)(C)[CCCO+3]CC=C(C2=CC=CC2)CC=C1.[Cl-]
[11:28:31] SMILES Parse Error: Failed parsing SMILES 'CC(C)(C)[CCCO+3]CC=C(C2=CC=CC2)CC=C1.[Cl-]' for input: 'CC(C)(C)[CCCO+3]CC=C(C2=CC=CC2)CC=C1.[Cl-]'
[11:28:31] SMILES Parse Error: extra open parentheses for input: 'CCOC(C)OC(C#N)C(=CC=C(C)CCC=C(C)CCC=C(C)C[S+](c1ccccc1)(c1ccccc1)C(C)C.[Cl-]'
[11:28:31] SMILES Parse Error: extra open parentheses for input: 'CCOC(C)OC(C#N)C(=CC=C(C)CCC=C(C)C[S+](c1ccccc1)(c1ccccc1)C(C)C.[Cl-]'
[11:28:31] SMILES Parse Error: extra open parentheses for input: 'COCCCn1ncc2ccc(CC(CC(NC(=O)OC(C)(C)C)C(C)O)cc21'
[11:28:31] SMILES Parse Error: extra open parentheses for input: 'COCCCn1ncc2ccc(CC(CC(NC(=O)OC(C)(C)C)C(O)C=O)cc21'
[11:28:31] Can't kekulize mol.  Unkekulized atoms: 1 2 3 13 24
[11:28:31] Can't kekulize mol.  Unkekulized atoms: 1 2 3 16 27
[11:28:31] Can't kekulize mol.  Unkekulized atoms: 2 3 4 14 25
[11:28:31] Can't kekulize mol.  Unkekulized

[11:28:34] SMILES Parse Error: syntax error while parsing: OC1(c2ccccc2)CCCC(CN2CCN(c3ccccn3)CC2)C1.[c1ccccc1
[11:28:34] SMILES Parse Error: Failed parsing SMILES 'OC1(c2ccccc2)CCCC(CN2CCN(c3ccccn3)CC2)C1.[c1ccccc1' for input: 'OC1(c2ccccc2)CCCC(CN2CCN(c3ccccn3)CC2)C1.[c1ccccc1'
[11:28:34] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 26 28
[11:28:34] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 11 12 13 26 28
[11:28:34] Can't kekulize mol.  Unkekulized atoms: 17 18 19 21 22 25 26
[11:28:34] SMILES Parse Error: extra open parentheses for input: 'CN1CC2(CN(c3ccc(Nc4cc(-c5cccc(-n6ncc7cc(C(C)(C)C)cc(F)c5=O)nn(C)c4=O)nc3)C2)C1'
[11:28:34] SMILES Parse Error: extra open parentheses for input: 'CN1CC2(CN(c3ccc(Nc4cc(-c5cccc(-n6ncc7cc(C(C)(C)C)cc(F)c7c4=O)nn(C)c4=O)nc3)C2)C1'
[11:28:34] Can't kekulize mol.  Unkekulized atoms: 11 12 13
[11:28:34] SMILES Parse Error: extra open parentheses for input: 'CN1CC2(CN(c3ccc(Nc4cc(-c5cccc(-n6ncc7cc(C(C)(C)C)cc(F)c5c4=O)nn(C)c4=O)nc3)C2)C1

In [7]:
# Preview the first rows, which now carry the accuracy and invalidity columns.
df.head()

Unnamed: 0,input,0th,1th,2th,3th,4th,5th,6th,7th,8th,...,6th score,7th score,8th score,9th score,target,top1_accuracy,top2_accuracy,top3_accuracy,top5_accuracy,invalidity
0,REACTANT:C#CCO.C1CCOC1.CCN(C(C)C)C(C)C.ClC(Cl)...,Cc1cc(C)cc(C#CCO)c1,Cc1cc(C)cc(C#CCO)c1.Cc1cc(C)cc(C#CCO)c1,C#CCOc1cc(C)cc(C)c1,Cc1cc(C)cc(C#CCO)c1.OCC#Cc1cc(C)cc(I)c1,Cc1cc(C)cc(C#CCO)c1.Cc1ccc(P(c2ccccc2)c2ccccc2)c1,Cc1cc(C)cc(C#CCO)c1.Cc1cc(C)cc(I)c1,Cc1cc(C)cc(C#CCO)c1.Cc1ccc(S(=O)(=O)[O-])cc1,C#CCOc1cc(C)cc(C)c1.OCC#Cc1cc(C)cc(I)c1,Cc1cc(C)cc(C#CCO)c1)c1cc(C)cc(I)c1,...,-0.32969,-0.330433,-0.358373,-0.360476,Cc1cc(C)cc(C#CCO)c1,1,1,1,1,0
1,REACTANT:C1CCOC1.C=CC(=O)O.CCN=C=NCCCN(C)C.Cl....,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,...,-0.097159,-0.097267,-0.100273,-0.101312,C=CC(=O)N1CCCC1C(=O)Nc1cc2c(Nc3ccc(OCc4ccccn4)...,1,1,1,1,0
2,REACTANT:COc1cc(Br)ccc1C=O.C[Si](C)(C)Cl.NC(=O...,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,O=S(=O)([O-])[O-].O=S(=O)([O-])[O-].O=S(=O)([O...,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,Cn1nnnc1-c1cc(N)cc(Br)c1,c1ccc2sc(-c3nccc4ccccc34)cc2c1,CCOC(=O)c1cccc(-c2cn(-c3ccc(F)c(F)c3)nc2C(=O)O...,COc1ccc(C(OCC(O)CN(C)C)(c2ccccc2)c2ccc(OC)cc2)cc1,Cc1cc(Cc2cc(C)c(O)c(C=O)c2C)c(C)c(C=O)c1O,O=C(NNC(=O)C(F)(F)F)C(F)(F)F,...,-0.163921,-0.202375,-0.231343,-0.233597,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,1,1,1,1,0
3,REACTANT:CCOC(=O)CCCN(C)C1CCCCC1.[Na+].[OH-]RE...,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,O=C(O)c1cc(-n2c(=O)cc(C(F)(F)F)[nH]c2=O)ccc1Cl,COc1nc(N2CCCC2)ccc1[N+](=O)[O-],C[C@H](CO)CO[Si](c1ccccc1)(c1ccccc1)C(C)(C)C,Cn1nnnc1-c1cc(Br)cc([N+](=O)[O-])c1,Cc1cc(Cc2cc(C)c(O)c(C=O)c2C)c(C)c(C=O)c1O,c1ccc2sc(-c3nccc4ccccc34)cc2c1,COC(=O)c1nc(Br)cc(Br)c1OCc1ccccc1,Clc1ccc2cnc3ccccc3c2c1,...,-0.105308,-0.134063,-0.187424,-0.198361,Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,1,1,1,1,0
4,REACTANT:CC(=O)[O-].COc1cc(C(=O)CBr)cc([N+](=O...,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,Oc1cc2cc[nH]c2cc1O,O=C(O)c1cc(-n2c(=O)cc(C(F)(F)F)[nH]c2=O)ccc1Cl,O=C(O)CC(O)(CC(=O)O)C(=O)O.O=C[O-].O=C[O-].[Cu+2],Cc1cc(Cc2cc(C)c(O)c(C=O)c2)cc(C=O)c1O,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,CCc1c(Cc2[nH]c(C(=O)OCc3ccccc3)c(C)c2CCC(=O)OC...,Cc1cc(Cc2cc(C)c(O)c(C=O)c2C)c(C)c(C=O)c1O,...,-0.169183,-0.176353,-0.176743,-0.178401,O=C(O)c1cc(-n2c(=O)cc(C(F)(F)F)[nH]c2=O)ccc1Cl,0,0,1,1,0


In [8]:
# Top-k accuracy: share of rows whose target is among the first k candidates.
accuracy_columns = ['top1_accuracy', 'top2_accuracy', 'top3_accuracy', 'top5_accuracy']
print(*(sum(df[column]) / len(df) for column in accuracy_columns))

# Percentage of inspected candidates that RDKit could not parse.
inspected_candidates = len(invalidity) * top_k_invalidity
print(sum(invalidity) / inspected_candidates * 100)

0.53595 0.64195 0.6886 0.74785
0.17500000000000002


In [53]:
# Load the multi-input predictions and attach the validation products as targets.
df = pd.read_csv(CFG.dataset_path)
target_df = pd.read_csv('val.csv')
df['target'] = target_df['PRODUCT']

# `return_tensors` is an argument of the tokenizer call, not of
# `from_pretrained`, so it is not passed when loading the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path)
# Earlier batch of extra tokens, kept for reference:
# tokenizer.add_tokens(['.', '>', '<', 'P','Pd', 'Na', 'K', 'Al', 'Cu', 'Si', 'Zn', 'Mn', 'Li', 'Mg', 'Fe', 'Ba', 'Pt', 'Ag', 'Yb', '6', 'e'])
# 'Ru' was listed twice in this batch; the duplicate is dropped.
tokenizer.add_tokens(['Ru', 'Cl', 'Pb', 'Ti', 'Tl', '7', 'Ni', 'Ca', 'Hg', 'Sb', 'Rh', 'Nd', 'As', '8', 'Zr', 'p', 'W', 'Ar', 'Ge', 'Sm', 'Ta', 'Re', 'Au', 'Mo', 'Bi'])

# Round-trip every input through the tokenizer and report the ones that
# still decode to an unknown token, i.e. characters the vocabulary lacks.
for text in df['input']:
    input_ids = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='do_not_pad',
        return_offsets_mapping=False,
        truncation=True,
        return_attention_mask=True,
    ).input_ids
    decoded = tokenizer.decode(input_ids).replace(' ', '')
    if '<unk>' in decoded:
        print(text)
        print(decoded)


In [48]:
# Alphabetical overview of both batches of extra tokenizer tokens
# (duplicates are kept as listed).
later_batch = ['Ru', 'Cl', 'Pb', 'Ti','Tl', 'Ru', '7', 'Ni', 'Ca', 'Hg', 'Sb', 'Rh', 'Nd', 'As', '8', 'Zr', 'p', 'W', 'Ar', 'Ge', 'Sm', 'Ta', 'Re', 'Au', 'Mo', 'Bi']
earlier_batch = ['.', '>', '<', 'P','Pd', 'Na', 'K', 'Al', 'Cu', 'Si', 'Zn', 'Mn', 'Li', 'Mg', 'Fe', 'Ba', 'Pt', 'Ag', 'Yb', '6', 'e']
print(sorted(later_batch + earlier_batch))

['.', '6', '7', '8', '<', '>', 'Ag', 'Al', 'Ar', 'As', 'Au', 'Ba', 'Bi', 'Ca', 'Cl', 'Cu', 'Fe', 'Ge', 'Hg', 'K', 'Li', 'Mg', 'Mn', 'Mo', 'Na', 'Nd', 'Ni', 'P', 'Pb', 'Pd', 'Pt', 'Re', 'Rh', 'Ru', 'Ru', 'Sb', 'Si', 'Sm', 'Ta', 'Ti', 'Tl', 'W', 'Yb', 'Zn', 'Zr', 'e', 'p']


In [50]:
# Display the frame (pandas truncates the rendering of long frames).
# NOTE(review): the execution counts around this cell are out of order, so the
# stored output may not reflect the cell directly above -- re-run to confirm.
df

Unnamed: 0,input,0th,1th,2th,3th,4th,5th,6th,7th,8th,...,6th score,7th score,8th score,9th score,10th score,11th score,12th score,13th score,14th score,valid compound score
0,REACTANT:COc1ccc(S(=O)(=O)Cl)cc1.[Na+]REAGENT:...,COc1ccc(S(=O)[O-])cc1,COc1ccc(S(=O)[O-])cc1.[Cl-],COc1ccc(S(=O)(=O)[O-])cc1,O=S(=O)([O-])c1ccc(O)cc1,COc1ccc(S(=O)(=O)[O-])cc1.[Cl-],COc1ccc(S(=O)[O-])cc1.c1cc[nH+]cc1,Cl.O=S(=O)([O-])c1ccc(O)cc1,O=S(=O)([O-])c1ccc([O-])cc1,Cl.COc1ccc(S(=O)[O-])cc1,...,-0.263000,-0.266919,-0.295366,-0.295947,-0.297291,-0.314993,-0.316880,-0.317811,-0.319223,-0.008070
1,REACTANT:N#Cc1c(N)nc(Cl)c(C#N)c1-c1ccccc1.OCc1...,N#Cc1c(N)nc(OCc2ccccn2)c(C#N)c1-c1ccccc1,N#Cc1c(N)nc(Cc2ccccn2)c(C#N)c1-c1ccccc1,N#Cc1c(N)[nH]c(OCc2ccccn2)c(C#N)c1-c1ccccc1,Cl.N#Cc1c(N)nc(OCc2ccccn2)c(C#N)c1-c1ccccc1,N#Cc1c(N)[nH]c(=O)c(C#N)c1-c1ccccc1,N#Cc1c(N)[nH+]c(OCc2ccccn2)c(C#N)c1-c1ccccc1.[...,N#Cc1c(N)nc(C(=O)c2ccccn2)c(C#N)c1-c1ccccc1,N#Cc1c(N)nc(Oc2ccccn2)c(C#N)c1-c1ccccc1,NC(=O)c1c(N)nc(OCc2ccccn2)c(C#N)c1-c1ccccc1,...,-0.234125,-0.238098,-0.258809,-0.271725,-0.274950,-0.283902,-0.292726,-0.296436,-0.310742,-0.000224
2,REACTANT:CN1CCC(CCO)CC1.Cc1ccc(N2CCN(C(=O)Oc3c...,Cc1ccc(N2CCN(C(=O)OCCC3CCN(C)CC3)CC2)cc1,CN1CCC(CCOC(=O)N2CCN(c3ccc(C)cc3)CC2)CC1,Cc1ccc(N2CCN(C(=O)OCCC3CCN(C)CC3)CC2)cc1.Cc1cc...,Cc1ccc(N2CCN(C(=O)OCCC3CCN(C)CC3)CC2)cc1.Cc1cc...,Cc1ccc(N2CCN(C(=O)CCC3CCN(C)CC3)CC2)cc1,Cc1ccc(N2CCN(C(=O)OCCC3CCN(C)CC3)CC2)cc1.Cc1cc...,CN1CCC(CCOC(=O)N2CCN(c3ccc(C)cc3)CC2)CC1.Cc1cc...,Cc1ccc(N2CCN(C(=O)NCCC3CCN(C)CC3)CC2)cc1,Cc1ccc(N2CCN(C(=O)OCC3CCN(C)CC3)CC2)cc1,...,-0.257414,-0.258352,-0.265043,-0.265205,-0.265414,-0.275489,-0.275925,-0.282900,-0.283351,-0.000132
3,REACTANT:CC(C)(C)OC(=O)N1CCC(COC(=O)C2CCC3CN2C...,O=C(OCC1CCNCC1)C1CCC2CN1C(=O)N2OS(=O)(=O)O,CC(C)(C)OC(=O)N1CCC(COC(=O)C2CCC3CN2C(=O)N3OS(...,O=C(C1CCC2CN1C(=O)N2OS(=O)(=O)O)OCC1CCNCC1,CC(C)(C)OC(=O)N1CCC(COC(=O)C2CCC3CN2C(=O)N3O)CC1,CC(C)(C)OC(=O)N1CCC(CO)CC1C(=O)C1CCC2CN1C(=O)N...,O=C1C2CCC(C(=O)OCC3CCNCC3)N2C(=O)N1OS(=O)(=O)O,CCN1CCC(COC(=O)C2CCC3CN2C(=O)N3OS(=O)(=O)O)CC1,CN1CCC(COC(=O)C2CCC3CN2C(=O)N3OS(=O)(=O)O)CC1,O=C(O)C1CCC2CN1C(=O)N2OS(=O)(=O)O,...,-0.152735,-0.159872,-0.163130,-0.166870,-0.169046,-0.169374,-0.176924,-0.185469,-0.195297,-0.001599
4,REACTANT:CCC12CCC3C4CCC(=O)C=C4CCC3C1C(O)CC2=O...,CCC12CCC3C4CCC(=O)C=C4CCC3C1C(OC(=O)c1ccccc1)C...,CCC12CCC(C3CCC(=O)C=C4CCC3C1C(OC(=O)c1ccccc1)C...,CCC12CCC(C3CCC(=O)C=C4CCC3C1C(OC(=O)c1ccccc1)C...,CCC12CCC(C3CCC(=O)C=C4CCC3C1C(OC(=O)c3ccccc3)C...,CCC12CCC3C4CCC(=O)C=C4CCC3C1C(O)C(C(=O)c1ccccc...,CCC12CCC(C3CCC(=O)C=C4CCC3C1C(OC(=O)c1ccccc1)C...,CCC12CCC3C4CCC(=O)C=C4CCC3C1C(OC(=O)c1ccccc1)C...,CCC12CCC(C3CCC(=O)C=C4CCC3C1C(OC(=O)c1ccccc1)C...,CCC12CCC3C4CCC(=O)C=C4CCC3C1C(O)(C(=O)c1ccccc1...,...,-0.168645,-0.169499,-0.177384,-0.181207,-0.185081,-0.187224,-0.188434,-0.189308,-0.190461,-0.000154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,REACTANT:CCN1CC2OC2C1.CCOc1ccccc1OREAGENT:Cl.C...,CCOc1ccccc1OC1CN(CC)CC1O,CCOc1ccccc1C1CN(CC)CC1O,CCOc1ccccc1OC1CN(CC)CC2Oc3ccccc3OCC12,CCOc1ccccc1OC1CN(CC)CC2Oc3ccccc3OC12,CCOc1ccccc1OC1CN(CC)CC1N1CC2Oc3ccccc3OCC21,CCOc1ccccc1OC1CN(CC)CC1N1CC2Oc3ccccc3OC2C1,CCOc1ccccc1OC1CN(CC)CC1O.CCOc1ccccc1O,CCOc1ccccc1OC1CN(CC)CC1Oc1ccccc1OCC,CCOc1ccccc1C12CN(CC)CC1O2,...,-0.304359,-0.310612,-0.316779,-0.329301,-0.335336,-0.339174,-0.354038,-0.357701,-0.366088,-0.001590
29996,REACTANT:CCOC(=O)C(=Cc1cccc(OCCc2ccc(OS(C)(=O)...,CCOC(=O)C(Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1)OCC,CCOC(=O)C(O)Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1,CCOC(=O)Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1,CCOC(=O)C(OCC)Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1,CCOC(Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1)C(=O)O,CCOC(=O)C(Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1)...,CCOC(=Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1)C(=O)O,CCOC(=O)C(=O)Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1,COC(=O)C(=O)Cc1cccc(OCCc2ccc(OS(C)(=O)=O)cc2)c1,...,-0.192674,-0.194049,-0.207467,-0.211696,-0.231481,-0.234585,-0.237440,-0.238560,-0.240112,-0.000323
29997,REACTANT:CCCCC1CCNCC1.Cc1ccc(S(=O)(=O)OCC(C)Cn...,CCCCC1CCN(CC(C)Cn2c(=O)sc3ccccc32)CC1,CCCCC1CCN(CC(C)Cn2c(=O)sc3ccccc32)CC1.Cc1ccc(S...,CCCCC1CCN(CC(C)Cn2c(=O)sc3ccccc32)CC1.Cc1ccc(S...,CCCCC1CCNC(CC(C)Cn2c(=O)sc3ccccc32)C1,CCCCC1CCNCC1Cn1c(=O)sc2ccccc21,CCC1CCN(CC(C)Cn2c(=O)sc3ccccc32)CC1,CCCCC1CCN(C(C)Cn2c(=O)sc3ccccc32)CC1,CCCCC1CCN(CC(C)Cn2c(=O)sc3ccccc32)CC1.COc1ccc(...,CCCCCC1CCN(CC(C)Cn2c(=O)sc3ccccc32)CC1,...,-0.331291,-0.342171,-0.354060,-0.356503,-0.361395,-0.361925,-0.362513,-0.371238,-0.376130,-0.000073
29998,REACTANT:CC(C)I.COC(=O)c1cc(Br)cc2[nH]ccc12REA...,COC(=O)c1cc(Br)cc2c1ccn2C(C)C,COC(=O)c1cc(Br)cc2c1ccn2-c1cc(C(C)C)c2cc(Br)cc...,COC(=O)c1cc(Br)cc2c1ccn2-c1cc(C)c(C(=O)OC)c1,COC(=O)c1cc(Br)cc2c1ccn2-c1cc2c(C(=O)OC)cc(Br)...,COC(=O)c1cc(Br)cc2c1ccn2COC(C)C,COC(=O)c1cc(Br)cc2c1ccn2-c1cc[nH]c2cc(Br)cc(C(...,COC(=O)c1cc(Br)cc2c1ccn2Cc1C,COC(=O)c1cc(Br)cc2[nH]cc(C(C)C)c12,COC(=O)c1c(Br)cc2c(ccn2C(C)C)c1O,...,-0.348125,-0.351092,-0.359734,-0.363458,-0.363472,-0.363746,-0.367554,-0.378466,-0.381652,-0.000025


In [10]:
import os
import gc
import random
import itertools
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import tokenizers
import transformers
from transformers import AutoTokenizer, EncoderDecoderModel, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import datasets
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import sentencepiece
import argparse
from sklearn.model_selection import train_test_split
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()
import sys
sys.path.append('../')
from utils import seed_everything, canonicalize, space_clean

reaction_columns = ['REACTANT', 'PRODUCT', 'CATALYST', 'REAGENT', 'SOLVENT']
condition_columns = ['REACTANT', 'CATALYST', 'REAGENT', 'SOLVENT']

df = pd.read_csv('../../all_ord_reaction_uniq_with_attr_v3.csv')

# Keep only reactions with a recorded product, then mark every remaining
# missing component with a single space.
df = df[df['PRODUCT'].notna()]
for column in reaction_columns:
    df[column] = df[column].fillna(' ')

# Reactions without any reactant are unusable.
has_reactant = df['REACTANT'] != ' '
df = df[has_reactant]

# Remove exact duplicates first; afterwards keep only the first row for each
# reactant/catalyst/reagent/solvent combination.
df = df[reaction_columns].drop_duplicates().reset_index(drop=True)
df = df.drop_duplicates(subset=condition_columns).reset_index(drop=True)

# Merge catalyst, reagent and solvent into one dot-separated REAGENT string.
# `space_clean` and `canonicalize` are the versions imported from utils.
merged_reagents = df['CATALYST'] + '.' + df['REAGENT'] + '.' + df['SOLVENT']
merged_reagents = merged_reagents.apply(space_clean)
df['REAGENT'] = merged_reagents.apply(
    lambda smiles: ' ' if smiles == ' ' else canonicalize(smiles)
)

# Model input: tagged reactant string followed by the tagged reagent string.
df['input'] = 'REACTANT:' + df['REACTANT'] + 'REAGENT:' + df['REAGENT']

# Character length of every input, kept for the optional length filter below.
lens = df['input'].apply(len)
# df = df[lens <= 512]
df





Unnamed: 0,REACTANT,PRODUCT,CATALYST,REAGENT,SOLVENT,input
0,CC(=O)c1ccc(Br)cc1.O=C(O)C1CCCN1C(=O)OCc1ccccc1,CC(=O)c1ccc(C2CCCN2C(=O)OCc2ccccc2)cc1,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CN(C)C=O,REACTANT:CC(=O)c1ccc(Br)cc1.O=C(O)C1CCCN1C(=O)...
1,CC(=O)c1ccc(Br)cc1.CC(C)(C)OC(=O)N1CCCCC1C(=O)O,CC(=O)c1ccc(C2CCCN2C(=O)OCc2ccccc2)cc1,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CN(C)C=O,REACTANT:CC(=O)c1ccc(Br)cc1.CC(C)(C)OC(=O)N1CC...
2,CC(=O)c1ccc(Br)cc1.CC(C)(C)OC(=O)N1CCOCC1C(=O)O,CC(=O)c1ccc(C2CCCN2C(=O)OCc2ccccc2)cc1,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CN(C)C=O,REACTANT:CC(=O)c1ccc(Br)cc1.CC(C)(C)OC(=O)N1CC...
3,CC(=O)c1ccc(Br)cc1.CC(C)C(NC(=O)OC(C)(C)C)C(=O)O,CC(=O)c1ccc(C2CCCN2C(=O)OCc2ccccc2)cc1,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CN(C)C=O,REACTANT:CC(=O)c1ccc(Br)cc1.CC(C)C(NC(=O)OC(C)...
4,CC(=O)c1ccc(Br)cc1.CC(C)(C)OC(=O)NC(Cc1cn(C(=O...,CC(=O)c1ccc(C2CCCN2C(=O)OCc2ccccc2)cc1,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CC(C)(C)c1ccn2->[Ir+]34(<-n5cc(C(F)(F)F)ccc5-c...,CN(C)C=O,REACTANT:CC(=O)c1ccc(Br)cc1.CC(C)(C)OC(=O)NC(C...
...,...,...,...,...,...,...
1074308,CCOC(=O)/C=C(/C)NC(C(=O)O)c1ccccc1.CCOC(=O)Cl....,CC(O)CC(=O)[O-].O=C([O-])CCCO,,CC#N.CCN(CC)CC.CN(C)C=O.O,CC#N.CCN(CC)CC.CN(C)C=O.O,REACTANT:CCOC(=O)/C=C(/C)NC(C(=O)O)c1ccccc1.CC...
1074309,CO.ClCCl.NCCN1CCC(c2[nH]nc3cc(F)ccc23)CC1.O=C1...,CC(O)CC(=O)[O-].O=C([O-])CCCO,,CN(C)C=O,CN(C)C=O,REACTANT:CO.ClCCl.NCCN1CCC(c2[nH]nc3cc(F)ccc23...
1074310,CC#N.CN(C)CCCCl.Cl.Fc1ccc2c(C3CCNCC3)noc2c1.O=...,CC(O)CC(=O)[O-].O=C([O-])CCCO,CCCC[N+](CCCC)(CCCC)CCCC.O=S(=O)([O-])O,CCCC[N+](CCCC)(CCCC)CCCC.CCO.O.O=S(=O)([O-])O,CCO.O,REACTANT:CC#N.CN(C)CCCCl.Cl.Fc1ccc2c(C3CCNCC3)...
1074311,Fc1ccc2c(C3CCNCC3)noc2c1.O=C(CBr)N1CCc2ccccc21...,O=C(F)OCC(F)(F)F,,CC#N.CCO.ClCCl,CC#N.CCO.ClCCl,REACTANT:Fc1ccc2c(C3CCNCC3)noc2c1.O=C(CBr)N1CC...


In [None]:
# Build the model input by tagging and concatenating reactant and reagent strings.
# NOTE(review): these statements repeat the end of the preceding cell; they only
# read REACTANT/REAGENT and overwrite `input`, so re-running gives the same result.
df['input'] = 'REACTANT:' + df['REACTANT'] + 'REAGENT:' + df['REAGENT']


# Character length of each input string (used only by the commented-out filter).
lens = df['input'].apply(lambda x: len(x))
# df = df[lens <= 512]
df

In [None]:
# Optionally append the reconstructed reactions and shuffle the combined frame.
# getattr falls back to False when CFG does not declare the flag, instead of
# raising AttributeError.
if getattr(CFG, 'use_reconstructed_data', False):
    df2 = pd.read_csv('../data/reconstructed-nodata.csv')
    # A fixed random_state keeps the shuffle reproducible across kernel restarts.
    df = pd.concat([df, df2]).sample(frac=1, random_state=CFG.seed).reset_index(drop=True)
    
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_molecular_weight(smiles_string):
    """Return the molecular weight of the molecule described by a SMILES string.

    Args:
        smiles_string: SMILES representation of the molecule.

    Returns:
        The value computed by ``Descriptors.MolWt`` for the parsed molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smiles_string``.
    """
    # Convert the SMILES string to an RDKit molecule object
    molecule = Chem.MolFromSmiles(smiles_string)
    if molecule is None:
        # MolFromSmiles returns None on a parse failure; fail with a clear
        # message rather than an argument-type error inside MolWt.
        raise ValueError(f'Invalid SMILES: {smiles_string!r}')

    # Calculate and return the molecular weight
    return Descriptors.MolWt(molecule)

# Example usage: print the molecular weight of one known molecule as a quick check.
smiles_string = 'CC(=O)OC1=CC=CC=C1C(=O)O'  # This is the SMILES for Aspirin
print(f"The molecular weight of the molecule is: {calculate_molecular_weight(smiles_string)}")