In [6]:
import pandas as pd
import numpy as np
from llm_qa import join_string, process_messy_list_string

# # === Prepare the evaluation data ===
df = pd.read_csv('./new_labels/data_splits.csv', dtype=str)
df['options'] = df['options'].apply(lambda x: process_messy_list_string(x, sep=','))
df['smiles_option'] = df['smiles_option'].apply(lambda x: process_messy_list_string(x, sep=' '))
df['correct'] = df['correct'].astype(int)
df['smiles_correct'] = df['smiles_correct'].apply(lambda x: int(float(x)))

# select correct from d['options'] using d['correct'] as index
df['ans1'] = pd.NA
df['ans2'] = pd.NA
for i in range(len(df)):
    if len(df['options'][i]) > df['correct'][i]-1:
        # df['ans1'][i] = df['options'][i][df['correct'][i]-1]
        # change to use loc
        df.loc[i, 'ans1'] = df['options'][i][df['correct'][i]-1]
    if len(df['smiles_option'][i]) > df['smiles_correct'][i]-1:
        # df['ans2'][i] = df['smiles_option'][i][df['smiles_correct'][i]-1]
        df.loc[i, 'ans2'] = df['smiles_option'][i][df['smiles_correct'][i]-1]
print("failed text qa", df['ans1'].isna().sum(), "failed smiles qa", df['ans2'].isna().sum())

df['text1'] = "Q: You are given a SMILES string of a molecule, a question about the molecule and a set of candidate options. Pick the best option.\n"+\
    "SMILES_string:"+df['SMILES']+"\n"+\
    "Question:"+df['question']+'\n'+\
    "Options:"+df['options'].apply(lambda x: join_string(x))
df['text2'] = "Q: You are given a sentence describing a molecule. Choose the SMILES string that best describes the SMILES string.\n"+\
    "Sentence:"+df['sentence']+'\n'+\
    "Options:"+df['smiles_option'].apply(lambda x: join_string(x))
df2 = pd.DataFrame(np.concatenate([df['text1'] + df['ans1'].astype(str), df['text2'] + df['ans2'].astype(str)]), columns=['sentence'])
df2['split'] = np.concatenate([df['split'], df['split']])
df2['ans'] = np.concatenate([df['ans1'], df['ans2']])
df2['correct'] = np.concatenate([df['correct'], df['smiles_correct']])
df2['options'] = np.concatenate([df['options'], df['smiles_option']])
df2 = df2[df2['split']=='test']

failed text qa 3 failed smiles qa 0


In [9]:
import json
import os
import editdistance
from rouge_score import rouge_scorer

def extract_answer(text):
    ans = text.split('\n\nAnswer:')[-1].split('</s>')[0]
    return ans

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

print("model, finetuned, accuracy")
all_y_pred = []
models = ['galactica-125m', 'galactica-1.3b', 'galactica-6.7b', 'molt5-large','molt5-large-caption2smiles','molt5-large-smiles2caption']
for model in models:
    for load_finetuned in [False, True]:
        path = f'./outputs1/{model}_load_finetuned={load_finetuned}.json'
        with open(path) as f:
            data = json.load(f)

        # print(model, load_finetuned, len(data))
        answers = [extract_answer(x) for x in data]
        num_errors = [i for i in range(len(answers)) if answers[i]=='ERROR']
        # print("num_errors", len(num_errors))
        # print("answer", answers[:10])
        if model == 'molt5-large-caption2smiles':
            answers = ([""] * (len(df2)//2)) + answers # add empty string to the first half, since we are only interested in the second half
        if model == 'molt5-large-smiles2caption':
            answers = answers + ([""] * (len(df2)//2))

        y_pred = []
        y_true = []
        # first half is smiles to text, second half is text to smiles
        for i in range(0, len(df2)//2):
            options = df2['options'].values[i]
            rogue_score_pred = [scorer.score(option, answers[i])['rouge1'].fmeasure for option in options]
            max_rouge_score = np.argmax(rogue_score_pred)
            y_pred.append(max_rouge_score)
            y_true.append(df2['correct'].values[i]-1)
        for i in range(len(df2)//2, len(df2)):
            options = df2['options'].values[i]
            editdistance_pred = [editdistance.eval(option, answers[i]) for option in options] 
            min_editdistance = np.argmin(editdistance_pred)
            y_pred.append(min_editdistance)
            y_true.append(df2['correct'].values[i]-1)

        all_y_pred.append(y_pred)
        print(f"{model}, {load_finetuned}")

# === Save the predictions ===
model_names = [model + "_finetuned=" + str(load_finetuned) for model in models for load_finetuned in [True]]
for i, model_name in enumerate(model_names):
    df2[model_name] = all_y_pred[i]
df2.iloc[:len(df2)//2].to_csv('forward_predictions.csv', index=False)
df2.iloc[len(df2)//2:].to_csv('reverse_predictions.csv', index=False)

model, finetuned, accuracy
galactica-125m, True
galactica-1.3b, True
galactica-6.7b, True
molt5-large, True
molt5-large-caption2smiles, True
molt5-large-smiles2caption, True


In [11]:
import pandas as pd
forward_preds = pd.read_csv('forward_predictions.csv')
reverse_preds = pd.read_csv('reverse_predictions.csv')
print(forward_preds.shape, reverse_preds.shape)
print("Model, Forward Accuracy, Reverse Accuracy")
# for mname in ['galactica-125m_finetuned=False', 'galactica-125m_finetuned=True', 'galactica-1.3b_finetuned=False', 'galactica-1.3b_finetuned=True', 'galactica-6.7b_finetuned=False', 'galactica-6.7b_finetuned=True', 'molt5-large_finetuned=False', 'molt5-large_finetuned=True', 'molt5-large-caption2smiles_finetuned=False', 'molt5-large-caption2smiles_finetuned=True', 'molt5-large-smiles2caption_finetuned=False', 'molt5-large-smiles2caption_finetuned=True']:
for mname in ['galactica-125m_finetuned=True', 'galactica-1.3b_finetuned=True', 'galactica-6.7b_finetuned=True', 'molt5-large_finetuned=True', 'molt5-large-caption2smiles_finetuned=True', 'molt5-large-smiles2caption_finetuned=True']:
    print(mname, ',',
            np.mean(forward_preds[mname] == forward_preds['correct']-1), ',',
            np.mean(reverse_preds[mname] == reverse_preds['correct']-1))

(12092, 11) (12092, 11)
Model, Forward Accuracy, Reverse Accuracy
galactica-125m_finetuned=True , 0.4397122064174661 , 0.2161759841217334
galactica-1.3b_finetuned=True , 0.6098246774727092 , 0.22171683757856434
galactica-6.7b_finetuned=True , 0.6901257029440953 , 0.2230400264637777
molt5-large_finetuned=True , 0.3414654316903738 , 0.23544492226265298
molt5-large-caption2smiles_finetuned=True , 0.14207740654978498 , 0.22998676811114788
molt5-large-smiles2caption_finetuned=True , 0.34692358584187893 , 0.23858749586503472


In [67]:
# from transformers import AutoTokenizer
# from peft import LoraConfig, get_peft_model, TaskType, AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("laituan245/molt5-large-caption2smiles")
# model = AutoPeftModelForSeq2SeqLM.from_pretrained("./finetuned_models1/molt5-large/")

In [2]:
# import pandas as pd
# # import ast

# finetune_d = pd.read_csv("/srv/local/data/chufan2/molqa/galactica_finetuning.csv", dtype=str)
# finetune_d['sentence'] = "Q: "+finetune_d['Q'] + "\n\nAnswer: " + finetune_d['A']
# finetune_d['sentence'].values
# # print(finetune_d.columns)  #Index(['Unnamed: 0', 'CID', 'smiles', 'Q', 'A', 'type', 'sentence'], dtype='object')

# # '''
# # Q: You are given a SMILES string of a molecule, a question about the molecule and a set of candidate options. Pick the best option.
# # SMILES_string:[START_I_SMILES]CC(=O)OC(C)=O[END_I_SMILES]
# # Question:What is the functional group present in the molecule?
# # Options:(1)Aldehyde
# # (2)Ketone
# # (3)Carboxylic acid
# # (4)Amide
# # (5)Ester

# # Answer:
# # '''
# # '''
# # Q: You are given a sentence describing a molecule. Choose the SMILES string that best describes the SMILES string.
# # Sentence:The functional group present in the molecule is <Carboxylic acid>
# # Options:(1)C=C(C)[C@H]1Oc2ccc([C@H](C)O)cc2[C@@H]1O
# # (2)OCCOCCOCCO
# # (3)CCCCC/C=C\C/C=C\C/C=C\CC1OC1CCCC(=O)SCCNC(=O)CCNC(=O)[C@H](O)C(C)(C)COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H](n2cnc3c(N)ncnc32)[C@H](O)[C@@H]1OP(=O)([O-])[O-]
# # (4)CC(=O)OC(C)=O
# # (5)CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=C(\C)[C@H]2C=C3C(C)(C)C[C@H](O)C[C@@]3(C)O2)C(C)(C)C[C@H](O)C1

# # Answer: (4)[START_I_SMILES]CC(=O)OC(C)=O[END_I_SMILES]
# # '''

# eval_d = pd.read_csv("/srv/local/data/chufan2/molqa/goldstandard_galactica_validated.csv", dtype=str)
# eval_d['options'] = eval_d['options'].apply(lambda x: eval_string(x))
# eval_d['smiles_options'] = eval_d['smiles_options'].apply(lambda x: eval_smiles(x))
# eval_d['text1'] = "Q: You are given a SMILES string of a molecule, a question about the molecule and a set of candidate options. Pick the best option.\n"+\
#     "SMILES_string:[START_I_SMILES]"+eval_d['smiles']+"[END_I_SMILES]\n"+\
#     "Question:"+eval_d['question']+'\n'+\
#     "Options:"+eval_d['options']+'\n\nAnswer:'
# eval_d['text2'] = "Q: You are given a sentence describing a molecule. Choose the SMILES string that best describes the SMILES string.\n"+\
#     "Sentence:"+eval_d['sentence']+'\n'+\
#     "Options:"+eval_d['smiles_options']+'\n\nAnswer:'