In [1]:
import re
import pandas as pd
import ast

import json
from collections import OrderedDict

In [3]:
def expand_reactants(current_reactants, intermediates):
    expanded = set()
    for reactant in current_reactants:
        if reactant in intermediates:
            # Recursively expand reactant if it's an intermediate
            expanded.update(expand_reactants(intermediates[reactant], intermediates))
        else:
            expanded.add(reactant)
    return expanded

def make_intermediates_list(response):
    product_reactants = {}
    products_list = list(response["Product"].keys())
    intermediates = {}

    reaction_steps = set()
    work_up_invovle_chemicals = set()
    work_up_chemicals = set()

    reaction_label =[]
    work_up_label = []
    work_up_invovle_label = []

    if not isinstance(response, dict):
        raise TypeError("response should be a dictionary.")
    if not isinstance(products_list, list):
        raise TypeError("products_list should be a list.")
    
    # Process each reaction step
    for step, equation in response["Reaction Steps"].items():
        reactants, products = equation.split("->")
        reactants = set(reactants.split("+"))
        products = products.split("+")    
        if "Reaction" in step or "reaction" in step:
            reaction_label.append(int(step.split(' ')[0])-1)
            reaction_steps.update(reactants, products)
        elif "Work-up-involve" in step or "work-up-involve" in step:
            work_up_invovle_label.append(int(step.split(' ')[0])-1)
            work_up_invovle_chemicals.update(reactants, products)
        else:
            work_up_label.append(int(step.split(' ')[0])-1)
            work_up_chemicals.update(reactants, products)
        # Store intermediate products
        for product in products:
            if product in response["Product"]:
                # If the product is a final product, record the expanded reactants
                expanded_reactants = expand_reactants(reactants, intermediates)
                product_reactants[product] = expanded_reactants
            # Update or add new intermediates including current reactants
            intermediates[product] = reactants

    temp_intermediates_list = list(zip(range(len(intermediates)),intermediates.keys(),intermediates.values()))
    grouped = {}
    for index, name, sets in temp_intermediates_list:
        # Convert set to a frozenset for use as a dictionary key
        key = frozenset(sets)
        if key in grouped:
            grouped[key].append(name)
        else:
            grouped[key] = [name]

    intermediates_list = []
    for i, (sets, names) in enumerate(grouped.items()):
    # Join names with a dot if there are multiple names for the same set
        merged_name = '.'.join(sorted(names))
        intermediates_list.append((i, merged_name, sets))

    work_up_chemicals = work_up_chemicals - set(products_list)
    work_up_chemicals = work_up_chemicals - set(reaction_steps)
    return intermediates_list, products_list, work_up_chemicals, work_up_label, work_up_invovle_chemicals, work_up_invovle_label, reaction_label

In [4]:
def join_reaction(data):
    result = []
    for item in data:
        if isinstance(item, frozenset):
            # Convert set to sorted list and then to string
            sorted_items = sorted(item)
            filtered = []
            for temp in sorted_items:
                if 'mixture' not in temp:
                    filtered.append(temp)
            joined_string = '.'.join(filtered)
            result.append(joined_string)
        elif isinstance(item, str):
            # Directly append strings and symbols
            result.append(item)
    return ''.join(result)
        

In [5]:
def filter_intermediates_list(intermediates_list):
    # Create a new list to store updated data
    new_intermediates_list = []
    
    for step, intermediates, reactants in intermediates_list:
        # Create a new set for filtered reactants
        new_reactants_set = set()
        
        # Filter out reactants containing "mixture"
        for reactant in reactants:
            if 'mixture' not in reactant:
                new_reactants_set.add(reactant)

        # Append the updated tuple to the new list
        new_intermediates_list.append((step, intermediates, frozenset(new_reactants_set)))
    
    return new_intermediates_list

In [6]:
def segmentation(intermediates_list,products_list):
    breakpoints = set()
    for i, intermediates in enumerate(intermediates_list):
        if any(product in intermediates[1] for product in products_list):
            breakpoints.add(i)
                
    segments = []
    start = 0
    for point in sorted(breakpoints):
        segments.append(list(range(start,point+1)))
        start = point+1
    return segments

def process_reaction_data(data):
    try:
        intermediates_list, products_list, work_up_chemicals, work_up_label, work_up_invovle_chemicals, work_up_involve_label, reaction_label = make_intermediates_list(data)
        
        final_result = {}
    
        for steps_list in segmentation(intermediates_list, products_list):
            temp_reaction_equation = []
            product = intermediates_list[steps_list[-1]][1]
            final_result[product] = temp_reaction_equation
            for step in steps_list:
                if step in reaction_label and len(filter_intermediates_list(intermediates_list)[step][2]) != 0:
                    temp_reaction_equation.append(filter_intermediates_list(intermediates_list)[step][2])
                    temp_reaction_equation.append('>')
                elif step in work_up_involve_label and len(filter_intermediates_list(intermediates_list)[step][2]) != 0:
                        temp_reaction_equation.append('>')
                        temp_reaction_equation.append(filter_intermediates_list(intermediates_list)[step][2])
                        temp_reaction_equation.append('>')            
            final_result[product] = join_reaction(temp_reaction_equation)
        final_reaction_formulas = {}
        for product, rxn_eqn in final_result.items():
            if rxn_eqn.startswith('>'):
                rxn_eqn = rxn_eqn.lstrip('>')
            if rxn_eqn.endswith('>'):
                final_reaction_formulas[product] = (rxn_eqn+ product).replace('>>','>')
            if rxn_eqn.endswith(''):
                final_reaction_formulas[product] = (rxn_eqn+ '>'+ product).replace('>>','>')
        
        for product, rxn_eqn in list(final_reaction_formulas.items()):
             if rxn_eqn.startswith('>'):
                 del final_reaction_formulas[product]   
        return list(final_reaction_formulas.values())
    
    except Exception as e:
            error_message = "Error: "+ str(e) +';reactant from the previous step is stated as the product in the next step'
            print(error_message)
            return None

In [7]:
def replace_with_smiles(rxn_code, smiles_dict):
    # Split the fixed reaction code into parts separated by '>'
    parts = rxn_code.split('>')
    # Replace each code in the parts with the corresponding SMILES string
    replaced_parts = []
    for part in parts:
        codes = part.split('.')
        replaced_codes = [smiles_dict.get(code, code) for code in codes]  
        replaced_parts.append('.'.join(replaced_codes))
    replaced_rxn_code = '>'.join(replaced_parts)
    return replaced_rxn_code

def process_smiles_data(merged_json_responses, merged_smiles_dict):
    skeleton_smiles = []
    final_smiles = []
    skeleton_error_smiles = []
    
    for i, (json_response, smiles_dict) in enumerate(zip(merged_json_responses, merged_smiles_dict)):
        try:
            # Evaluate the string representation to convert back to dictionary or list
            if isinstance(json_response, str):
                json_response = ast.literal_eval(json_response.strip())
            elif not isinstance(json_response, dict):
                raise ValueError(f"json_response at index {i} is not a valid JSON string or dictionary.")

            if isinstance(smiles_dict, str):
                smiles_dict = ast.literal_eval(smiles_dict.strip())
            elif not isinstance(smiles_dict, dict):
                raise ValueError(f"smiles_dict at index {i} is not a valid JSON string or dictionary.")
            
            rxn_codes = process_reaction_data(json_response)
            skeleton_smiles.append(rxn_codes)
            fixed_rxn_codes = []
            if rxn_codes:
                for rxn_code in rxn_codes:
                    reaction_equation = replace_with_smiles(rxn_code, smiles_dict)
                    fixed_rxn_codes.append(reaction_equation)
            else:
                fixed_rxn_codes.append("Error: empty rxn_codes")
            final_smiles.append(fixed_rxn_codes)
        except Exception as e:
            print(f"Exception at index {i}: {e}")
            final_smiles.append(f"{i} Error: empty rxn_codes")
            skeleton_smiles.append(f"{i} Error: empty rxn_codes")
            skeleton_error_smiles.append(f"{i} Error: empty rxn_codes")

    # Return a tuple of all collected lists
    return skeleton_smiles, final_smiles, skeleton_error_smiles


## Run

In [9]:
filtered_result = pd.read_csv('GPT_response.csv', encoding='utf-8-sig')
filtered_result = filtered_result.applymap(lambda x: x.encode('utf-8').decode('raw_unicode_escape') if isinstance(x, str) else x)

result_df = filtered_result[['Issue', 'title', 'paragraph', 'Lowe_smiles', 'GPT_finetuned_five','GPT_finetuned_five_smiles']].copy()
result_df['GPT_finetuned_five'] = result_df['GPT_finetuned_five'].apply(
    lambda x: json.dumps(json.loads(x), indent=2) if isinstance(x, str) else x)

configs = [('GPT_finetuned_five', 'GPT_finetuned_five_smiles')]
for model, smiles_col in configs:
    responses = filtered_result[model].tolist()
    smiles = filtered_result[smiles_col].tolist()
    skeleton_smiles, final_smiles, skeleton_error_smiles = process_smiles_data(responses, smiles)

    # Store results in the DataFrame
    result_df[f'{model}_skeleton'] = skeleton_smiles
    result_df[f'{model}_rxn'] = final_smiles
    result_df[f'{model}_smiles'] = smiles

  filtered_result = filtered_result.applymap(lambda x: x.encode('utf-8').decode('raw_unicode_escape') if isinstance(x, str) else x)


In [11]:
result_df.head(10)

Unnamed: 0,Issue,title,paragraph,Lowe_smiles,GPT_finetuned_4o_five,GPT_finetuned_4o_five_smiles,GPT_finetuned_4o_five_skeleton,GPT_finetuned_4o_five_rxn
0,['many_reagent (> 5)'],1-(p-fluorophenyl)-4-[4-(p-chlorophenyl)-4-hyd...,To a mixture of 2.0 g of lithium aluminum hydr...,[H-].[Al+3].[Li+].[H-].[H-].[H-].[F:7][C:8]1[C...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': '[AlH4-].[Li+]', 'B': 'O1CCCC1', 'C': 'F...",[A.B.C>D>E],[[AlH4-].[Li+].O1CCCC1.FC1=CC=C(C(=O)CCC(=O)N2...
1,['many_reagent (> 5)'],Î³-(4-hydroxy-4-p-methylphenylpiperidino)-p-fl...,One gram of chromium trioxide was added with s...,N1C=CC=CC=1.[F:7][C:8]1[CH:13]=[CH:12][C:11]([...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': '[O-2].[O-2].[O-2].[Cr+6]', 'B': 'N1=CC=...",[A.B>C>D>H],[[O-2].[O-2].[O-2].[Cr+6].N1=CC=CC=C1>FC1=CC=C...
2,['many_reagent (> 5)'],1-(p-fluorophenyl)-4-[4-(2-oxo-1-benzimidazoli...,To a stirred mixture of 2.0 g of lithium alumi...,[H-].[Al+3].[Li+].[H-].[H-].[H-].O1CCCC1.[F:12...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': '[AlH4-].[Li+]', 'B': 'O1CCCC1', 'C': 'F...",[A.B.C>D>E.F>H],[[AlH4-].[Li+].O1CCCC1.FC1=CC=C(C(=O)CCC(=O)N2...
3,['many_reagent (> 5)'],trans N-(2-ETHYLAMINOCYCLOHEXYL)-p-TOLUENESULF...,Twenty-three grams of trans-N-(2-ACETAMIDOCYCL...,[C:1]([NH:4][C@@H:5]1[CH2:10][CH2:9][CH2:8][CH...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...",{'A': 'C(C)(=O)N[C@H]1[C@@H](CCCC1)NS(=O)(=O)C...,[A.B.C>D>C>G],[C(C)(=O)N[C@H]1[C@@H](CCCC1)NS(=O)(=O)C1=CC=C...
4,['many_reagent (> 5)'],"Preparation of 4-hydroxy-4-(Î±,Î±,Î±-trifluoro...",A mixture of 10.0 g. (0.03 mol) of 1-benzyl-4-...,C([N:8]1[CH2:13][CH2:12][C:11]([OH:24])([C:14]...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...",{'A': 'C(C1=CC=CC=C1)N1CCC(CC1)(C1=CC=C(C=C1)C...,[A.B.C.D.E>F>G>H],[C(C1=CC=CC=C1)N1CCC(CC1)(C1=CC=C(C=C1)C(F)(F)...
5,['many_reagent (> 5)'],methyl 3(RS)-hydroxy-2-(3(RS)-[tetrahydropyran...,A solution of 1.47 parts of 3(RS)-(tetrahydrop...,[O:1]1[CH2:6][CH2:5][CH2:4][CH2:3][CH:2]1[O:7]...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': 'O1C(CCCC1)OC(C#C)CCCCC', 'B': 'CCOCC', ...",[A.B.C.D>E>F>G.H>L],[O1C(CCCC1)OC(C#C)CCCCC.CCOCC.C(CCC)[Li].CCCCC...
6,['many_reagent (> 5)'],"7-ethoxy-3,7-dimethyloctan-1-al","A mixture of 1.9 of 7-ethoxy-3,7-dimethyloctan...",[CH2:1]([O:3][C:4]([CH3:14])([CH3:13])[CH2:5][...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': 'C(C)OC(CCCC(CCO)C)(C)C', 'B': 'N1=CC=CC...",[A.B.C>D>K],[C(C)OC(CCCC(CCO)C)(C)C.N1=CC=CC=C1.[O-2].[O-2...
7,['many_reagent (> 5)'],"4,8-dimethylnon-7-en-2-one","A solution of 47 g. of 4,8-dimethylnon-7-en-2-...",[CH3:1][CH:2]([CH2:7][CH2:8][CH:9]=[C:10]([CH3...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': 'CC(CC(C)O)CCC=C(C)C', 'B': 'C(Cl)Cl', '...",[A.B.C.D>D.E>I],[CC(CC(C)O)CCC=C(C)C.C(Cl)Cl.[Cr](=O)(=O)([O-]...
8,['many_reagent (> 5)'],"3,7,11-trimethyldodeca-2,4,10-trien-1-ol","To a solution of 2 g. of methyl 3,7,11-trimeth...",[CH3:1][C:2]([CH:8]=[CH:9][CH2:10][CH:11]([CH3...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': 'CC(=CC(=O)OC)C=CCC(CCC=C(C)C)C', 'B': '...",[A.B.C>D>G],[CC(=CC(=O)OC)C=CCC(CCC=C(C)C)C.CCOCC.[AlH4-]....
9,['many_reagent (> 5)'],"11-methoxy-3,7,11-trimethyldodeca-2,4-dien-1-ol","To a solution of 6.2 g. of ethyl 11-methoxy-3,...",[CH3:1][O:2][C:3]([CH3:21])([CH3:20])[CH2:4][C...,"{\n ""Reactants, Solvents, Catalysts"": {\n ...","{'A': 'COC(CCCC(CC=CC(=CC(=O)OCC)C)C)(C)C', 'B...",[A.B.C>D.E>H],[COC(CCCC(CC=CC(=CC(=O)OCC)C)C)(C)C.CCOCC.[AlH...


In [None]:
# Save results to a CSV
result_df.to_csv('GPT_reaction_smiles.csv', encoding='utf-8-sig', index=True)