In [1]:
import re
import pandas as pd
import ast

import json
from collections import OrderedDict

In [2]:
def expand_reactants(current_reactants, intermediates):
    expanded = set()
    for reactant in current_reactants:
        if reactant in intermediates:
            # Recursively expand reactant if it's an intermediate
            expanded.update(expand_reactants(intermediates[reactant], intermediates))
        else:
            expanded.add(reactant)
    return expanded

def make_intermediates_list(response):
    product_reactants = {}
    products_list = list(response["Product"].keys())
    intermediates = {}

    reaction_steps = set()
    work_up_invovle_chemicals = set()
    work_up_chemicals = set()

    reaction_label =[]
    work_up_label = []
    work_up_invovle_label = []

    if not isinstance(response, dict):
        raise TypeError("response should be a dictionary.")
    if not isinstance(products_list, list):
        raise TypeError("products_list should be a list.")
    
    # Process each reaction step
    for step, equation in response["Reaction Steps"].items():
        reactants, products = equation.split("->")
        reactants = set(reactants.split("+"))
        products = products.split("+")    
        if "Reaction" in step or "reaction" in step:
            reaction_label.append(int(step.split(' ')[0])-1)
            reaction_steps.update(reactants, products)
        elif "Work-up-involve" in step or "work-up-involve" in step:
            work_up_invovle_label.append(int(step.split(' ')[0])-1)
            work_up_invovle_chemicals.update(reactants, products)
        else:
            work_up_label.append(int(step.split(' ')[0])-1)
            work_up_chemicals.update(reactants, products)
        # Store intermediate products
        for product in products:
            if product in response["Product"]:
                # If the product is a final product, record the expanded reactants
                expanded_reactants = expand_reactants(reactants, intermediates)
                product_reactants[product] = expanded_reactants
            # Update or add new intermediates including current reactants
            intermediates[product] = reactants

    temp_intermediates_list = list(zip(range(len(intermediates)),intermediates.keys(),intermediates.values()))
    grouped = {}
    for index, name, sets in temp_intermediates_list:
        # Convert set to a frozenset for use as a dictionary key
        key = frozenset(sets)
        if key in grouped:
            grouped[key].append(name)
        else:
            grouped[key] = [name]

    intermediates_list = []
    for i, (sets, names) in enumerate(grouped.items()):
    # Join names with a dot if there are multiple names for the same set
        merged_name = '.'.join(sorted(names))
        intermediates_list.append((i, merged_name, sets))

    work_up_chemicals = work_up_chemicals - set(products_list)
    work_up_chemicals = work_up_chemicals - set(reaction_steps)
    return intermediates_list, products_list, work_up_chemicals, work_up_label, work_up_invovle_chemicals, work_up_invovle_label, reaction_label

In [3]:
def join_reaction(data):
    result = []
    for item in data:
        if isinstance(item, frozenset):
            # Convert set to sorted list and then to string
            sorted_items = sorted(item)
            filtered = []
            for temp in sorted_items:
                if 'mixture' not in temp:
                    filtered.append(temp)
            joined_string = '.'.join(filtered)
            result.append(joined_string)
        elif isinstance(item, str):
            # Directly append strings and symbols
            result.append(item)
    return ''.join(result)
        

In [4]:
def filter_intermediates_list(intermediates_list):
    # Create a new list to store updated data
    new_intermediates_list = []
    
    for step, intermediates, reactants in intermediates_list:
        # Create a new set for filtered reactants
        new_reactants_set = set()
        
        # Filter out reactants containing "mixture"
        for reactant in reactants:
            if 'mixture' not in reactant:
                new_reactants_set.add(reactant)

        # Append the updated tuple to the new list
        new_intermediates_list.append((step, intermediates, frozenset(new_reactants_set)))
    
    return new_intermediates_list

In [5]:
def segmentation(intermediates_list,products_list):
    breakpoints = set()
    for i, intermediates in enumerate(intermediates_list):
        if any(product in intermediates[1] for product in products_list):
            breakpoints.add(i)
                
    segments = []
    start = 0
    for point in sorted(breakpoints):
        segments.append(list(range(start,point+1)))
        start = point+1
    return segments

def process_reaction_data(data):
    try:
        intermediates_list, products_list, work_up_chemicals, work_up_label, work_up_invovle_chemicals, work_up_involve_label, reaction_label = make_intermediates_list(data)
        
        final_result = {}
    
        for steps_list in segmentation(intermediates_list, products_list):
            temp_reaction_equation = []
            product = intermediates_list[steps_list[-1]][1]
            final_result[product] = temp_reaction_equation
            for step in steps_list:
                if step in reaction_label and len(filter_intermediates_list(intermediates_list)[step][2]) != 0:
                    temp_reaction_equation.append(filter_intermediates_list(intermediates_list)[step][2])
                    temp_reaction_equation.append('>')
                elif step in work_up_involve_label and len(filter_intermediates_list(intermediates_list)[step][2]) != 0:
                        temp_reaction_equation.append('>')
                        temp_reaction_equation.append(filter_intermediates_list(intermediates_list)[step][2])
                        temp_reaction_equation.append('>')            
            final_result[product] = join_reaction(temp_reaction_equation)
        final_reaction_formulas = {}
        for product, rxn_eqn in final_result.items():
            if rxn_eqn.startswith('>'):
                rxn_eqn = rxn_eqn.lstrip('>')
            if rxn_eqn.endswith('>'):
                final_reaction_formulas[product] = (rxn_eqn+ product).replace('>>','>')
            if rxn_eqn.endswith(''):
                final_reaction_formulas[product] = (rxn_eqn+ '>'+ product).replace('>>','>')
        
        for product, rxn_eqn in list(final_reaction_formulas.items()):
             if rxn_eqn.startswith('>'):
                 del final_reaction_formulas[product]   
        return list(final_reaction_formulas.values())
    
    except Exception as e:
            error_message = "Error: "+ str(e) +';reactant from the previous step is stated as the product in the next step'
            print(error_message)
            return None

In [6]:
def replace_with_smiles(rxn_code, smiles_dict):
    # Split the fixed reaction code into parts separated by '>'
    parts = rxn_code.split('>')
    # Replace each code in the parts with the corresponding SMILES string
    replaced_parts = []
    for part in parts:
        codes = part.split('.')
        replaced_codes = [smiles_dict.get(code, code) for code in codes]  
        replaced_parts.append('.'.join(replaced_codes))
    replaced_rxn_code = '>'.join(replaced_parts)
    return replaced_rxn_code

def process_smiles_data(merged_json_responses, merged_smiles_dict):
    skeleton_smiles = []
    final_smiles = []
    skeleton_error_smiles = []
    
    for i, (json_response, smiles_dict) in enumerate(zip(merged_json_responses, merged_smiles_dict)):
        try:
            # Evaluate the string representation to convert back to dictionary or list
            if isinstance(json_response, str):
                json_response = ast.literal_eval(json_response.strip())
            elif not isinstance(json_response, dict):
                raise ValueError(f"json_response at index {i} is not a valid JSON string or dictionary.")

            if isinstance(smiles_dict, str):
                smiles_dict = ast.literal_eval(smiles_dict.strip())
            elif not isinstance(smiles_dict, dict):
                raise ValueError(f"smiles_dict at index {i} is not a valid JSON string or dictionary.")
            
            rxn_codes = process_reaction_data(json_response)
            skeleton_smiles.append(rxn_codes)
            fixed_rxn_codes = []
            if rxn_codes:
                for rxn_code in rxn_codes:
                    reaction_equation = replace_with_smiles(rxn_code, smiles_dict)
                    fixed_rxn_codes.append(reaction_equation)
            else:
                fixed_rxn_codes.append("Error: empty rxn_codes")
            final_smiles.append(fixed_rxn_codes)
        except Exception as e:
            print(f"Exception at index {i}: {e}")
            final_smiles.append(f"{i} Error: empty rxn_codes")
            skeleton_smiles.append(f"{i} Error: empty rxn_codes")
            skeleton_error_smiles.append(f"{i} Error: empty rxn_codes")

    # Return a tuple of all collected lists
    return skeleton_smiles, final_smiles, skeleton_error_smiles


## Run

In [8]:
result_df = pd.read_csv('GPT_response_with_smiles.csv', encoding='utf-8-sig')
configs = [('GPT_finetuned_five', 'GPT_finetuned_five_smiles')]
for model, smiles_col in configs:
    responses = result_df[model].tolist()
    smiles = result_df[smiles_col].tolist()
    skeleton_smiles, final_smiles, skeleton_error_smiles = process_smiles_data(responses, smiles)

    # Store results in the DataFrame
    result_df[f'{model}_skeleton'] = skeleton_smiles
    result_df[f'{model}_rxn'] = final_smiles
    result_df[f'{model}_smiles'] = smiles

Error: maximum recursion depth exceeded;reactant from the previous step is stated as the product in the next step
Exception at index 317: unterminated string literal (detected at line 103) (<unknown>, line 103)
Error: too many values to unpack (expected 2);reactant from the previous step is stated as the product in the next step


In [9]:
result_df.head(10)

Unnamed: 0,Issue,title,paragraph,GPT_finetuned_five,GPT_finetuned_five_smiles,GPT_finetuned_five_skeleton,GPT_finetuned_five_rxn
0,['many_product (> 1)'],(+)-tartrate,"(±)-Ethyl nipecotate (70.16 g, 71ml) and (+)-t...","{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...","{'A': 'CCOC(=O)C1CCCNC1', 'B': 'C(C(C(=O)O)O)(...",[A.B.C>D],[CCOC(=O)C1CCCNC1.C(C(C(=O)O)O)(C(=O)O)O.CCO>[...
1,['many_product (> 1)'],2-{[[4-(3-Chloropropylthio)-3-methoxy-2-pyridi...,2-Mercapto-1H-benzimidazole (10 g) and 2-chlor...,"{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...","{'A': 'C1=CC=C2C(=C1)NC(=S)N2', 'B': 'COC1=C(C...",[A.B.C.D>E],[C1=CC=C2C(=C1)NC(=S)N2.COC1=C(C=CN=C1CCl)SCCC...
2,['many_product (> 1)'],"4-(3,4-dichloroanilino)-6-fluoroquinazoline hy...",A mixture of 4-chloro-6-fluoroquinazoline (2 g...,"{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...","{'A': 'C1=CC2=C(C=C1F)C(=NC=N2)Cl', 'B': 'C1=C...",[A.B.C>E],[C1=CC2=C(C=C1F)C(=NC=N2)Cl.C1=CC(=C(C=C1N)Cl)...
3,['many_product (> 1)'],"2-[4-[4-[4-[4-[[2-(2,4-difluorophenyl)-2-(1H-1...",Compound 23 (0.00359 mol) was dissolved in 2-p...,"{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...",{'A': 'CCC(C)N[C@@H](CC1=CC=CC=C1N2C(=O)N(C=N2...,[A.B>B.C>D],[CCC(C)N[C@@H](CC1=CC=CC=C1N2C(=O)N(C=N2)C3=CC...
4,['many_product (> 1)'],"(+)-(5)6-Methoxy-2-[[(4-methoxy-3,5-dimethyl-2...",To a stirring suspension of 650 mg (1.89 mmol)...,"{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...",{'A': 'CC1=CN=C(C(=C1OC)C)C[S@](=O)C2=NC3=C(N2...,[A.B.C.D>E>G],[CC1=CN=C(C(=C1OC)C)C[S@](=O)C2=NC3=C(N2)C=C(C...
5,['many_product (> 1)'],o-Bromobenzoic Acid Potassium Salt,To a solution of o-bromobenzoic acid (201.03 g...,"{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...","{'A': 'C1=CC=C(C(=C1)C(=O)O)Br', 'B': 'CO', 'C...",[A.B.C>D],[C1=CC=C(C(=C1)C(=O)O)Br.CO.C(=O)([O-])[O-].[K...
6,['many_product (> 1)'],"5,6-dihydro-4H-1,3a,6-triaza-as-indacene-8-car...","A solution of 5,6-dihydro-4H-1,3a,6-triaza-as-...","{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...","{'A': 'COC(=O)C1=CNC2=C1C3=NC=CN3CC2', 'B': 'B...",[A.B>D],[COC(=O)C1=CNC2=C1C3=NC=CN3CC2.Br>C1CN2C=CN=C2...
7,['many_product (> 1)'],"2-Amino-1-(2,4-difluorophenyl)-ethanol-hydroch...","N-[2-(2,4-difluorophenyl)-2-hydroxy-ethyl]-for...","{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...","{'A': 'C1=CC(=C(C=C1F)F)C(CNC=O)O', 'B': 'CO',...",[A.B.C>E],[C1=CC(=C(C=C1F)F)C(CNC=O)O.CO.Cl>C1=CC(=C(C=C...
8,['many_product (> 1)'],magnesium valproate,"Surprisingly, the inventors have discovered th...","{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...",{'A': 'CCCC(CCC)C(=O)[O-].CCCC(CCC)C(=O)[O-].[...,[A.B>C],[CCCC(CCC)C(=O)[O-].CCCC(CCC)C(=O)[O-].[Mg+2]....
9,['many_product (> 1)'],Biphenyl-2-ylcarbamic Acid 1-{9-[(R)-2-Hydroxy...,Biphenyl-2-ylcarbamic acid 1-{9-[(R)-2-hydroxy...,"{\n ""Reactants, Solvents, Catalysts"": {\n ""A""...",{'A': 'C1CN(CCC1OC(=O)NC2=CC=CC=C2C3=CC=CC=C3)...,[A.B.C>D],[C1CN(CCC1OC(=O)NC2=CC=CC=C2C3=CC=CC=C3)CCCCCC...


In [None]:
# Save results to a CSV
result_df.to_csv('GPT_reaction_smiles.csv', encoding='utf-8-sig', index=True)