In [38]:
import os
import pandas as pd
from tqdm import tqdm

import asyncio
import nest_asyncio
import aiohttp

import logging
import pickle
import threading

import importlib
import file_io_utils, smiles_fetch_utils, chemspider_utils, reaction_smiles_processing_utils
importlib.reload(file_io_utils)
importlib.reload(smiles_fetch_utils)
importlib.reload(reaction_smiles_processing_utils)

from smiles_fetch_utils import process_batch, reprocess_no_smi
from file_io_utils import save_smiles_dict, monitor_log, ensure_directory, add_smiles_dict


from reaction_smiles_processing_utils import process_smiles_data


In [41]:
# --- Directories -----------------------------------------------------------
# `output_directory` receives the per-batch SMILES dictionaries written in Step 1;
# `result_directory` receives the final CSV files.
result_directory = "result"
output_directory = "smiles_batches"
ensure_directory(result_directory)

# --- Input: GPT responses and the column to process ------------------------
GPT_response_path = "./data/GPT_response.csv"
GPT_response_column = "GPT_finetuned_five"

# --- Output CSVs -----------------------------------------------------------
GPT_response_with_smiles_path = f"./{result_directory}/GPT_response_with_smiles.csv"
GPT_reaction_smiles_path = f"./{result_directory}/GPT_reaction_smiles.csv"

# --- Intermediate artefacts ------------------------------------------------
# NOTE(review): `final_smiles_dict_json` already ends in ".json" and the cells that
# use it append ".json" again, so the file on disk has a doubled extension.
temp_smiles_dict_json = "smiles_dict_final_ver1.json"
final_smiles_dict_json = "smiles_dict_final_updated_ver2.json"
log_path = "smiles_fetch.log"

## Step 1: Fetch SMILES from database

In [35]:
def make_smiles_dict(df, batch_size=100, output_dir=output_directory):
    """Fetch SMILES for every entry of ``df`` in batches, checkpointing as it goes.

    Args:
        df: Sliceable, sized collection of GPT responses (the notebook passes a
            single DataFrame column). Each ``batch_size`` slice is handed to
            ``process_batch``.
        batch_size: Number of entries sent to ``process_batch`` per call.
        output_dir: Directory that receives the per-batch JSON checkpoints, the
            optional cache pickle and the final ``temp_smiles_dict_json`` file.

    Returns:
        list: Concatenated ``process_batch`` results, in input order. A batch
        whose fetch raised contributes ``"Error"`` once per entry instead.
    """
    # Create the directory that is actually written to, not the module-level default.
    ensure_directory(output_dir)

    async def _fetch_batch(batch):
        # asyncio.run() starts a fresh event loop for every batch, so the semaphore
        # is created inside that loop rather than shared across loops.
        semaphore = asyncio.Semaphore(60)
        return await process_batch(batch, fix_name_bool=False, semaphore=semaphore)

    smiles_dict = []
    total_batches = (len(df) + batch_size - 1) // batch_size

    # Process batches
    for i in tqdm(range(0, len(df), batch_size), desc="Processing GPT Responses in Batches"):
        batch = df[i:i + batch_size]
        batch_number = i // batch_size + 1

        try:
            temp_smiles_dicts = asyncio.run(_fetch_batch(batch))
        except Exception as e:
            # Keep the output aligned with the input: one placeholder per entry.
            smiles_dict.extend(["Error"] * len(batch))
            logging.error(f"Error in batch {batch_number}: {e}")
            continue

        smiles_dict.extend(temp_smiles_dicts)
        logging.info(f"Completed batch {batch_number}/{total_batches}")

        # Checkpointing is best-effort. It is kept out of the fetch `try` so that a
        # failed write cannot append "Error" placeholders after the real results.
        try:
            # Save intermediate results
            save_smiles_dict(smiles_dict, os.path.join(output_dir, f'smiles_dict_batch_{batch_number}.json'))

            # Save the cache periodically. `smiles_cache` is not defined in this
            # notebook; look it up defensively and skip the dump when it is absent.
            # NOTE(review): presumably the cache lives in smiles_fetch_utils - confirm.
            cache = globals().get("smiles_cache")
            if cache is None:
                cache = getattr(smiles_fetch_utils, "smiles_cache", None)
            if cache is not None:
                with open(os.path.join(output_dir, 'smiles_cache.pkl'), 'wb') as f:
                    pickle.dump(cache, f)
        except Exception as e:
            logging.error(f"Could not save intermediate results for batch {batch_number}: {e}")

    save_smiles_dict(smiles_dict, os.path.join(output_dir, temp_smiles_dict_json))
    return smiles_dict

def run_processing():
    """Thread target: build the SMILES dictionary for the configured response column.

    Reads the module-level ``df`` and, once ``make_smiles_dict`` returns, reports
    completion both to the log file and to stdout.
    """
    responses = df[GPT_response_column]
    make_smiles_dict(responses, output_dir=output_directory)

    completion_message = "Processing completed"
    logging.info(completion_message)
    print(completion_message)

async def main_reprocess():
    """Re-run the SMILES lookup for entries that still lack one, echoing the log meanwhile.

    Starts ``monitor_log`` on a background thread, awaits ``reprocess_no_smi`` on the
    temporary dictionary written by Step 1, and always stops and joins the monitor
    thread afterwards, including when the reprocessing raises.
    """
    semaphore = asyncio.Semaphore(40)  # Limit concurrent tasks
    async with aiohttp.ClientSession() as session:
        stop_event = threading.Event()
        monitor_thread = threading.Thread(target=monitor_log, args=(log_path, stop_event))
        monitor_thread.start()

        try:
            # `final_smiles_dict_json` already ends in ".json"; the doubled extension
            # is kept because the later add_smiles_dict cell reads the same path.
            await reprocess_no_smi(
                f'./{output_directory}/{temp_smiles_dict_json}',
                f'./{output_directory}/{final_smiles_dict_json}.json',
                session,
                semaphore,
                batch_size=100,
            )
        finally:
            # Without this, an exception above would leave the non-daemon monitor
            # thread running with nothing left to signal it.
            stop_event.set()
            monitor_thread.join()

In [36]:
# Send every log record to a freshly truncated log file so it can be tailed below.
logging.basicConfig(filename=log_path,
                    filemode='w',
                    level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Responses to process; `run_processing` reads this module-level frame.
df = pd.read_csv(GPT_response_path)

# Event handed to the log monitor; it is set once monitoring has returned.
stop_event = threading.Event()

# The fetch runs on a worker thread so the main thread is free to echo the log.
processing_thread = threading.Thread(target=run_processing)
processing_thread.start()

# Echo the log in real time on the main thread.
# NOTE(review): nothing here sets `stop_event` while `monitor_log` is running, so the
# monitor has to return by itself; if the worker dies early this may wait forever.
try:
    monitor_log(log_path, stop_event)
finally:
    # Whatever ended the monitoring, flag the event and wait for the worker.
    stop_event.set()
    processing_thread.join()

Processing GPT Responses in Batches:   0%|          | 0/5 [00:00<?, ?it/s]

2024-09-25 16:59:20,436 - INFO - Found SMILES for methanol from OPSIN: CO


Processing GPT Responses in Batches: 100%|██████████| 5/5 [02:12<00:00, 26.48s/it]

Processing completed
All batches processed. Stopping log monitoring.





In [39]:
# Run the async main function.
# A notebook kernel already has a running event loop, and asyncio.run() refuses to
# start inside one. nest_asyncio.apply() patches asyncio to allow the nested call;
# applying it here lets the cell work on a fresh kernel, and repeating it is harmless.
nest_asyncio.apply()
asyncio.run(main_reprocess())

Processing Batches:   0%|          | 0/5 [00:00<?, ?batch/s]

2024-09-25 17:04:53,149 - ERROR - [ChemSpider] No results found for Et2O


Processing Batches:  20%|██        | 1/5 [00:42<02:49, 42.42s/batch]

[BATCH 1] 17.25% of entries don't have a corresponding SMILES representation


Processing Batches:  40%|████      | 2/5 [02:34<04:10, 83.44s/batch]

[BATCH 2] 14.33% of entries don't have a corresponding SMILES representation


Processing Batches:  60%|██████    | 3/5 [04:22<03:08, 94.40s/batch]

[BATCH 3] 9.79% of entries don't have a corresponding SMILES representation


Processing Batches:  80%|████████  | 4/5 [05:18<01:19, 79.29s/batch]

[BATCH 4] 8.9% of entries don't have a corresponding SMILES representation


Processing Batches: 100%|██████████| 5/5 [07:45<00:00, 93.19s/batch] 

[BATCH 5] 4.46% of entries don't have a corresponding SMILES representation





In [40]:
# Presumably merges the reprocessed SMILES dictionary into the response table as a
# new `<response column>_smiles` column and writes the combined CSV - confirm against
# file_io_utils.add_smiles_dict. The keyword spelling `exisitng_file_path` is the
# callee's own parameter name and must stay as is.
add_smiles_dict(
    column_name=f'{GPT_response_column}_smiles',
    exisitng_file_path=GPT_response_path,
    smiles_dict_file_path=f"./{output_directory}/{final_smiles_dict_json}.json",
    output_path=GPT_response_with_smiles_path,
)

OSError: Cannot save file into a non-existent directory: 'result'

## Step 2: Generate reaction SMILES

In [None]:
# Load the responses together with the SMILES column added at the end of Step 1.
result_df = pd.read_csv(GPT_response_with_smiles_path, encoding='utf-8-sig')

# Each entry pairs a response column with the column holding its SMILES lookups.
# The SMILES column name is built from GPT_response_column, the same way the
# add_smiles_dict cell names it; the previous `GPT_finetuned_five` was an undefined
# variable and raised NameError.
configs = [(GPT_response_column, f'{GPT_response_column}_smiles')]
for model, smiles_col in configs:
    responses = result_df[model].tolist()
    smiles = result_df[smiles_col].tolist()
    # The third return value (skeleton errors) is not used by this cell.
    skeleton_smiles, final_smiles, skeleton_error_smiles = process_smiles_data(responses, smiles)

    # Store results in the DataFrame
    result_df[f'{model}_skeleton'] = skeleton_smiles
    result_df[f'{model}_rxn'] = final_smiles
    result_df[f'{model}_smiles'] = smiles

In [28]:
# Write the reaction-SMILES table to CSV, index column included, encoded as
# UTF-8 with a byte-order mark ('utf-8-sig').
result_df.to_csv(
    GPT_reaction_smiles_path,
    index=True,
    encoding='utf-8-sig',
)