In [1]:
import time
from urllib.parse import quote

import pandas as pd
import requests

# Input file with full ion names
input_file = '../data/unique_ions_with_names.csv'
# Output file for the results
output_file = '../data/ions_with_smiles.csv'

# PUG REST endpoint: compound name -> CanonicalSMILES as plain text
PUBCHEM_URL_TEMPLATE = (
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/"
    "{name}/property/CanonicalSMILES/TXT"
)
REQUEST_TIMEOUT = 10   # seconds allowed per HTTP request
MAX_ATTEMPTS = 3       # total tries per ion before giving up
REQUEST_DELAY = 0.3    # base pause (seconds) between requests
# Statuses treated as temporary server trouble and therefore retried
TRANSIENT_STATUSES = frozenset({429, 500, 502, 503, 504})


def build_pubchem_url(ion_name):
    """Return the PubChem lookup URL for *ion_name*.

    The name is percent-encoded with no "safe" characters so that names
    containing '/', '?', '#', '%', parentheses or spaces stay a single
    path segment instead of altering the URL structure.
    """
    encoded_name = quote(str(ion_name).strip(), safe='')
    return PUBCHEM_URL_TEMPLATE.format(name=encoded_name)


def first_smiles(body):
    """Return the first non-blank line of *body*, or '' if there is none.

    A TXT response can hold one SMILES per line when a name resolves to
    several compounds; keeping only the first avoids writing a multi-line
    value into a single CSV cell.
    """
    for line in body.splitlines():
        candidate = line.strip()
        if candidate:
            return candidate
    return ''


def fetch_smiles(ion_name):
    """Look up *ion_name* in PubChem and return the value for the 'smiles' column.

    Returns the SMILES string on success, 'NOT_FOUND' when PubChem answers
    with a definitive non-success status (or an empty body), and
    'REQUEST_ERROR' when every attempt failed with a network error or a
    transient server status. Progress is reported with print().
    """
    url = build_pubchem_url(ion_name)
    last_problem = None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            # Timeouts / connection drops are usually temporary: retry.
            last_problem = e
        else:
            status = response.status_code
            if status == 200:
                smiles = first_smiles(response.text)
                if smiles:
                    print(f"SUCCESS: Found SMILES for '{ion_name}'")
                    return smiles
                # 200 with an empty body carries no usable structure.
                print(f"WARNING: Could not find '{ion_name}' in PubChem (Status: {status})")
                return 'NOT_FOUND'
            if status not in TRANSIENT_STATUSES:
                # e.g. 404: the name is unknown, retrying will not help.
                print(f"WARNING: Could not find '{ion_name}' in PubChem (Status: {status})")
                return 'NOT_FOUND'
            last_problem = f"HTTP status {status}"

        if attempt < MAX_ATTEMPTS:
            # Back off a little longer after each failed attempt.
            time.sleep(REQUEST_DELAY * 2 ** attempt)

    print(f"ERROR: A network error occurred for '{ion_name}': {last_problem}")
    return 'REQUEST_ERROR'


try:
    df_ions = pd.read_csv(input_file)
    # Drop any rows where the ion_name might be missing, just in case
    df_ions = df_ions.dropna(subset=['ion_name'])
    print(f"Loaded {len(df_ions)} unique ion names from '{input_file}'")

    # Create a new column to store the SMILES strings
    df_ions['smiles'] = ''

    # --- Loop through each ion and fetch its SMILES from PubChem ---
    for index, ion_name in df_ions['ion_name'].items():
        df_ions.at[index, 'smiles'] = fetch_smiles(ion_name)

        # Be polite to the server and wait a moment between requests
        time.sleep(REQUEST_DELAY)

    # Save the final DataFrame with the new smiles column (once, after the loop)
    df_ions.to_csv(output_file, index=False)

    print(f"\nProcess complete. Results saved to '{output_file}'")
    print("\n--- Preview of the Results ---")
    print(df_ions.head())

except FileNotFoundError:
    print(f"ERROR: Could not find the input file at '{input_file}'")
except Exception as e:
    # Top-level boundary of the notebook cell: report instead of raising.
    print(f"An error occurred: {e}")

Loaded 84 unique ion names from '../data/unique_ions_with_names.csv'
ERROR: A network error occurred for '1,3-diethoxyimidazolium': HTTPSConnectionPool(host='pubchem.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
SUCCESS: Found SMILES for '1,3-dibutylimidazolium'
SUCCESS: Found SMILES for '1-butyl-3-methyl-imidazolium'
SUCCESS: Found SMILES for '1-butyl-2,3-dimethylimidazolium'
SUCCESS: Found SMILES for '1-dodecyl-3-methylimidazolium'
SUCCESS: Found SMILES for '1-butyl-3-methylpyridinium'
SUCCESS: Found SMILES for '1-butylpyridinium'
ERROR: A network error occurred for '1-ethyl-3-methyl-imidazolium': HTTPSConnectionPool(host='pubchem.ncbi.nlm.nih.gov', port=443): Read timed out. (read timeout=10)
SUCCESS: Found SMILES for '1-hexyl-2,3-dimethylimidazolium'
SUCCESS: Found SMILES for 'choline'
SUCCESS: Found SMILES for 'methyltrioctylammonium'
SUCCESS: Found SMILES for 'tetrabutylphosphonium'
SUCCESS: Found SMILES for 'trihexyl(tetradecyl)phosphonium'
SUCCESS: Found SMILE