In [2]:
import pandas as pd
import re
import os

def _last_float(pattern, text):
    """Return the float captured by the LAST match of *pattern* in *text*.

    The pattern must contain exactly one capture group. Returns None when the
    pattern does not occur. The last occurrence is used because multi-step
    jobs repeat the same summary line once per step and only the final one
    describes the finished calculation.
    """
    matches = re.findall(pattern, text)
    return float(matches[-1]) if matches else None


def _frontier_orbital_energies(text):
    """Return ``(homo, lumo)`` from the last ORBITAL ENERGIES table in *text*.

    A table row is a line of four whitespace-separated tokens whose first
    token is an integer; token 1 is read as the occupation and token 2 as the
    energy. HOMO is the highest energy with occupation > 0, LUMO the lowest
    energy with occupation == 0. Either element is None if it was not found.
    """
    _, header, tail = text.rpartition("ORBITAL ENERGIES")
    if not header:
        return None, None

    homo = lumo = None
    seen_rows = False
    for line in tail.splitlines():
        parts = line.split()
        if len(parts) == 4 and parts[0].isdigit():
            try:
                occupation = float(parts[1])
                energy = float(parts[2])
            except ValueError:
                # A four-token line that is not numeric (e.g. "0 C : -0.12")
                # belongs to a different table: the orbital listing is over.
                if seen_rows:
                    break
                continue
            seen_rows = True
            if occupation > 0.0:
                if homo is None or energy > homo:
                    homo = energy
            elif occupation == 0.0:
                if lumo is None or energy < lumo:
                    lumo = energy
            continue

        # Before the first row everything (rulers, column titles) is preamble;
        # blank lines may also separate sub-tables.
        if not seen_rows or not parts:
            continue
        # Column-title and "SPIN UP/DOWN ORBITALS" lines may sit between two
        # sub-tables of the same listing; anything else ends the table.
        if parts[0] == "NO" or "ORBITALS" in line:
            continue
        break

    return homo, lumo


def parse_orca_output(filepath):
    """
    Parse a single ORCA output file and extract a set of key descriptors.

    Args:
        filepath: Path to the ORCA ``.out`` file.

    Returns:
        None if the file cannot be read or does not contain the
        "****ORCA TERMINATED NORMALLY****" marker. Otherwise a dict holding
        whichever of the following keys could be found (possibly empty):
        ``Energy_Hartree``, ``Dipole_Moment_Debye``, ``Polarizability_au``,
        ``Volume_A3``, ``Total_Enthalpy_Hartree``,
        ``Gibbs_Free_Energy_Hartree``, ``HOMO_Eh``, ``LUMO_Eh``.

    Notes:
        * When a quantity is printed several times, the last value wins.
        * The dipole is read from the "Magnitude (Debye)" line; the
          "Total Dipole Moment" line carries the vector components, not the
          magnitude, so it is not used.
        * HOMO/LUMO parsing is confined to the orbital-energy table so that
          later tables cannot abort or pollute it.
    """
    try:
        with open(filepath, 'r') as f:
            content = f.read()
    except (OSError, ValueError):
        # Missing/unreadable file, undecodable bytes, or an invalid path.
        return None

    if "****ORCA TERMINATED NORMALLY****" not in content:
        return None

    # Summary lines use either ':' or '...' between label and value depending
    # on the section that prints them; accept both forms.
    sep = r"\s*(?::|\.\.\.)\s*"
    scalar_patterns = (
        ('Energy_Hartree', r"FINAL SINGLE POINT ENERGY\s+(-?\d+\.\d+)"),
        ('Dipole_Moment_Debye', r"Magnitude \(Debye\)" + sep + r"(\d+\.\d+)"),
        ('Polarizability_au', r"Isotropic polarizability" + sep + r"(\d+\.\d+)"),
        ('Volume_A3', r"Volume per molecule" + sep + r"(\d+\.\d+)"),
        ('Total_Enthalpy_Hartree', r"Total Enthalpy" + sep + r"(-?\d+\.\d+)"),
        ('Gibbs_Free_Energy_Hartree', r"Final Gibbs free energy" + sep + r"(-?\d+\.\d+)"),
    )

    data = {}
    for key, pattern in scalar_patterns:
        value = _last_float(pattern, content)
        if value is not None:
            data[key] = value

    homo_energy, lumo_energy = _frontier_orbital_energies(content)
    if homo_energy is not None:
        data['HOMO_Eh'] = homo_energy
    if lumo_energy is not None:
        data['LUMO_Eh'] = lumo_energy

    return data

# --- Batch driver: parse one ORCA .out file per ion and write a summary CSV ---
# Reads the ion list, derives each ion's output-file name from its
# abbreviation, parses that file, and collects every non-empty result.
output_dir = '../calculations_individual/'
ions_list_file = '../data/ions_with_smiles.csv'
final_csv_path = '../data/orca_descriptors_individual.csv'

# Characters in an abbreviation that are rewritten to form the file stem.
_filename_table = str.maketrans({'+': '_p', '-': '_m', '(': '_', ')': '_'})

all_results = []
df_ions = pd.read_csv(ions_list_file)

for ion_abbr in df_ions['ion_abbreviation']:
    # Drop surrounding square brackets, then substitute the special characters.
    safe_name = ion_abbr.strip('[]').translate(_filename_table)
    filepath = os.path.join(output_dir, f"{safe_name}.out")

    print(f"Processing {filepath}...")
    descriptors = parse_orca_output(filepath)

    # None (unreadable / not terminated normally) and {} (nothing matched)
    # are both treated as "no data" and skipped.
    if not descriptors:
        print(f"--> SKIPPING: Could not extract data for {ion_abbr} (likely failed calculation).")
        continue

    descriptors['ion_abbreviation'] = ion_abbr
    all_results.append(descriptors)

# One row per successfully parsed ion; columns are the union of extracted keys.
df_results = pd.DataFrame(all_results)
df_results.to_csv(final_csv_path, index=False)

for message in (
    "\n--- Process complete ---",
    f"Descriptor data for all successful calculations saved to '{final_csv_path}'",
    "\nPreview of the final, complete descriptor set:",
):
    print(message)
print(df_results.head())

Processing ../calculations_individual/_ETO_2IM.out...
Processing ../calculations_individual/BBIM.out...
Processing ../calculations_individual/BMIM.out...
Processing ../calculations_individual/BMMIM.out...
Processing ../calculations_individual/BMPYR.out...
Processing ../calculations_individual/C12MIM.out...
Processing ../calculations_individual/C3MPYR.out...
Processing ../calculations_individual/C3OMIM.out...
Processing ../calculations_individual/C4MPY.out...
Processing ../calculations_individual/C4PY.out...
Processing ../calculations_individual/C5MIM.out...
Processing ../calculations_individual/C5MPYR.out...
Processing ../calculations_individual/C5O2MIM.out...
Processing ../calculations_individual/C6F9MIM.out...
Processing ../calculations_individual/C7MIM.out...
Processing ../calculations_individual/C7MPYR.out...
Processing ../calculations_individual/C8F13MIM.out...
Processing ../calculations_individual/C9MPYR.out...
Processing ../calculations_individual/COCMPYR.out...
Processing ../ca