In [10]:
import pandas as pd

# --- File Paths ---
# The three CSV inputs that get loaded, grouped together ...
experimental_file, descriptor_file, ion_name_file = (
    '../data/cleaned_experimental_data_combined.csv',
    '../data/orca_descriptors_individual.csv',
    '../data/unique_ions_with_names.csv',
)
# ... and the CSV the merged result is written to.
output_file = '../data/final_model_dataset.csv'

def standardize_name(series):
    """Normalise ion abbreviations into bare keys suitable for merging.

    Deletes every ``+``, ``[``, ``]``, ``(``, ``)`` and ``-`` character from
    each string and trims surrounding whitespace, e.g.
    ``' [(ETO)2IM] '`` -> ``'ETO2IM'``.

    Whitespace is trimmed *after* the punctuation is removed, so padding that
    sits inside the brackets (``'[ BF4 ]'``) is also dropped instead of being
    left at the edges of the key, where it would prevent a match.

    Args:
        series: pandas Series of strings (accessed through ``.str``).

    Returns:
        pandas Series of cleaned strings with the same index as ``series``.
    """
    return series.str.replace(r'[+\[\]()-]', '', regex=True).str.strip()

# Build the modelling dataset: load the three CSVs, attach cation and anion
# descriptors to every experimental row, fill missing values with 0 and save.
try:
    # --- Load all datasets ---
    df_exp = pd.read_csv(experimental_file)
    df_desc = pd.read_csv(descriptor_file)
    df_ion_types = pd.read_csv(ion_name_file)
    print("All source files loaded successfully.")

    # --- Prepare Descriptor Data ---
    # Keep one lookup row per abbreviation so the left merge below cannot
    # duplicate descriptor rows when the name table repeats an ion.
    ion_type_lookup = df_ion_types[['ion_abbreviation', 'ion_type']].drop_duplicates(subset='ion_abbreviation')
    df_desc = pd.merge(df_desc, ion_type_lookup, on='ion_abbreviation', how='left')
    df_desc['merge_key'] = standardize_name(df_desc['ion_abbreviation'])

    # Ions without a usable type end up in neither table; say so instead of
    # losing them silently.
    untyped_ions = df_desc.loc[~df_desc['ion_type'].isin(['cation', 'anion']), 'ion_abbreviation']
    if not untyped_ions.empty:
        print(f"Warning: {len(untyped_ions)} descriptor row(s) have no cation/anion type and are ignored: {untyped_ions.tolist()}")

    df_cat_desc = df_desc[df_desc['ion_type'] == 'cation'].copy()
    df_an_desc = df_desc[df_desc['ion_type'] == 'anion'].copy()

    # Two abbreviations can collapse to the same key once punctuation is
    # stripped. Duplicate keys would turn the merges below into many-to-many
    # joins and multiply experimental rows, so keep the first row per key.
    for ion_label, desc_frame in (('cation', df_cat_desc), ('anion', df_an_desc)):
        n_duplicates = int(desc_frame.duplicated(subset='merge_key').sum())
        if n_duplicates:
            print(f"Warning: dropped {n_duplicates} duplicate {ion_label} descriptor row(s) sharing a merge key.")
    df_cat_desc = df_cat_desc.drop_duplicates(subset='merge_key')
    df_an_desc = df_an_desc.drop_duplicates(subset='merge_key')

    # Prefix every descriptor column with its ion role; the key columns keep
    # their names so they can be used for merging and dropped afterwards.
    key_columns = ('ion_abbreviation', 'merge_key')
    df_cat_desc = df_cat_desc.rename(columns=lambda col: col if col in key_columns else 'cation_' + col)
    df_an_desc = df_an_desc.rename(columns=lambda col: col if col in key_columns else 'anion_' + col)

    # --- Prepare Experimental Data ---
    df_exp['cation_merge_key'] = standardize_name(df_exp['cation'])
    df_exp['anion_merge_key'] = standardize_name(df_exp['anion'])

    # Report ions that have no descriptors: the inner merges below discard
    # every experimental row that uses them.
    unmatched_cations = df_exp.loc[~df_exp['cation_merge_key'].isin(df_cat_desc['merge_key']), 'cation'].dropna().unique().tolist()
    unmatched_anions = df_exp.loc[~df_exp['anion_merge_key'].isin(df_an_desc['merge_key']), 'anion'].dropna().unique().tolist()
    if unmatched_cations:
        print(f"Warning: no descriptors found for {len(unmatched_cations)} cation(s): {unmatched_cations}")
    if unmatched_anions:
        print(f"Warning: no descriptors found for {len(unmatched_anions)} anion(s): {unmatched_anions}")

    # --- Perform Merges ---
    df_merged = pd.merge(df_exp, df_cat_desc, left_on='cation_merge_key', right_on='merge_key', how='inner')
    df_final = pd.merge(df_merged, df_an_desc, left_on='anion_merge_key', right_on='merge_key', how='inner')

    n_rows_lost = len(df_exp) - len(df_final)
    if n_rows_lost > 0:
        print(f"Warning: {n_rows_lost} of {len(df_exp)} experimental row(s) were dropped because an ion had no descriptors.")

    # --- Clean up and Save ---
    # The second merge suffixes the clashing key columns with _x (cation side)
    # and _y (anion side); none of these helper columns belong in the output.
    df_final = df_final.drop(columns=['cation_merge_key', 'anion_merge_key', 'ion_abbreviation_x', 'merge_key_x', 'cation_ion_type', 'ion_abbreviation_y', 'merge_key_y', 'anion_ion_type'], errors='ignore')

    # --- Replace all missing values (like the dipole moments for symmetrical ions) with 0 ---
    # Show which columns are affected first, so an unexpected fill (for
    # example in a measured column) is visible rather than silent.
    missing_per_column = df_final.isna().sum()
    missing_per_column = missing_per_column[missing_per_column > 0]
    if not missing_per_column.empty:
        print("\nMissing values replaced with 0, per column:")
        print(missing_per_column.to_string())
    df_final = df_final.fillna(0)

    print(f"\nSuccessfully merged the datasets. Final dataset has {len(df_final)} data points.")

    df_final.to_csv(output_file, index=False)
    print(f"Final dataset for modeling saved to '{output_file}'")

    print("\n--- Preview of the Final, Merged Dataset ---")
    print(df_final.head())

except Exception as e:
    # Print the short message, then re-raise: swallowing the error would let
    # later cells keep running against stale or undefined frames.
    print(f"An error occurred: {e}")
    raise

All source files loaded successfully.

Successfully merged the datasets. Final dataset has 11256 data points.
Final dataset for modeling saved to '../data/final_model_dataset.csv'

--- Preview of the Final, Merged Dataset ---
       cation   anion  Temperature_K  Pressure_kPa  CO2_solubility  \
0  [(ETO)2IM]  [Tf2N]         323.85       22330.0           0.778   
1  [(ETO)2IM]  [Tf2N]         323.95       10340.0           0.712   
2  [(ETO)2IM]  [Tf2N]         343.55        7510.0           0.554   
3  [(ETO)2IM]  [Tf2N]         324.15        7110.0           0.626   
4  [(ETO)2IM]  [Tf2N]         363.15        5250.0           0.396   

   cation_Energy_Hartree  cation_HOMO_Eh  cation_LUMO_Eh  \
0            -534.028453       -0.416496       -0.216006   
1            -534.028453       -0.416496       -0.216006   
2            -534.028453       -0.416496       -0.216006   
3            -534.028453       -0.416496       -0.216006   
4            -534.028453       -0.416496       -0.216