In [5]:
import pandas as pd

# --- File Paths ---
# Relative paths; they are resolved against the current working directory.
# Experimental measurements; loaded into `df_exp`, which must provide `cation` and `anion` columns.
experimental_file = '../data/cleaned_experimental_data_combined.csv'
# Per-ion ORCA descriptors; loaded into `df_desc` and joined on `ion_abbreviation`.
descriptor_file = '../data/orca_descriptors_individual.csv'
atom_count_file = '../data/ions_with_atomic_counts.csv' # Per-ion atom counts; must also provide `ion_abbreviation`, `ion_name`, `ion_type` and `smiles`
# Destination CSV for the merged modeling dataset (written without the index).
output_file = '../data/final_model_dataset_v2.csv'

# This function cleans the ion abbreviations so they match
def standardize_name(series):
    """Normalize ion abbreviations so differently formatted spellings compare equal.

    Removes every ``+``, ``-``, ``[``, ``]``, ``(`` and ``)`` character and then
    trims surrounding whitespace.

    Parameters
    ----------
    series : pandas.Series
        Series of ion abbreviation strings, e.g. ``"[Tf2N]"``.

    Returns
    -------
    pandas.Series
        Series of the same length and index holding the cleaned keys.
        Missing values stay missing.
    """
    without_markup = series.str.replace(r'[+\[\]()-]', '', regex=True)
    # Trim *after* removing brackets: whitespace that sat just inside a bracket
    # (e.g. "[ Tf2N ]") only becomes leading/trailing once the bracket is gone.
    return without_markup.str.strip()

def _features_for_ion_type(features, ion_type, prefix):
    """Select the rows of one ion type and prefix every column name.

    Prefixing *all* columns (merge key and abbreviation included) keeps the
    cation and anion tables free of shared column names, so merging both into
    the experimental table never produces `_x` / `_y` suffixed duplicates.
    """
    return features[features['ion_type'] == ion_type].add_prefix(prefix)


def _warn_on_duplicate_keys(table, key_col, label):
    """Print a warning when a feature table contains repeated merge keys."""
    n_dup = int(table[key_col].duplicated().sum())
    if n_dup:
        print(f"Warning: {n_dup} duplicate {label} merge keys; "
              "matching experimental rows will be repeated in the output.")


try:
    # --- 1. Load all data sources ---
    df_exp = pd.read_csv(experimental_file)
    df_desc = pd.read_csv(descriptor_file)
    df_atoms = pd.read_csv(atom_count_file)
    print("All source files loaded successfully.")

    # --- 2. Combine ORCA descriptors with atom counts, ion names and ion types ---
    # One merge brings in everything except `smiles`. Merging the atom table
    # twice on the same key would multiply rows for any abbreviation that
    # appears more than once in that table.
    df_all_features = pd.merge(
        df_desc,
        df_atoms.drop(columns=['smiles']),
        on='ion_abbreviation',
        how='left',
    )
    print("Successfully combined ORCA descriptors with atomic counts.")

    # --- 3. Prepare data for merging ---
    df_all_features['merge_key'] = standardize_name(df_all_features['ion_abbreviation'])
    df_exp['cation_merge_key'] = standardize_name(df_exp['cation'])
    df_exp['anion_merge_key'] = standardize_name(df_exp['anion'])

    # Ions without a row in the atom-count file have no `ion_type` and are
    # therefore excluded from both feature tables.
    df_cat_features = _features_for_ion_type(df_all_features, 'cation', 'cation_')
    df_an_features = _features_for_ion_type(df_all_features, 'anion', 'anion_')

    # --- 4. Perform Merges ---
    _warn_on_duplicate_keys(df_cat_features, 'cation_merge_key', 'cation')
    _warn_on_duplicate_keys(df_an_features, 'anion_merge_key', 'anion')

    # The inner joins below drop experimental rows whose ions have no features;
    # report them instead of letting the dataset shrink silently.
    cation_known = df_exp['cation_merge_key'].isin(df_cat_features['cation_merge_key'])
    anion_known = df_exp['anion_merge_key'].isin(df_an_features['anion_merge_key'])
    n_dropped = int((~(cation_known & anion_known)).sum())
    if n_dropped:
        print(f"Warning: {n_dropped} of {len(df_exp)} experimental rows have no "
              "matching ion features and are dropped.")
        print("  Unmatched cations:",
              sorted(df_exp.loc[~cation_known, 'cation'].astype(str).unique()))
        print("  Unmatched anions:",
              sorted(df_exp.loc[~anion_known, 'anion'].astype(str).unique()))

    df_merged = pd.merge(df_exp, df_cat_features, on='cation_merge_key', how='inner')
    df_final = pd.merge(df_merged, df_an_features, on='anion_merge_key', how='inner')

    # A missing atom count means "element not present", so zero is correct
    # there. Descriptors, conditions, names and the target are left as NaN:
    # zero would be a fabricated measurement, not a neutral default.
    atom_count_cols = sorted(col for col in df_final.columns if 'num_' in col)
    if atom_count_cols:
        df_final[atom_count_cols] = df_final[atom_count_cols].fillna(0)

    # --- 5. Reorder columns to the desired structure ---
    id_cols = ['cation', 'cation_ion_name', 'anion', 'anion_ion_name']
    condition_cols = ['Temperature_K', 'Pressure_kPa']
    descriptor_cols = sorted(
        col for col in df_final.columns
        if col.endswith(('_Eh', '_Debye', '_au', '_A3', '_Hartree'))
    )
    target_col = ['CO2_solubility']

    final_order = id_cols + atom_count_cols + condition_cols + descriptor_cols + target_col
    # Keep only the columns that exist; helper columns (merge keys,
    # abbreviations, ion types) are dropped here.
    final_order_existing = [col for col in final_order if col in df_final.columns]
    df_final = df_final[final_order_existing]

    print("\nColumns have been reordered successfully.")

    remaining_nan = df_final.isna().sum()
    remaining_nan = remaining_nan[remaining_nan > 0]
    if not remaining_nan.empty:
        print("Warning: missing values left as NaN (not zero-filled) in:")
        print(remaining_nan.to_string())

    # --- 6. Save the final file ---
    df_final.to_csv(output_file, index=False)
    print(f"Final dataset for modeling saved to '{output_file}'")

    print("\n--- Preview of Final Dataset with Atom Counts ---")
    pd.set_option('display.max_columns', None)
    print(df_final.head())

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the traceback is visible and "Run All" stops here instead of
    # continuing with a missing or stale `df_final`.
    raise

All source files loaded successfully.
Successfully combined ORCA descriptors with atomic counts.

Columns have been reordered successfully.
Final dataset for modeling saved to '../data/final_model_dataset_v2.csv'

--- Preview of Final Dataset with Atom Counts ---
       cation          cation_ion_name   anion  \
0  [(ETO)2IM]  1,3-diethoxyimidazolium  [Tf2N]   
1  [(ETO)2IM]  1,3-diethoxyimidazolium  [Tf2N]   
2  [(ETO)2IM]  1,3-diethoxyimidazolium  [Tf2N]   
3  [(ETO)2IM]  1,3-diethoxyimidazolium  [Tf2N]   
4  [(ETO)2IM]  1,3-diethoxyimidazolium  [Tf2N]   

                      anion_ion_name  anion_num_Ac  anion_num_Ag  \
0  bis(trifluoromethylsulfonyl)amide             0             0   
1  bis(trifluoromethylsulfonyl)amide             0             0   
2  bis(trifluoromethylsulfonyl)amide             0             0   
3  bis(trifluoromethylsulfonyl)amide             0             0   
4  bis(trifluoromethylsulfonyl)amide             0             0   

   anion_num_Al  anion_num