In [1]:
import os

In [None]:
def rename_files(base_dir):
    """
    Standardize ASE/VASP trajectory filenames for downstream RMG database processing.

    This function iterates over compound subdirectories inside `base_dir` and renames
    ASE trajectory filenames to a canonical format that encodes the compound name
    in the filename:

        ads60_<compound>.traj
        relax_restart_<compound>.traj
        structure_<compound>.traj

    Recognized legacy names are `vasprun_ads60.traj` / `ads60.traj`,
    `vasprun_relax_restart.traj` / `relax_restart.traj` and `structure.traj`.

    An existing destination file is never overwritten: if the canonical name is
    already present (for example because both legacy variants exist in the same
    directory, or because the function was run before), the legacy file is left
    in place and a "Skipped" message is printed. Entries of `base_dir` that are
    not directories are ignored silently.

    Parameters
    ----------
    base_dir : str
        Path containing one subdirectory per compound, each holding ASE trajectory files.
    """
    # (legacy filenames in priority order, canonical filename prefix)
    rename_rules = (
        (('vasprun_ads60.traj', 'ads60.traj'), 'ads60'),
        (('vasprun_relax_restart.traj', 'relax_restart.traj'), 'relax_restart'),
        (('structure.traj',), 'structure'),
    )

    # sorted() makes the processing order (and the printed log) reproducible
    for compound_name in sorted(os.listdir(base_dir)):
        compound_dir = os.path.join(base_dir, compound_name)
        if not os.path.isdir(compound_dir):
            # Plain files living next to the compound folders are not compounds
            continue
        print(f"Processing directory: {compound_dir}")

        for old_names, prefix in rename_rules:
            new_path = os.path.join(compound_dir, f"{prefix}_{compound_name}.traj")
            for old_name in old_names:
                old_path = os.path.join(compound_dir, old_name)
                if not os.path.exists(old_path):
                    continue
                if os.path.exists(new_path):
                    # os.rename would silently replace the destination on POSIX
                    print(f"Skipped {old_path}: {new_path} already exists")
                    continue
                os.rename(old_path, new_path)
                print(f"Renamed {old_path} to {new_path}")

In [3]:
# Absolute path of DFT_Data/Cu111, located two levels above the current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'DFT_Data', 'Cu111'))

In [4]:
# Renames trajectory files on disk inside each compound folder of base_dir.
rename_files(base_dir)

Processing directory: /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/CH2OHCHOX
Renamed /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/CH2OHCHOX/structure.traj to /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/CH2OHCHOX/structure_CH2OHCHOX.traj
Processing directory: /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/CHX
Renamed /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/CHX/structure.traj to /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/CHX/structure_CHX.traj
Processing directory: /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/COX
Renamed /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/COX/structure.traj to /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/COX/structure_COX.traj
Processing directory: /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/HCOOH
Renamed /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/HCOOH/structure.traj to /home/ssun30/Work/Electrocat/CO2_RR_DFT/DFT_Data/Cu111/HCOOH/structure_HCO

In [5]:
import pandas as pd

In [None]:
def _flatten_frequency_row(values, imaginary_placeholder):
    """Return the real frequencies of one sheet row followed by one placeholder per imaginary mode."""
    real_freqs = []
    n_imaginary = 0
    separator_seen = False

    for value in values:
        if pd.isna(value):
            # The first blank cell separates real from imaginary modes;
            # later blanks are just padding.
            separator_seen = True
            continue
        if separator_seen:
            n_imaginary += 1
        else:
            real_freqs.append(value)

    return real_freqs + [imaginary_placeholder] * n_imaginary


def process_vibrational_frequencies(file_path, imaginary_placeholder=12):
    """
    Parse vibrational frequencies from an Excel sheet and convert them into
    a table for later processing (one molecule per row, flattened vibrational modes).

    The input Excel file should contain a sheet named 'Vibrational_freq' with
    the following structure:

        - Column 0: molecule name
        - Column 1: adsorption site (ignored here)
        - Columns 2+: vibrational frequencies in cm^-1
        - Real frequencies appear first
        - Imaginary frequencies appear after a blank-column separator

    The first sheet row is skipped and the second one is used as the header.
    Parsing stops at the row whose first three cells read
    'molecule', 'site', 'vib freq (meV)', i.e. the start of the meV section.

    Consecutive rows that share a molecule name are concatenated into a single
    entry. If a name reappears after a different molecule, the later group
    replaces the earlier one.

    Parameters
    ----------
    file_path : str
        Path to the Excel file containing vibrational frequency data.
    imaginary_placeholder : number, optional
        Value (in cm^-1) substituted for every imaginary frequency. Defaults to 12.

    Returns
    -------
    processed_df : pandas.DataFrame
        Table with one row per molecule and columns:
            molecule_name, vib_freq_1, vib_freq_2, ...
        Rows with fewer modes than the longest one are padded with missing values.
        If the sheet holds no data rows, the frame is empty and only has the
        `molecule_name` column.
    """
    # Handing the path to read_excel lets pandas open and close the workbook
    # itself, so no ExcelFile handle is left open.
    df = pd.read_excel(file_path, sheet_name='Vibrational_freq', skiprows=1)
    df = df.dropna(how='all')

    processed_data = {}
    current_molecule = None
    current_freqs = []

    for _, row in df.iterrows():
        # .iloc keeps the access positional; the row index holds column labels.
        molecule_name = row.iloc[0]

        # The meV section repeats the header row; everything after it is ignored.
        if (molecule_name == 'molecule'
                and row.iloc[1] == 'site'
                and row.iloc[2] == 'vib freq (meV)'):
            break

        if molecule_name != current_molecule:
            # A new molecule starts: store what was collected for the previous one.
            if current_molecule is not None:
                processed_data[current_molecule] = current_freqs
            current_molecule = molecule_name
            current_freqs = []

        current_freqs.extend(
            _flatten_frequency_row(row.iloc[2:].values, imaginary_placeholder)
        )

    # Store the last molecule's data
    if current_molecule is not None:
        processed_data[current_molecule] = current_freqs

    # default=0 keeps an empty sheet from raising inside max()
    max_len = max((len(freqs) for freqs in processed_data.values()), default=0)
    columns = ['molecule_name'] + [f'vib_freq_{i+1}' for i in range(max_len)]

    # Build all rows first and create the frame once; DataFrame.append no longer
    # exists in pandas 2.x and was quadratic anyway.
    rows = [
        [molecule_name] + freqs + [None] * (max_len - len(freqs))
        for molecule_name, freqs in processed_data.items()
    ]
    return pd.DataFrame(rows, columns=columns)

In [7]:
# Absolute path of the workbook passed below to process_vibrational_frequencies and read_zpe_data.
fpath = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'DFT_Data', 'Cu111', 'vibrational_freq_zpe_Cu111.xlsx'))

In [8]:
# Parse the 'Vibrational_freq' sheet into one row of flattened frequencies per molecule.
processed_df = process_vibrational_frequencies(fpath)

In [9]:
# Display the parsed frequency table.
processed_df

Unnamed: 0,molecule_name,vib_freq_1,vib_freq_2,vib_freq_3,vib_freq_4,vib_freq_5,vib_freq_6,vib_freq_7,vib_freq_8,vib_freq_9,...,vib_freq_18,vib_freq_19,vib_freq_20,vib_freq_21,vib_freq_22,vib_freq_23,vib_freq_24,vib_freq_25,vib_freq_26,vib_freq_27
0,CX,500.308053,478.032005,478.020487,,,,,,,...,,,,,,,,,,
1,CH2X,2973.544669,2928.440433,1289.601838,632.191103,462.225953,410.945788,296.738582,257.143446,103.999808,...,,,,,,,,,,
2,COX,1819.058007,283.196878,232.072104,230.642449,114.014039,102.662483,,,,...,,,,,,,,,,
3,COHX,3770.241081,1547.476389,394.151303,158.24584,149.850787,12.0,12.0,12.0,12.0,...,,,,,,,,,,
4,CH2CHOX,3158.019347,3028.39928,2957.55139,1557.936944,1350.100771,1281.297796,1130.087328,925.645061,907.98689,...,12.0,,,,,,,,,
5,CH2O,2857.756757,2806.467801,1761.822431,1477.605378,1213.583381,1144.956512,45.505366,26.515595,12.0,...,,,,,,,,,,
6,CH2OHX,3648.999668,2911.10358,2860.863806,1396.078299,1228.036072,1151.450572,1095.297184,928.282798,390.752548,...,,,,,,,,,,
7,CH2OHCHOX,3373.813596,2977.267671,2861.599822,2846.236831,1696.000469,1375.931844,1345.399835,1325.626374,1292.361041,...,211.453217,84.013312,59.305397,21.469181,12.0,12.0,12.0,,,
8,CH3CH2OX,3038.705282,3007.874458,2980.771478,2950.684164,2930.634224,1450.058192,1422.597582,1417.285284,1345.79188,...,303.803829,278.331044,231.276439,169.527998,94.148982,65.439613,16.911236,,,
9,CH3CH2OHX,3756.669971,3057.711924,3056.842696,2979.8187,2940.084174,2912.497025,1473.037866,1450.044194,1433.09936,...,795.906397,407.728798,287.832694,222.852958,73.926627,49.995576,43.634214,24.896307,12.0,12.0


In [None]:
def save_frequencies_to_txt(df, base_dir):
    """
    Dump each molecule's vibrational frequencies into a plain-text log.

    One file named

        zpe_log_<molecule>.txt

    is written (overwriting any previous version) into the molecule's own
    folder below `base_dir`. After a three-line header, the file lists the
    non-missing frequencies in ascending order, one per line, as a zero-based
    running number, the value in meV and the value in cm^-1, both with two
    decimals. Molecules whose folder does not exist are reported on stdout
    and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose `molecule_name` column is followed by the frequency
        columns (vib_freq_1, vib_freq_2, ...), as returned by
        `process_vibrational_frequencies()`. Every column after the first
        is treated as a frequency in cm^-1.

    base_dir : str
        Directory holding one subfolder per molecule; the logs are written
        into those subfolders.
    """
    rule = '---------------------\n'

    for _, record in df.iterrows():
        molecule_name = record['molecule_name']
        target_dir = os.path.join(base_dir, molecule_name)

        # Nothing to write if the molecule has no folder of its own
        if not os.path.exists(target_dir):
            print(f"Error: Folder for molecule '{molecule_name}' not found.")
            continue

        log_path = os.path.join(target_dir, f'zpe_log_{molecule_name}.txt')

        with open(log_path, 'w') as handle:
            # Header block
            handle.write(rule)
            handle.write('  #    meV     cm^-1\n')
            handle.write(rule)

            # Everything after the name column is a frequency; drop the padding NaNs
            wavenumbers = sorted(value for value in record[1:] if not pd.isna(value))

            for position, wavenumber in enumerate(wavenumbers):
                # 1 meV corresponds to 8.0655429 cm^-1
                energy_mev = wavenumber / 8.0655429
                handle.write(f'{position:3}    {energy_mev:6.2f}    {wavenumber:6.2f}\n')

In [12]:
# Writes (and overwrites) zpe_log_<molecule>.txt in every existing molecule folder under base_dir.
save_frequencies_to_txt(processed_df, base_dir)

In [None]:
def read_zpe_data(file_path):
    """
    Read zero-point energy (ZPE) data from an Excel file and return it as a two-column table.

    The input Excel file should contain a sheet named 'ZPE' whose first row is
    the header and which has three columns:

        - molecule / adsorbate name
        - site : adsorption site (ignored here)
        - zero-point energy (documented by the author as eV)

    The `site` column is dropped, fully empty rows are discarded, and the two
    remaining columns are renamed by position to `molecule_name` and `zpe_eV`.

    Parameters
    ----------
    file_path : str
        Path to the Excel file containing ZPE data.

    Returns
    -------
    df_zpe : pandas.DataFrame
        DataFrame with columns:
            - molecule_name
            - zpe_eV

    Raises
    ------
    KeyError
        If the sheet has no column named 'site'.
    ValueError
        If dropping 'site' does not leave exactly two columns.
    """
    # Handing the path to read_excel lets pandas open and close the workbook
    # itself, so no ExcelFile handle is left open.
    df_zpe = pd.read_excel(file_path, sheet_name='ZPE', header=0)
    df_zpe = df_zpe.drop(columns=['site'])

    # Blank spreadsheet rows would otherwise surface as NaN molecule names.
    df_zpe = df_zpe.dropna(how='all').reset_index(drop=True)

    if df_zpe.shape[1] != 2:
        raise ValueError(
            "Expected exactly two columns besides 'site' in sheet 'ZPE', "
            f"found {df_zpe.shape[1]}: {list(df_zpe.columns)}"
        )

    # Columns are renamed by position, so the original header text is irrelevant
    df_zpe.columns = ['molecule_name', 'zpe_eV']

    return df_zpe

In [14]:
# Load the 'ZPE' sheet of the same workbook as a (molecule_name, zpe_eV) table.
zpe_df = read_zpe_data(fpath)

In [None]:
def append_zpe_to_txt(df, base_dir):
    """
    Append total zero-point energy (ZPE) values to existing vibrational frequency logs.

    For each molecule in the input DataFrame, this function locates the corresponding
    file

        zpe_log_<molecule>.txt

    inside the molecule directory under `base_dir` and appends a separator line
    followed by a final line containing the total zero-point energy in eV.

    The operation is safe to repeat: if the log already contains a
    "Zero-point energy:" line (and the separator directly above it) from an
    earlier run, that footer is removed before the new one is written, so the
    file always ends with exactly one ZPE footer holding the latest value.

    Molecules whose folder or log file is missing are reported on stdout and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with columns:
            - molecule_name
            - zpe_eV
        as produced by `read_zpe_data()`.

    base_dir : str
        Path containing one subdirectory per molecule with existing
        zpe_log_<molecule>.txt files.
    """
    separator = '---------------------\n'
    zpe_prefix = 'Zero-point energy:'

    for _, row in df.iterrows():
        molecule_name = row['molecule_name']
        zpe = row['zpe_eV']

        # Find the folder corresponding to the molecule
        molecule_folder = os.path.join(base_dir, molecule_name)

        if not os.path.exists(molecule_folder):
            print(f"Error: Folder for molecule '{molecule_name}' not found.")
            continue

        txt_file_path = os.path.join(molecule_folder, f'zpe_log_{molecule_name}.txt')

        # The log must already exist; this function only adds the footer
        if not os.path.isfile(txt_file_path):
            print(f"Error: File '{txt_file_path}' not found.")
            continue

        footer = [separator, f"Zero-point energy: {zpe} eV\n"]

        with open(txt_file_path, 'r') as file:
            existing_lines = file.readlines()

        # Drop any footer left by a previous run (the ZPE line plus the
        # separator immediately above it) so re-running does not duplicate it.
        kept_lines = []
        for line in existing_lines:
            if line.startswith(zpe_prefix):
                if kept_lines and kept_lines[-1] == separator:
                    kept_lines.pop()
                continue
            kept_lines.append(line)

        if len(kept_lines) == len(existing_lines):
            # No earlier footer: plain append, the existing content is untouched
            with open(txt_file_path, 'a') as file:
                file.writelines(footer)
        else:
            with open(txt_file_path, 'w') as file:
                file.writelines(kept_lines + footer)

In [16]:
# Adds the ZPE footer to each existing zpe_log_<molecule>.txt under base_dir.
append_zpe_to_txt(zpe_df, base_dir)