In [2]:
import pandas as pd
from rdkit import Chem

# Master list of the 118 element symbols, hydrogen through oganesson, in
# order of increasing atomic number. Stored as whitespace-separated text and
# split into a list of symbol strings.
ALL_ELEMENTS = (
    "H He Li Be B C N O F Ne Na Mg Al Si P "
    "S Cl Ar K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn "
    "Ga Ge As Se Br Kr Rb Sr Y Zr Nb Mo Tc Ru "
    "Rh Pd Ag Cd In Sn Sb Te I Xe Cs Ba La Ce "
    "Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu Hf "
    "Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn "
    "Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm "
    "Md No Lr Rf Db Sg Bh Hs Mt Ds Rg Cn Nh Fl "
    "Mc Lv Ts Og"
).split()

def count_all_atoms(smiles):
    """Count the atoms of every element present in a SMILES string.

    The SMILES is parsed with RDKit and explicit hydrogens are added before
    counting, so hydrogens that are implicit in the SMILES are included in
    the ``'H'`` tally.

    Args:
        smiles: SMILES string describing the molecule or ion.

    Returns:
        dict mapping every symbol in ``ALL_ELEMENTS`` to its atom count.
        Every count is 0 when ``smiles`` is not a string (e.g. a NaN coming
        from a blank CSV cell), cannot be parsed, or cannot be processed by
        RDKit. Atoms whose symbol is not in ``ALL_ELEMENTS`` are ignored.
    """
    # Start with every element set to 0 so the result always has the same keys.
    atom_counts = dict.fromkeys(ALL_ELEMENTS, 0)

    # Missing values read by pandas arrive as float NaN, not str.
    if not isinstance(smiles, str):
        return atom_counts

    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals an unparseable SMILES by returning None.
        if mol is None:
            return atom_counts
        mol = Chem.AddHs(mol)
        symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    except Exception:
        # Best effort: any RDKit failure yields the zero-filled dict, while
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        return atom_counts

    # Tally only after all RDKit calls succeeded, so a failure can never
    # leave a partially filled result.
    for symbol in symbols:
        if symbol in atom_counts:
            atom_counts[symbol] += 1
    return atom_counts

# --- Main Script ---
# Reads the ion table, computes per-element atom counts from the 'smiles'
# column, and writes the table with one extra 'num_<symbol>' column per entry
# in ALL_ELEMENTS.
input_file = '../data/ions_with_smiles.csv'
output_file = '../data/ions_with_atomic_counts.csv'

try:
    df_ions = pd.read_csv(input_file)
    df_ions['atom_counts'] = df_ions['smiles'].apply(count_all_atoms)

    # Build the count table directly from the list of dicts. This avoids the
    # per-row Series construction of `.apply(pd.Series)`, and passing
    # `columns` explicitly guarantees every 'num_X' column exists (in
    # ALL_ELEMENTS order) even when the input has no rows.
    df_counts = pd.DataFrame(
        df_ions['atom_counts'].tolist(),
        index=df_ions.index,
        columns=ALL_ELEMENTS,
    )
    df_counts = df_counts.add_prefix('num_')

    # The intermediate dict column is not written to the output file.
    df_final = pd.concat([df_ions, df_counts], axis=1).drop(columns=['atom_counts'])
    df_final.to_csv(output_file, index=False)

    print(f"Successfully calculated full atomic compositions and saved to '{output_file}'")
    print(f"New descriptor file has {len(df_final.columns)} columns.")

except Exception as e:
    # Top-level boundary of the script: report the failure instead of raising.
    print(f"An error occurred: {e}")

Successfully calculated full atomic compositions and saved to '../data/ions_with_atomic_counts.csv'
New descriptor file has 122 columns.
