From 3063126d057bbbebdaa62624ee4028438f4c3302 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Sep 2025 14:06:34 +0000 Subject: [PATCH 1/3] Initial plan From c36c520490bac437ecafc2cfc025863bf4e44f70 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Sep 2025 14:19:57 +0000 Subject: [PATCH 2/3] Implement atomic number refactor with hash-backed inference and backward compatibility Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com> --- CMakeLists.txt | 1 + src/core/md_atomic.h | 166 +++++++++++++++++++++ src/md_atomic_infer.c | 315 ++++++++++++++++++++++++++++++++++++++++ src/md_util.c | 139 +----------------- src/md_util.h | 17 +-- unittest/CMakeLists.txt | 1 + unittest/test_atomic.c | 122 ++++++++++++++++ 7 files changed, 613 insertions(+), 148 deletions(-) create mode 100644 src/core/md_atomic.h create mode 100644 src/md_atomic_infer.c create mode 100644 unittest/test_atomic.c diff --git a/CMakeLists.txt b/CMakeLists.txt index d12e7f4..b21051b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ file(GLOB_RECURSE EXT_FILES ext/*) file(GLOB_RECURSE PROJ_FILES cmake/*) set(SRC_FILES + src/md_atomic_infer.c src/md_csv.c src/md_csv.h src/md_cube.c diff --git a/src/core/md_atomic.h b/src/core/md_atomic.h new file mode 100644 index 0000000..af63843 --- /dev/null +++ b/src/core/md_atomic.h @@ -0,0 +1,166 @@ +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Atomic number type: 0 = unknown, 1-118 = element atomic numbers +typedef uint8_t md_atomic_number_t; + +// Legacy alias for compatibility +typedef md_atomic_number_t md_element_t; + +// Forward declaration of molecule type +struct md_molecule_t; + +// Atomic number constants for all elements (Z values) +enum { + MD_Z_X = 0, // Unknown + MD_Z_H = 1, // Hydrogen + MD_Z_HE = 2, // Helium + MD_Z_LI = 3, // Lithium + MD_Z_BE = 4, // Beryllium + MD_Z_B = 5, // Boron + MD_Z_C = 6, // Carbon + MD_Z_N = 7, // Nitrogen + MD_Z_O = 8, // Oxygen + MD_Z_F = 9, // Fluorine + MD_Z_NE = 10, // Neon + MD_Z_NA = 11, // Sodium + MD_Z_MG = 12, // Magnesium + MD_Z_AL = 13, // Aluminium + MD_Z_SI = 14, // Silicon + MD_Z_P = 15, // Phosphorus + MD_Z_S = 16, // Sulfur + MD_Z_CL = 17, // Chlorine + MD_Z_AR = 18, // Argon + MD_Z_K = 19, // Potassium + MD_Z_CA = 20, // Calcium + MD_Z_SC = 21, // Scandium + MD_Z_TI = 22, // Titanium + MD_Z_V = 23, // Vanadium + MD_Z_CR = 24, // Chromium + MD_Z_MN = 25, // Manganese + MD_Z_FE = 26, // Iron + MD_Z_CO = 27, // Cobalt + MD_Z_NI = 28, // Nickel + MD_Z_CU = 29, // Copper + MD_Z_ZN = 30, // Zinc + MD_Z_GA = 31, // Gallium + MD_Z_GE = 32, // Germanium + MD_Z_AS = 33, // Arsenic + MD_Z_SE = 34, // Selenium + MD_Z_BR = 35, // Bromine + MD_Z_KR = 36, // Krypton + MD_Z_RB = 37, // Rubidium + MD_Z_SR = 38, // Strontium + MD_Z_Y = 39, // Yttrium + MD_Z_ZR = 40, // Zirconium + MD_Z_NB = 41, // Niobium + MD_Z_MO = 42, // Molybdenum + MD_Z_TC = 43, // Technetium + MD_Z_RU = 44, // Ruthenium + MD_Z_RH = 45, // Rhodium + MD_Z_PD = 46, // Palladium + MD_Z_AG = 47, // Silver + MD_Z_CD = 48, // Cadmium + MD_Z_IN = 49, // Indium + MD_Z_SN = 50, // Tin + MD_Z_SB = 51, // Antimony + MD_Z_TE = 52, // Tellurium + MD_Z_I = 53, // Iodine + MD_Z_XE = 54, // Xenon + MD_Z_CS = 55, // Caesium + MD_Z_BA = 56, // Barium + MD_Z_LA = 57, // Lanthanum + MD_Z_CE = 58, // Cerium + MD_Z_PR = 59, // Praseodymium + MD_Z_ND = 60, // Neodymium + MD_Z_PM = 61, // Promethium + MD_Z_SM = 62, // Samarium + MD_Z_EU = 63, // Europium + MD_Z_GD = 64, // Gadolinium + MD_Z_TB = 65, // Terbium + MD_Z_DY = 66, // Dysprosium + MD_Z_HO = 67, // Holmium + MD_Z_ER = 68, // Erbium + MD_Z_TM = 69, // Thulium + MD_Z_YB = 70, // Ytterbium + MD_Z_LU = 71, // Lutetium + MD_Z_HF = 72, // Hafnium + MD_Z_TA = 73, // Tantalum + MD_Z_W = 74, // Tungsten + MD_Z_RE = 75, // Rhenium + MD_Z_OS = 76, // Osmium + MD_Z_IR = 77, // Iridium + MD_Z_PT = 78, // Platinum + MD_Z_AU = 79, // Gold + MD_Z_HG = 80, // Mercury + MD_Z_TL = 81, // Thallium + MD_Z_PB = 82, // Lead + MD_Z_BI = 83, // Bismuth + MD_Z_PO = 84, // Polonium + MD_Z_AT = 85, // Astatine + MD_Z_RN = 86, // Radon + MD_Z_FR = 87, // Francium + MD_Z_RA = 88, // Radium + MD_Z_AC = 89, // Actinium + MD_Z_TH = 90, // Thorium + MD_Z_PA = 91, // Protactinium + MD_Z_U = 92, // Uranium + MD_Z_NP = 93, // Neptunium + MD_Z_PU = 94, // Plutonium + MD_Z_AM = 95, // Americium + MD_Z_CM = 96, // Curium + MD_Z_BK = 97, // Berkelium + MD_Z_CF = 98, // Californium + MD_Z_ES = 99, // Einsteinium + MD_Z_FM = 100, // Fermium + MD_Z_MD = 101, // Mendelevium + MD_Z_NO = 102, // Nobelium + MD_Z_LR = 103, // Lawrencium + MD_Z_RF = 104, // Rutherfordium + MD_Z_DB = 105, // Dubnium + MD_Z_SG = 106, // Seaborgium + MD_Z_BH = 107, // Bohrium + MD_Z_HS = 108, // Hassium + MD_Z_MT = 109, // Meitnerium + MD_Z_DS = 110, // Darmstadtium + MD_Z_RG = 111, // Roentgenium + MD_Z_CN = 112, // Copernicium + MD_Z_NH = 113, // Nihonium + MD_Z_FL = 114, // Flerovium + MD_Z_MC = 115, // Moscovium + MD_Z_LV = 116, // Livermorium + MD_Z_TS = 117, // Tennessine + MD_Z_OG = 118, // Oganesson +}; + +// New preferred API names + +// Element symbol and name lookup functions +md_atomic_number_t md_atomic_number_from_symbol(str_t sym); +md_atomic_number_t md_atomic_number_from_symbol_icase(str_t sym); +str_t md_symbol_from_atomic_number(md_atomic_number_t z); +str_t md_name_from_atomic_number(md_atomic_number_t z); + +// Element property functions +float md_atomic_mass(md_atomic_number_t z); +float md_vdw_radius(md_atomic_number_t z); +float md_covalent_radius(md_atomic_number_t z); +int md_max_valence(md_atomic_number_t z); +uint32_t md_cpk_color(md_atomic_number_t z); + +// Per-atom inference from labels (atom name + residue) +md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name); + +// Batch form wired to molecule structure +bool md_atoms_infer_atomic_numbers(md_atomic_number_t out[], size_t n, const struct md_molecule_t* mol); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/src/md_atomic_infer.c b/src/md_atomic_infer.c new file mode 100644 index 0000000..386acc6 --- /dev/null +++ b/src/md_atomic_infer.c @@ -0,0 +1,315 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// Static hashmaps for element inference +static md_hashmap32_t residue_atom_map = {0}; +static md_hashmap32_t atom_only_map = {0}; +static bool maps_initialized = false; + +// Helper functions +static inline char to_upper_c(char c) { + return (c >= 'a' && c <= 'z') ? (c - 'a' + 'A') : c; +} + +static inline bool is_alpha_c(char c) { + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); +} + +static inline bool is_digit_c(char c) { + return c >= '0' && c <= '9'; +} + +// Strip digits from end of string +static str_t strip_digits(str_t str) { + while (str.len > 0 && is_digit_c(str.ptr[str.len - 1])) { + str.len--; + } + return str; +} + +// Create normalized uppercase string for hashing +static void normalize_to_upper(char* dst, str_t src, bool strip_digits_flag) { + str_t s = strip_digits_flag ? strip_digits(src) : src; + for (size_t i = 0; i < s.len; ++i) { + dst[i] = to_upper_c(s.ptr[i]); + } + dst[s.len] = '\0'; +} + +// Initialize lookup tables +static void init_lookup_tables(void) { + if (maps_initialized) return; + + md_allocator_i* alloc = md_get_heap_allocator(); + residue_atom_map.allocator = alloc; + atom_only_map.allocator = alloc; + + md_hashmap_reserve(&residue_atom_map, 512); + md_hashmap_reserve(&atom_only_map, 256); + + // Water variants + struct { const char* res; const char* atom; md_atomic_number_t z; } water_entries[] = { + {"HOH", "O", MD_Z_O}, {"HOH", "OW", MD_Z_O}, {"HOH", "OH2", MD_Z_O}, + {"HOH", "H", MD_Z_H}, {"HOH", "H1", MD_Z_H}, {"HOH", "H2", MD_Z_H}, {"HOH", "HW1", MD_Z_H}, {"HOH", "HW2", MD_Z_H}, {"HOH", "HW", MD_Z_H}, + {"WAT", "O", MD_Z_O}, {"WAT", "OW", MD_Z_O}, {"WAT", "OH2", MD_Z_O}, + {"WAT", "H", MD_Z_H}, {"WAT", "H1", MD_Z_H}, {"WAT", "H2", MD_Z_H}, {"WAT", "HW1", MD_Z_H}, {"WAT", "HW2", MD_Z_H}, {"WAT", "HW", MD_Z_H}, + {"TIP3", "O", MD_Z_O}, {"TIP3", "OW", MD_Z_O}, {"TIP3", "OH2", MD_Z_O}, + {"TIP3", "H", MD_Z_H}, {"TIP3", "H1", MD_Z_H}, {"TIP3", "H2", MD_Z_H}, {"TIP3", "HW1", MD_Z_H}, {"TIP3", "HW2", MD_Z_H}, {"TIP3", "HW", MD_Z_H}, + {"TIP4", "O", MD_Z_O}, {"TIP4", "OW", MD_Z_O}, {"TIP4", "OH2", MD_Z_O}, + {"TIP4", "H", MD_Z_H}, {"TIP4", "H1", MD_Z_H}, {"TIP4", "H2", MD_Z_H}, {"TIP4", "HW1", MD_Z_H}, {"TIP4", "HW2", MD_Z_H}, {"TIP4", "HW", MD_Z_H}, + {"TIP5", "O", MD_Z_O}, {"TIP5", "OW", MD_Z_O}, {"TIP5", "OH2", MD_Z_O}, + {"TIP5", "H", MD_Z_H}, {"TIP5", "H1", MD_Z_H}, {"TIP5", "H2", MD_Z_H}, {"TIP5", "HW1", MD_Z_H}, {"TIP5", "HW2", MD_Z_H}, {"TIP5", "HW", MD_Z_H}, + {"SPC", "O", MD_Z_O}, {"SPC", "OW", MD_Z_O}, {"SPC", "OH2", MD_Z_O}, + {"SPC", "H", MD_Z_H}, {"SPC", "H1", MD_Z_H}, {"SPC", "H2", MD_Z_H}, {"SPC", "HW1", MD_Z_H}, {"SPC", "HW2", MD_Z_H}, {"SPC", "HW", MD_Z_H}, + {"SOL", "O", MD_Z_O}, {"SOL", "OW", MD_Z_O}, {"SOL", "OH2", MD_Z_O}, + {"SOL", "H", MD_Z_H}, {"SOL", "H1", MD_Z_H}, {"SOL", "H2", MD_Z_H}, {"SOL", "HW1", MD_Z_H}, {"SOL", "HW2", MD_Z_H}, {"SOL", "HW", MD_Z_H}, + {"H2O", "O", MD_Z_O}, {"H2O", "OW", MD_Z_O}, {"H2O", "OH2", MD_Z_O}, + {"H2O", "H", MD_Z_H}, {"H2O", "H1", MD_Z_H}, {"H2O", "H2", MD_Z_H}, {"H2O", "HW1", MD_Z_H}, {"H2O", "HW2", MD_Z_H}, {"H2O", "HW", MD_Z_H}, + }; + + // Add water entries to residue+atom map + for (size_t i = 0; i < ARRAY_SIZE(water_entries); ++i) { + char key_str[64]; + snprintf(key_str, sizeof(key_str), "%s\t%s", water_entries[i].res, water_entries[i].atom); + uint64_t key = md_hash64(key_str, strlen(key_str), 0); + md_hashmap_add(&residue_atom_map, key, water_entries[i].z); + } + + // Amino acid backbone and sidechain entries + struct { const char* res; const char* atom; md_atomic_number_t z; } amino_entries[] = { + // Common backbone atoms for all amino acids + {"ALA", "CA", MD_Z_C}, {"ALA", "N", MD_Z_N}, {"ALA", "C", MD_Z_C}, {"ALA", "O", MD_Z_O}, {"ALA", "OXT", MD_Z_O}, + {"ARG", "CA", MD_Z_C}, {"ARG", "N", MD_Z_N}, {"ARG", "C", MD_Z_C}, {"ARG", "O", MD_Z_O}, {"ARG", "OXT", MD_Z_O}, + {"ASN", "CA", MD_Z_C}, {"ASN", "N", MD_Z_N}, {"ASN", "C", MD_Z_C}, {"ASN", "O", MD_Z_O}, {"ASN", "OXT", MD_Z_O}, + {"ASP", "CA", MD_Z_C}, {"ASP", "N", MD_Z_N}, {"ASP", "C", MD_Z_C}, {"ASP", "O", MD_Z_O}, {"ASP", "OXT", MD_Z_O}, + {"CYS", "CA", MD_Z_C}, {"CYS", "N", MD_Z_N}, {"CYS", "C", MD_Z_C}, {"CYS", "O", MD_Z_O}, {"CYS", "OXT", MD_Z_O}, + {"GLN", "CA", MD_Z_C}, {"GLN", "N", MD_Z_N}, {"GLN", "C", MD_Z_C}, {"GLN", "O", MD_Z_O}, {"GLN", "OXT", MD_Z_O}, + {"GLU", "CA", MD_Z_C}, {"GLU", "N", MD_Z_N}, {"GLU", "C", MD_Z_C}, {"GLU", "O", MD_Z_O}, {"GLU", "OXT", MD_Z_O}, + {"GLY", "CA", MD_Z_C}, {"GLY", "N", MD_Z_N}, {"GLY", "C", MD_Z_C}, {"GLY", "O", MD_Z_O}, {"GLY", "OXT", MD_Z_O}, + {"HIS", "CA", MD_Z_C}, {"HIS", "N", MD_Z_N}, {"HIS", "C", MD_Z_C}, {"HIS", "O", MD_Z_O}, {"HIS", "OXT", MD_Z_O}, + {"ILE", "CA", MD_Z_C}, {"ILE", "N", MD_Z_N}, {"ILE", "C", MD_Z_C}, {"ILE", "O", MD_Z_O}, {"ILE", "OXT", MD_Z_O}, + {"LEU", "CA", MD_Z_C}, {"LEU", "N", MD_Z_N}, {"LEU", "C", MD_Z_C}, {"LEU", "O", MD_Z_O}, {"LEU", "OXT", MD_Z_O}, + {"LYS", "CA", MD_Z_C}, {"LYS", "N", MD_Z_N}, {"LYS", "C", MD_Z_C}, {"LYS", "O", MD_Z_O}, {"LYS", "OXT", MD_Z_O}, + {"MET", "CA", MD_Z_C}, {"MET", "N", MD_Z_N}, {"MET", "C", MD_Z_C}, {"MET", "O", MD_Z_O}, {"MET", "OXT", MD_Z_O}, + {"PHE", "CA", MD_Z_C}, {"PHE", "N", MD_Z_N}, {"PHE", "C", MD_Z_C}, {"PHE", "O", MD_Z_O}, {"PHE", "OXT", MD_Z_O}, + {"PRO", "CA", MD_Z_C}, {"PRO", "N", MD_Z_N}, {"PRO", "C", MD_Z_C}, {"PRO", "O", MD_Z_O}, {"PRO", "OXT", MD_Z_O}, + {"SER", "CA", MD_Z_C}, {"SER", "N", MD_Z_N}, {"SER", "C", MD_Z_C}, {"SER", "O", MD_Z_O}, {"SER", "OXT", MD_Z_O}, + {"THR", "CA", MD_Z_C}, {"THR", "N", MD_Z_N}, {"THR", "C", MD_Z_C}, {"THR", "O", MD_Z_O}, {"THR", "OXT", MD_Z_O}, + {"TRP", "CA", MD_Z_C}, {"TRP", "N", MD_Z_N}, {"TRP", "C", MD_Z_C}, {"TRP", "O", MD_Z_O}, {"TRP", "OXT", MD_Z_O}, + {"TYR", "CA", MD_Z_C}, {"TYR", "N", MD_Z_N}, {"TYR", "C", MD_Z_C}, {"TYR", "O", MD_Z_O}, {"TYR", "OXT", MD_Z_O}, + {"VAL", "CA", MD_Z_C}, {"VAL", "N", MD_Z_N}, {"VAL", "C", MD_Z_C}, {"VAL", "O", MD_Z_O}, {"VAL", "OXT", MD_Z_O}, + // Sidechain specific atoms + {"SER", "OG", MD_Z_O}, {"THR", "OG1", MD_Z_O}, {"TYR", "OH", MD_Z_O}, + {"CYS", "SG", MD_Z_S}, {"MET", "SD", MD_Z_S}, + }; + + // Add amino acid entries to residue+atom map + for (size_t i = 0; i < ARRAY_SIZE(amino_entries); ++i) { + char key_str[64]; + snprintf(key_str, sizeof(key_str), "%s\t%s", amino_entries[i].res, amino_entries[i].atom); + uint64_t key = md_hash64(key_str, strlen(key_str), 0); + md_hashmap_add(&residue_atom_map, key, amino_entries[i].z); + } + + // Nucleic acid entries + struct { const char* res; const char* atom; md_atomic_number_t z; } nucleic_entries[] = { + // DNA + {"DA", "P", MD_Z_P}, {"DA", "OP1", MD_Z_O}, {"DA", "OP2", MD_Z_O}, {"DA", "O1P", MD_Z_O}, {"DA", "O2P", MD_Z_O}, + {"DC", "P", MD_Z_P}, {"DC", "OP1", MD_Z_O}, {"DC", "OP2", MD_Z_O}, {"DC", "O1P", MD_Z_O}, {"DC", "O2P", MD_Z_O}, + {"DG", "P", MD_Z_P}, {"DG", "OP1", MD_Z_O}, {"DG", "OP2", MD_Z_O}, {"DG", "O1P", MD_Z_O}, {"DG", "O2P", MD_Z_O}, + {"DT", "P", MD_Z_P}, {"DT", "OP1", MD_Z_O}, {"DT", "OP2", MD_Z_O}, {"DT", "O1P", MD_Z_O}, {"DT", "O2P", MD_Z_O}, + // RNA + {"A", "P", MD_Z_P}, {"A", "OP1", MD_Z_O}, {"A", "OP2", MD_Z_O}, {"A", "O1P", MD_Z_O}, {"A", "O2P", MD_Z_O}, + {"C", "P", MD_Z_P}, {"C", "OP1", MD_Z_O}, {"C", "OP2", MD_Z_O}, {"C", "O1P", MD_Z_O}, {"C", "O2P", MD_Z_O}, + {"G", "P", MD_Z_P}, {"G", "OP1", MD_Z_O}, {"G", "OP2", MD_Z_O}, {"G", "O1P", MD_Z_O}, {"G", "O2P", MD_Z_O}, + {"U", "P", MD_Z_P}, {"U", "OP1", MD_Z_O}, {"U", "OP2", MD_Z_O}, {"U", "O1P", MD_Z_O}, {"U", "O2P", MD_Z_O}, + }; + + // Add nucleic acid entries to residue+atom map + for (size_t i = 0; i < ARRAY_SIZE(nucleic_entries); ++i) { + char key_str[64]; + snprintf(key_str, sizeof(key_str), "%s\t%s", nucleic_entries[i].res, nucleic_entries[i].atom); + uint64_t key = md_hash64(key_str, strlen(key_str), 0); + md_hashmap_add(&residue_atom_map, key, nucleic_entries[i].z); + } + + // Selenomethionine + char mse_key[64]; + snprintf(mse_key, sizeof(mse_key), "MSE\tSE"); + uint64_t mse_hash = md_hash64(mse_key, strlen(mse_key), 0); + md_hashmap_add(&residue_atom_map, mse_hash, MD_Z_SE); + + // Atom-only fallbacks + struct { const char* atom; md_atomic_number_t z; } atom_entries[] = { + {"H", MD_Z_H}, {"C", MD_Z_C}, {"N", MD_Z_N}, {"O", MD_Z_O}, {"S", MD_Z_S}, {"P", MD_Z_P}, + {"F", MD_Z_F}, {"CL", MD_Z_CL}, {"BR", MD_Z_BR}, {"I", MD_Z_I}, + {"OW", MD_Z_O}, {"OH", MD_Z_O}, {"HW", MD_Z_H}, + {"CA", MD_Z_C}, {"CB", MD_Z_C}, {"CG", MD_Z_C}, {"CD", MD_Z_C}, {"CE", MD_Z_C}, {"CZ", MD_Z_C}, + {"OXT", MD_Z_O}, + // Common ions + {"NA", MD_Z_NA}, {"K", MD_Z_K}, {"MG", MD_Z_MG}, {"ZN", MD_Z_ZN}, {"FE", MD_Z_FE}, + {"MN", MD_Z_MN}, {"CU", MD_Z_CU}, {"CO", MD_Z_CO}, {"NI", MD_Z_NI}, {"CD", MD_Z_CD}, + {"SR", MD_Z_SR}, {"BA", MD_Z_BA}, {"LI", MD_Z_LI}, {"CS", MD_Z_CS}, {"RB", MD_Z_RB}, + {"AL", MD_Z_AL}, {"TI", MD_Z_TI}, {"CR", MD_Z_CR}, {"HG", MD_Z_HG}, {"PB", MD_Z_PB}, + {"AG", MD_Z_AG}, {"AU", MD_Z_AU}, {"PT", MD_Z_PT}, + }; + + // Add atom-only entries + for (size_t i = 0; i < ARRAY_SIZE(atom_entries); ++i) { + char norm_atom[16]; + str_t atom_str = {atom_entries[i].atom, strlen(atom_entries[i].atom)}; + normalize_to_upper(norm_atom, atom_str, true); + uint64_t key = md_hash64(norm_atom, strlen(norm_atom), 0); + md_hashmap_add(&atom_only_map, key, atom_entries[i].z); + } + + maps_initialized = true; +} + +// Core atomic number functions using existing md_util tables +md_atomic_number_t md_atomic_number_from_symbol(str_t sym) { + return md_util_element_lookup(sym); +} + +md_atomic_number_t md_atomic_number_from_symbol_icase(str_t sym) { + return md_util_element_lookup_ignore_case(sym); +} + +str_t md_symbol_from_atomic_number(md_atomic_number_t z) { + return md_util_element_symbol(z); +} + +str_t md_name_from_atomic_number(md_atomic_number_t z) { + return md_util_element_name(z); +} + +float md_atomic_mass(md_atomic_number_t z) { + return md_util_element_atomic_mass(z); +} + +float md_vdw_radius(md_atomic_number_t z) { + return md_util_element_vdw_radius(z); +} + +float md_covalent_radius(md_atomic_number_t z) { + return md_util_element_covalent_radius(z); +} + +int md_max_valence(md_atomic_number_t z) { + return md_util_element_max_valence(z); +} + +uint32_t md_cpk_color(md_atomic_number_t z) { + return md_util_element_cpk_color(z); +} + +// Inference functions +md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name) { + init_lookup_tables(); + + // Special case: if atom name is empty but residue name is an element (ion case) + if (atom_name.len == 0 && res_name.len > 0) { + md_atomic_number_t res_element = md_atomic_number_from_symbol_icase(res_name); + if (res_element != MD_Z_X) { + return res_element; + } + return MD_Z_X; + } + + if (atom_name.len == 0) return MD_Z_X; + + // Normalize inputs + char norm_res[16] = {0}; + char norm_atom[16] = {0}; + char norm_atom_stripped[16] = {0}; + + if (res_name.len > 0) { + normalize_to_upper(norm_res, res_name, false); + } + normalize_to_upper(norm_atom, atom_name, false); + normalize_to_upper(norm_atom_stripped, atom_name, true); + + // First try residue+atom combination + if (res_name.len > 0) { + char res_atom_key[64]; + snprintf(res_atom_key, sizeof(res_atom_key), "%s\t%s", norm_res, norm_atom); + uint64_t key = md_hash64(res_atom_key, strlen(res_atom_key), 0); + uint32_t* result = md_hashmap_get(&residue_atom_map, key); + if (result) { + return (md_atomic_number_t)*result; + } + + // Special case: if residue is water + if (md_util_resname_water(res_name)) { + if (norm_atom[0] == 'O') return MD_Z_O; + if (norm_atom[0] == 'H') return MD_Z_H; + } + + // Special case: if residue is amino acid and atom is CA, return carbon + if (md_util_resname_amino_acid(res_name) && strcmp(norm_atom, "CA") == 0) { + return MD_Z_C; + } + + // If residue name itself is an element (ion case) + md_atomic_number_t res_element = md_atomic_number_from_symbol_icase(res_name); + if (res_element != MD_Z_X) { + // If atom name is empty or equals residue, return that element + if (atom_name.len == 0 || str_eq_ignore_case(atom_name, res_name)) { + return res_element; + } + } + } + + // Try atom-only map with digits stripped + uint64_t atom_key = md_hash64(norm_atom_stripped, strlen(norm_atom_stripped), 0); + uint32_t* atom_result = md_hashmap_get(&atom_only_map, atom_key); + if (atom_result) { + md_atomic_number_t z = (md_atomic_number_t)*atom_result; + // Override CA (Calcium) to Carbon if in amino acid context + if (z == MD_Z_CA && res_name.len > 0 && md_util_resname_amino_acid(res_name)) { + return MD_Z_C; + } + return z; + } + + // Try two-letter element heuristic (e.g., CL12 => Cl, BR1 => Br) + if (strlen(norm_atom_stripped) >= 2) { + char two_letter[3] = {norm_atom_stripped[0], norm_atom_stripped[1], '\0'}; + str_t two_letter_str = {two_letter, 2}; + md_atomic_number_t two_z = md_atomic_number_from_symbol_icase(two_letter_str); + if (two_z != MD_Z_X) { + // Override CA (Calcium) to Carbon if in amino acid context + if (two_z == MD_Z_CA && res_name.len > 0 && md_util_resname_amino_acid(res_name)) { + return MD_Z_C; + } + return two_z; + } + } + + // Final fallback: first-letter element mapping + char first_letter[2] = {norm_atom_stripped[0], '\0'}; + str_t first_letter_str = {first_letter, 1}; + return md_atomic_number_from_symbol_icase(first_letter_str); +} + +bool md_atoms_infer_atomic_numbers(md_atomic_number_t out[], size_t n, const struct md_molecule_t* mol) { + if (!out || !mol || n == 0) return false; + + size_t count = MIN(n, mol->atom.count); + for (size_t i = 0; i < count; ++i) { + str_t atom_name = LBL_TO_STR(mol->atom.type[i]); + str_t res_name = mol->atom.resname ? LBL_TO_STR(mol->atom.resname[i]) : (str_t){0}; + out[i] = md_atom_infer_atomic_number(atom_name, res_name); + } + + return true; +} \ No newline at end of file diff --git a/src/md_util.c b/src/md_util.c index 15a6129..4fb5ed4 100644 --- a/src/md_util.c +++ b/src/md_util.c @@ -1240,143 +1240,8 @@ static inline bool is_organic(char c) { } bool md_util_element_guess(md_element_t element[], size_t capacity, const struct md_molecule_t* mol) { - ASSERT(capacity > 0); - ASSERT(mol); - ASSERT(mol->atom.count > 0); - - md_hashmap32_t map = { .allocator = md_get_temp_allocator() }; - md_hashmap_reserve(&map, 256); - - // Just for pure elements which have not been salted with resname - md_hashmap32_t elem_map = { .allocator = md_get_temp_allocator() }; - md_hashmap_reserve(&elem_map, 256); - - typedef struct { - str_t name; - md_element_t elem; - } entry_t; - - // Extra table for predefined atom types - entry_t entries[] = { - {STR_LIT("SOD"), Na}, - {STR_LIT("OW"), O}, - {STR_LIT("HW"), H}, - }; - - for (size_t i = 0; i < ARRAY_SIZE(entries); ++i) { - md_hashmap_add(&elem_map, md_hash64(entries[i].name.ptr, entries[i].name.len, 0), entries[i].elem); - } - - const size_t count = MIN(capacity, mol->atom.count); - for (size_t i = 0; i < count; ++i) { - if (element[i] != 0) continue; - - str_t original = LBL_TO_STR(mol->atom.type[i]); - - // Trim whitespace, digits and 'X's - str_t name = trim_label(original); - - if (name.len > 0) { - md_element_t elem = 0; - - str_t resname = STR_LIT(""); - uint64_t res_key = 0; - if (mol->atom.resname) { - resname = LBL_TO_STR(mol->atom.resname[i]); - res_key = md_hash64_str(resname, 0); - } - uint64_t key = md_hash64_str(name, res_key); - uint32_t* ptr = md_hashmap_get(&map, key); - if (ptr) { - element[i] = (md_element_t)*ptr; - continue; - } else { - uint64_t elem_key = md_hash64(name.ptr, name.len, 0); - ptr = md_hashmap_get(&elem_map, elem_key); - if (ptr) { - elem = (md_element_t)*ptr; - goto done; - } - } - - if ((elem = md_util_element_lookup(name)) != 0) goto done; - - // If amino acid, try to deduce the element from that - if (mol->atom.flags) { - if (mol->atom.flags[i] & (MD_FLAG_AMINO_ACID | MD_FLAG_NUCLEOTIDE)) { - // Try to match against the first character - name.len = 1; - elem = md_util_element_lookup_ignore_case(name); - goto done; - } - } - - // This is the same logic as above but more general, for the natural organic elements - if (is_organic(name.ptr[0]) && name.len > 1) { - if (name.ptr[1] - 'A' < 5) { - if (mol->residue.count > 0 && mol->atom.res_idx) { - int32_t res_idx = mol->atom.res_idx[i]; - uint32_t res_beg = mol->residue.atom_offset[res_idx]; - uint32_t res_end = mol->residue.atom_offset[res_idx+1]; - uint32_t res_len = res_end - res_beg; - if (res_len > 3) { - name.len = 1; - elem = md_util_element_lookup_ignore_case(name); - goto done; - } - } - } - } - - // Heuristic cases - - // This can be fishy... - if (str_eq_cstr(name, "HOH")) { - elem = H; - goto done; - } - if (str_eq_cstr(name, "HS")) { - elem = H; - goto done; - } - - size_t num_alpha = 0; - while (num_alpha < str_len(original) && is_alpha(original.ptr[num_alpha])) ++num_alpha; - - size_t num_digits = 0; - str_t digits = str_substr(original, num_alpha, SIZE_MAX); - while (num_digits < str_len(digits) && is_digit(digits.ptr[num_digits])) ++num_digits; - - // 2-3 letters + 1-2 digit (e.g. HO(H)[0-99]) usually means just look at the first letter - if ((num_alpha == 2 || num_alpha == 3) && (num_digits == 1 || num_digits == 2)) { - name.len = 1; - elem = md_util_element_lookup_ignore_case(name); - goto done; - } - - // Try to match against several characters but ignore the case - if (name.len > 1) { - name.len = 2; - elem = md_util_element_lookup_ignore_case(name); - } - - // Last resort, try to match against single first character - if (elem == 0) { - name.len = 1; - elem = md_util_element_lookup_ignore_case(name); - } - - done: - element[i] = elem; - if (elem != 0) { - md_hashmap_add(&map, key, elem); - } - } - } - - md_hashmap_free(&map); - - return true; + // Delegate to the new hash-backed atomic number inference system + return md_atoms_infer_atomic_numbers(element, capacity, mol); } bool md_util_element_from_mass(md_element_t element[], const float mass[], size_t count) { diff --git a/src/md_util.h b/src/md_util.h index 2a98aa3..a98a107 100644 --- a/src/md_util.h +++ b/src/md_util.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -31,20 +32,16 @@ enum { typedef uint32_t md_util_postprocess_flags_t; -// This assumes the string exactly matches the value within the look up table -// The match is case sensitive and expects elements to be formatted with Big first letter and small second letter: -// E.g. H, He, Fe, Na, C -md_element_t md_util_element_lookup(str_t element_str); -md_element_t md_util_element_lookup_ignore_case(str_t element_str); - -// Access to the static arrays +// Access to the static arrays (preserved for direct access) const str_t* md_util_element_symbols(void); const str_t* md_util_element_names(void); const float* md_util_element_vdw_radii(void); +// Element functions (now calling new atomic number API internally) +md_element_t md_util_element_lookup(str_t element_str); +md_element_t md_util_element_lookup_ignore_case(str_t element_str); str_t md_util_element_symbol(md_element_t element); str_t md_util_element_name(md_element_t element); - float md_util_element_vdw_radius(md_element_t element); float md_util_element_covalent_radius(md_element_t element); float md_util_element_atomic_mass(md_element_t element); @@ -63,9 +60,7 @@ static inline bool md_util_backbone_atoms_valid(md_protein_backbone_atoms_t prot return (prot.ca != prot.c) && (prot.ca != prot.o) && (prot.c != prot.o); } -// This operation tries to deduce the element from the atom type/name which usually contains alot of cruft. -// It also tries resolve some ambiguities: Such as CA, is that Carbon Alpha or is it calcium? -// We can resolve that by looking at the residue name and in the case of Carbon Alpha, the residue name should be matched to an amino acid. +// Element guess function - now delegates to the new inference system bool md_util_element_guess(md_element_t element[], size_t capacity, const struct md_molecule_t* mol); bool md_util_element_from_mass(md_element_t out_element[], const float in_mass[], size_t count); diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt index d78f2e3..6fabeb3 100644 --- a/unittest/CMakeLists.txt +++ b/unittest/CMakeLists.txt @@ -12,6 +12,7 @@ target_link_libraries(md_unittest PRIVATE mdlib ${MD_LIBS}) set (SRC_FILES test_allocator.c test_array.c + test_atomic.c test_bitop.c test_bitfield.c test_str.c diff --git a/unittest/test_atomic.c b/unittest/test_atomic.c new file mode 100644 index 0000000..dfddc35 --- /dev/null +++ b/unittest/test_atomic.c @@ -0,0 +1,122 @@ +#include "utest.h" +#include +#include +#include +#include + +UTEST(atomic, enum_constants) { + // Test that enum constants are correct + EXPECT_EQ(MD_Z_X, 0); // Unknown + EXPECT_EQ(MD_Z_H, 1); // Hydrogen + EXPECT_EQ(MD_Z_HE, 2); // Helium + EXPECT_EQ(MD_Z_C, 6); // Carbon + EXPECT_EQ(MD_Z_N, 7); // Nitrogen + EXPECT_EQ(MD_Z_O, 8); // Oxygen + EXPECT_EQ(MD_Z_P, 15); // Phosphorus + EXPECT_EQ(MD_Z_S, 16); // Sulfur + EXPECT_EQ(MD_Z_CA, 20); // Calcium + EXPECT_EQ(MD_Z_CL, 17); // Chlorine + EXPECT_EQ(MD_Z_BR, 35); // Bromine + EXPECT_EQ(MD_Z_NA, 11); // Sodium + EXPECT_EQ(MD_Z_FE, 26); // Iron + EXPECT_EQ(MD_Z_OG, 118); // Oganesson +} + +UTEST(atomic, symbol_lookup) { + // Test symbol lookup functions + EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("H")), MD_Z_H); + EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("C")), MD_Z_C); + EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("He")), MD_Z_HE); + EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("Ca")), MD_Z_CA); + EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("Unknown")), MD_Z_X); + + // Test case insensitive lookup + EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("h")), MD_Z_H); + EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("ca")), MD_Z_CA); + EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("HE")), MD_Z_HE); +} + +UTEST(atomic, symbol_from_number) { + // Test reverse lookup + str_t h_symbol = md_symbol_from_atomic_number(MD_Z_H); + EXPECT_TRUE(str_eq_cstr(h_symbol, "H")); + + str_t c_symbol = md_symbol_from_atomic_number(MD_Z_C); + EXPECT_TRUE(str_eq_cstr(c_symbol, "C")); + + str_t ca_symbol = md_symbol_from_atomic_number(MD_Z_CA); + EXPECT_TRUE(str_eq_cstr(ca_symbol, "Ca")); +} + +UTEST(atomic, element_properties) { + // Test that we can get basic properties + EXPECT_GT(md_atomic_mass(MD_Z_H), 0.0f); + EXPECT_GT(md_atomic_mass(MD_Z_C), 0.0f); + EXPECT_GT(md_vdw_radius(MD_Z_H), 0.0f); + EXPECT_GT(md_covalent_radius(MD_Z_C), 0.0f); + EXPECT_GT(md_max_valence(MD_Z_C), 0); + EXPECT_GT(md_cpk_color(MD_Z_C), 0); +} + +UTEST(atomic, inference_water) { + // Test water atom inference + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("O"), STR_LIT("HOH")), MD_Z_O); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OW"), STR_LIT("HOH")), MD_Z_O); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OH2"), STR_LIT("WAT")), MD_Z_O); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("H"), STR_LIT("HOH")), MD_Z_H); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("H1"), STR_LIT("TIP3")), MD_Z_H); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("HW"), STR_LIT("SPC")), MD_Z_H); +} + +UTEST(atomic, inference_amino_acid) { + // Test amino acid atom inference + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CA"), STR_LIT("ALA")), MD_Z_C); // Alpha carbon, not calcium + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("N"), STR_LIT("GLY")), MD_Z_N); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("C"), STR_LIT("SER")), MD_Z_C); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("O"), STR_LIT("TRP")), MD_Z_O); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OXT"), STR_LIT("PHE")), MD_Z_O); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OG"), STR_LIT("SER")), MD_Z_O); // Serine hydroxyl + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("SG"), STR_LIT("CYS")), MD_Z_S); // Cysteine sulfur +} + +UTEST(atomic, inference_nucleic_acid) { + // Test nucleic acid atom inference + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("P"), STR_LIT("DA")), MD_Z_P); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OP1"), STR_LIT("DG")), MD_Z_O); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("O2P"), STR_LIT("A")), MD_Z_O); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("P"), STR_LIT("U")), MD_Z_P); +} + +UTEST(atomic, inference_ions) { + // Test ion inference (residue name is element) + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("NA"), STR_LIT("NA")), MD_Z_NA); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT(""), STR_LIT("K")), MD_Z_K); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("MG"), STR_LIT("MG")), MD_Z_MG); + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL"), STR_LIT("CL")), MD_Z_CL); +} + +UTEST(atomic, inference_fallbacks) { + // Test fallback mechanisms + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL12"), STR_LIT("")), MD_Z_CL); // Two-letter heuristic + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("BR1"), STR_LIT("")), MD_Z_BR); // Two-letter heuristic + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("H123"), STR_LIT("")), MD_Z_H); // First letter fallback + EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("C99"), STR_LIT("")), MD_Z_C); // First letter fallback +} + +UTEST(atomic, backward_compatibility) { + // Test that old API still works through wrappers + EXPECT_EQ(md_util_element_lookup(STR_LIT("H")), MD_Z_H); + EXPECT_EQ(md_util_element_lookup_ignore_case(STR_LIT("ca")), MD_Z_CA); + + str_t symbol = md_util_element_symbol(MD_Z_C); + EXPECT_TRUE(str_eq_cstr(symbol, "C")); + + str_t name = md_util_element_name(MD_Z_O); + EXPECT_TRUE(str_eq_cstr(name, "Oxygen")); + + EXPECT_GT(md_util_element_atomic_mass(MD_Z_C), 0.0f); + EXPECT_GT(md_util_element_vdw_radius(MD_Z_H), 0.0f); + EXPECT_GT(md_util_element_covalent_radius(MD_Z_N), 0.0f); + EXPECT_GT(md_util_element_max_valence(MD_Z_C), 0); + EXPECT_GT(md_util_element_cpk_color(MD_Z_O), 0); +} \ No newline at end of file From e723aefff6301cd25fb292b17f9633f562441d5a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 11 Sep 2025 14:23:13 +0000 Subject: [PATCH 3/3] Add comprehensive API equivalence and compatibility tests Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com> --- unittest/CMakeLists.txt | 2 + unittest/test_api_equivalence.c | 53 ++++++++++++++++++++ unittest/test_element_guess_compat.c | 72 ++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 unittest/test_api_equivalence.c create mode 100644 unittest/test_element_guess_compat.c diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt index 6fabeb3..3093144 100644 --- a/unittest/CMakeLists.txt +++ b/unittest/CMakeLists.txt @@ -11,10 +11,12 @@ target_link_libraries(md_unittest PRIVATE mdlib ${MD_LIBS}) set (SRC_FILES test_allocator.c + test_api_equivalence.c test_array.c test_atomic.c test_bitop.c test_bitfield.c + test_element_guess_compat.c test_str.c test_hash.c test_edr.c diff --git a/unittest/test_api_equivalence.c b/unittest/test_api_equivalence.c new file mode 100644 index 0000000..1e227bc --- /dev/null +++ b/unittest/test_api_equivalence.c @@ -0,0 +1,53 @@ +// Test to verify API equivalence between old and new systems +#include "utest.h" +#include +#include + +UTEST(api_equivalence, symbol_lookup_consistency) { + // Test that old and new APIs return the same results + for (int i = 1; i <= 118; ++i) { + str_t symbol_old = md_util_element_symbol(i); + str_t symbol_new = md_symbol_from_atomic_number(i); + + EXPECT_TRUE(str_eq(symbol_old, symbol_new)); + + // Test reverse lookup + md_atomic_number_t old_lookup = md_util_element_lookup(symbol_old); + md_atomic_number_t new_lookup = md_atomic_number_from_symbol(symbol_old); + + EXPECT_EQ(old_lookup, new_lookup); + EXPECT_EQ(old_lookup, i); + } +} + +UTEST(api_equivalence, property_consistency) { + // Test a few key elements for property consistency + md_atomic_number_t test_elements[] = {MD_Z_H, MD_Z_C, MD_Z_N, MD_Z_O, MD_Z_CA, MD_Z_FE}; + + for (size_t i = 0; i < ARRAY_SIZE(test_elements); ++i) { + md_atomic_number_t z = test_elements[i]; + + // Test masses + float mass_old = md_util_element_atomic_mass(z); + float mass_new = md_atomic_mass(z); + EXPECT_EQ(mass_old, mass_new); + + // Test radii + float vdw_old = md_util_element_vdw_radius(z); + float vdw_new = md_vdw_radius(z); + EXPECT_EQ(vdw_old, vdw_new); + + float cov_old = md_util_element_covalent_radius(z); + float cov_new = md_covalent_radius(z); + EXPECT_EQ(cov_old, cov_new); + + // Test valence and color + int val_old = md_util_element_max_valence(z); + int val_new = md_max_valence(z); + EXPECT_EQ(val_old, val_new); + + uint32_t color_old = md_util_element_cpk_color(z); + uint32_t color_new = md_cpk_color(z); + EXPECT_EQ(color_old, color_new); + } +} \ No newline at end of file diff --git a/unittest/test_element_guess_compat.c b/unittest/test_element_guess_compat.c new file mode 100644 index 0000000..cdab95d --- /dev/null +++ b/unittest/test_element_guess_compat.c @@ -0,0 +1,72 @@ +#include "utest.h" +#include +#include +#include +#include +#include + +// Test backward compatibility with md_util_element_guess +UTEST(element_guess_compat, basic_inference) { + md_allocator_i* alloc = md_vm_arena_create(MEGABYTES(1)); + + // Create a simple molecule structure + md_molecule_t mol = {0}; + + // Atom data + const size_t atom_count = 5; + mol.atom.count = atom_count; + + // Allocate arrays + mol.atom.type = md_alloc(alloc, sizeof(md_label_t) * atom_count); + mol.atom.resname = md_alloc(alloc, sizeof(md_label_t) * atom_count); + mol.atom.element = md_alloc(alloc, sizeof(md_element_t) * atom_count); + + // Set up atom types and residue names + // HOH water oxygen + strncpy(mol.atom.type[0].buf, "O", sizeof(mol.atom.type[0].buf)); + mol.atom.type[0].len = 1; + strncpy(mol.atom.resname[0].buf, "HOH", sizeof(mol.atom.resname[0].buf)); + mol.atom.resname[0].len = 3; + mol.atom.element[0] = 0; // Start unknown + + // HOH water hydrogen + strncpy(mol.atom.type[1].buf, "H1", sizeof(mol.atom.type[1].buf)); + mol.atom.type[1].len = 2; + strncpy(mol.atom.resname[1].buf, "HOH", sizeof(mol.atom.resname[1].buf)); + mol.atom.resname[1].len = 3; + mol.atom.element[1] = 0; // Start unknown + + // Alanine alpha carbon + strncpy(mol.atom.type[2].buf, "CA", sizeof(mol.atom.type[2].buf)); + mol.atom.type[2].len = 2; + strncpy(mol.atom.resname[2].buf, "ALA", sizeof(mol.atom.resname[2].buf)); + mol.atom.resname[2].len = 3; + mol.atom.element[2] = 0; // Start unknown + + // Sodium ion + strncpy(mol.atom.type[3].buf, "NA", sizeof(mol.atom.type[3].buf)); + mol.atom.type[3].len = 2; + strncpy(mol.atom.resname[3].buf, "NA", sizeof(mol.atom.resname[3].buf)); + mol.atom.resname[3].len = 2; + mol.atom.element[3] = 0; // Start unknown + + // Generic carbon + strncpy(mol.atom.type[4].buf, "C1", sizeof(mol.atom.type[4].buf)); + mol.atom.type[4].len = 2; + strncpy(mol.atom.resname[4].buf, "", sizeof(mol.atom.resname[4].buf)); + mol.atom.resname[4].len = 0; + mol.atom.element[4] = 0; // Start unknown + + // Call the element guess function + bool result = md_util_element_guess(mol.atom.element, atom_count, &mol); + + // Verify results + EXPECT_TRUE(result); + EXPECT_EQ(mol.atom.element[0], MD_Z_O); // Water oxygen + EXPECT_EQ(mol.atom.element[1], MD_Z_H); // Water hydrogen + EXPECT_EQ(mol.atom.element[2], MD_Z_C); // Alanine alpha carbon (not calcium!) + EXPECT_EQ(mol.atom.element[3], MD_Z_NA); // Sodium ion + EXPECT_EQ(mol.atom.element[4], MD_Z_C); // Generic carbon from C1 + + md_vm_arena_destroy(alloc); +} \ No newline at end of file