From 3063126d057bbbebdaa62624ee4028438f4c3302 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Sep 2025 14:06:34 +0000
Subject: [PATCH 01/20] Initial plan


From c36c520490bac437ecafc2cfc025863bf4e44f70 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Sep 2025 14:19:57 +0000
Subject: [PATCH 02/20] Implement atomic number refactor with hash-backed
 inference and backward compatibility

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 CMakeLists.txt          |   1 +
 src/core/md_atomic.h    | 166 +++++++++++++++++++++
 src/md_atomic_infer.c   | 315 ++++++++++++++++++++++++++++++++++++++++
 src/md_util.c           | 139 +-----------------
 src/md_util.h           |  17 +--
 unittest/CMakeLists.txt |   1 +
 unittest/test_atomic.c  | 122 ++++++++++++++++
 7 files changed, 613 insertions(+), 148 deletions(-)
 create mode 100644 src/core/md_atomic.h
 create mode 100644 src/md_atomic_infer.c
 create mode 100644 unittest/test_atomic.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d12e7f4..b21051b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,7 @@ file(GLOB_RECURSE EXT_FILES ext/*)
 file(GLOB_RECURSE PROJ_FILES cmake/*)
 
 set(SRC_FILES
+    src/md_atomic_infer.c
     src/md_csv.c
     src/md_csv.h
     src/md_cube.c
diff --git a/src/core/md_atomic.h b/src/core/md_atomic.h
new file mode 100644
index 0000000..af63843
--- /dev/null
+++ b/src/core/md_atomic.h
@@ -0,0 +1,166 @@
+#pragma once
+
+#include <core/md_str.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Atomic number type: 0 = unknown, 1-118 = element atomic numbers
+typedef uint8_t md_atomic_number_t;
+
+// Legacy alias for compatibility
+typedef md_atomic_number_t md_element_t;
+
+// Forward declaration of molecule type
+struct md_molecule_t;
+
+// Atomic number constants for all elements (Z values)
+enum {
+    MD_Z_X  = 0,   // Unknown
+    MD_Z_H  = 1,   // Hydrogen
+    MD_Z_HE = 2,   // Helium
+    MD_Z_LI = 3,   // Lithium
+    MD_Z_BE = 4,   // Beryllium
+    MD_Z_B  = 5,   // Boron
+    MD_Z_C  = 6,   // Carbon
+    MD_Z_N  = 7,   // Nitrogen
+    MD_Z_O  = 8,   // Oxygen
+    MD_Z_F  = 9,   // Fluorine
+    MD_Z_NE = 10,  // Neon
+    MD_Z_NA = 11,  // Sodium
+    MD_Z_MG = 12,  // Magnesium
+    MD_Z_AL = 13,  // Aluminium
+    MD_Z_SI = 14,  // Silicon
+    MD_Z_P  = 15,  // Phosphorus
+    MD_Z_S  = 16,  // Sulfur
+    MD_Z_CL = 17,  // Chlorine
+    MD_Z_AR = 18,  // Argon
+    MD_Z_K  = 19,  // Potassium
+    MD_Z_CA = 20,  // Calcium
+    MD_Z_SC = 21,  // Scandium
+    MD_Z_TI = 22,  // Titanium
+    MD_Z_V  = 23,  // Vanadium
+    MD_Z_CR = 24,  // Chromium
+    MD_Z_MN = 25,  // Manganese
+    MD_Z_FE = 26,  // Iron
+    MD_Z_CO = 27,  // Cobalt
+    MD_Z_NI = 28,  // Nickel
+    MD_Z_CU = 29,  // Copper
+    MD_Z_ZN = 30,  // Zinc
+    MD_Z_GA = 31,  // Gallium
+    MD_Z_GE = 32,  // Germanium
+    MD_Z_AS = 33,  // Arsenic
+    MD_Z_SE = 34,  // Selenium
+    MD_Z_BR = 35,  // Bromine
+    MD_Z_KR = 36,  // Krypton
+    MD_Z_RB = 37,  // Rubidium
+    MD_Z_SR = 38,  // Strontium
+    MD_Z_Y  = 39,  // Yttrium
+    MD_Z_ZR = 40,  // Zirconium
+    MD_Z_NB = 41,  // Niobium
+    MD_Z_MO = 42,  // Molybdenum
+    MD_Z_TC = 43,  // Technetium
+    MD_Z_RU = 44,  // Ruthenium
+    MD_Z_RH = 45,  // Rhodium
+    MD_Z_PD = 46,  // Palladium
+    MD_Z_AG = 47,  // Silver
+    MD_Z_CD = 48,  // Cadmium
+    MD_Z_IN = 49,  // Indium
+    MD_Z_SN = 50,  // Tin
+    MD_Z_SB = 51,  // Antimony
+    MD_Z_TE = 52,  // Tellurium
+    MD_Z_I  = 53,  // Iodine
+    MD_Z_XE = 54,  // Xenon
+    MD_Z_CS = 55,  // Caesium
+    MD_Z_BA = 56,  // Barium
+    MD_Z_LA = 57,  // Lanthanum
+    MD_Z_CE = 58,  // Cerium
+    MD_Z_PR = 59,  // Praseodymium
+    MD_Z_ND = 60,  // Neodymium
+    MD_Z_PM = 61,  // Promethium
+    MD_Z_SM = 62,  // Samarium
+    MD_Z_EU = 63,  // Europium
+    MD_Z_GD = 64,  // Gadolinium
+    MD_Z_TB = 65,  // Terbium
+    MD_Z_DY = 66,  // Dysprosium
+    MD_Z_HO = 67,  // Holmium
+    MD_Z_ER = 68,  // Erbium
+    MD_Z_TM = 69,  // Thulium
+    MD_Z_YB = 70,  // Ytterbium
+    MD_Z_LU = 71,  // Lutetium
+    MD_Z_HF = 72,  // Hafnium
+    MD_Z_TA = 73,  // Tantalum
+    MD_Z_W  = 74,  // Tungsten
+    MD_Z_RE = 75,  // Rhenium
+    MD_Z_OS = 76,  // Osmium
+    MD_Z_IR = 77,  // Iridium
+    MD_Z_PT = 78,  // Platinum
+    MD_Z_AU = 79,  // Gold
+    MD_Z_HG = 80,  // Mercury
+    MD_Z_TL = 81,  // Thallium
+    MD_Z_PB = 82,  // Lead
+    MD_Z_BI = 83,  // Bismuth
+    MD_Z_PO = 84,  // Polonium
+    MD_Z_AT = 85,  // Astatine
+    MD_Z_RN = 86,  // Radon
+    MD_Z_FR = 87,  // Francium
+    MD_Z_RA = 88,  // Radium
+    MD_Z_AC = 89,  // Actinium
+    MD_Z_TH = 90,  // Thorium
+    MD_Z_PA = 91,  // Protactinium
+    MD_Z_U  = 92,  // Uranium
+    MD_Z_NP = 93,  // Neptunium
+    MD_Z_PU = 94,  // Plutonium
+    MD_Z_AM = 95,  // Americium
+    MD_Z_CM = 96,  // Curium
+    MD_Z_BK = 97,  // Berkelium
+    MD_Z_CF = 98,  // Californium
+    MD_Z_ES = 99,  // Einsteinium
+    MD_Z_FM = 100, // Fermium
+    MD_Z_MD = 101, // Mendelevium
+    MD_Z_NO = 102, // Nobelium
+    MD_Z_LR = 103, // Lawrencium
+    MD_Z_RF = 104, // Rutherfordium
+    MD_Z_DB = 105, // Dubnium
+    MD_Z_SG = 106, // Seaborgium
+    MD_Z_BH = 107, // Bohrium
+    MD_Z_HS = 108, // Hassium
+    MD_Z_MT = 109, // Meitnerium
+    MD_Z_DS = 110, // Darmstadtium
+    MD_Z_RG = 111, // Roentgenium
+    MD_Z_CN = 112, // Copernicium
+    MD_Z_NH = 113, // Nihonium
+    MD_Z_FL = 114, // Flerovium
+    MD_Z_MC = 115, // Moscovium
+    MD_Z_LV = 116, // Livermorium
+    MD_Z_TS = 117, // Tennessine
+    MD_Z_OG = 118, // Oganesson
+};
+
+// New preferred API names
+
+// Element symbol and name lookup functions
+md_atomic_number_t md_atomic_number_from_symbol(str_t sym);
+md_atomic_number_t md_atomic_number_from_symbol_icase(str_t sym);
+str_t md_symbol_from_atomic_number(md_atomic_number_t z);
+str_t md_name_from_atomic_number(md_atomic_number_t z);
+
+// Element property functions
+float md_atomic_mass(md_atomic_number_t z);
+float md_vdw_radius(md_atomic_number_t z);
+float md_covalent_radius(md_atomic_number_t z);
+int   md_max_valence(md_atomic_number_t z);
+uint32_t md_cpk_color(md_atomic_number_t z);
+
+// Per-atom inference from labels (atom name + residue)
+md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name);
+
+// Batch form wired to molecule structure
+bool md_atoms_infer_atomic_numbers(md_atomic_number_t out[], size_t n, const struct md_molecule_t* mol);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/src/md_atomic_infer.c b/src/md_atomic_infer.c
new file mode 100644
index 0000000..386acc6
--- /dev/null
+++ b/src/md_atomic_infer.c
@@ -0,0 +1,315 @@
+#include <core/md_atomic.h>
+#include <md_molecule.h>
+#include <md_util.h>
+
+#include <core/md_hash.h>
+#include <core/md_allocator.h>
+#include <core/md_common.h>
+#include <core/md_str.h>
+
+#include <string.h>
+#include <ctype.h>
+
+// Static hashmaps for element inference
+static md_hashmap32_t residue_atom_map = {0};
+static md_hashmap32_t atom_only_map = {0};
+static bool maps_initialized = false;
+
+// Helper functions
+static inline char to_upper_c(char c) {
+    return (c >= 'a' && c <= 'z') ? (c - 'a' + 'A') : c;
+}
+
+static inline bool is_alpha_c(char c) {
+    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
+}
+
+static inline bool is_digit_c(char c) {
+    return c >= '0' && c <= '9';
+}
+
+// Strip digits from end of string
+static str_t strip_digits(str_t str) {
+    while (str.len > 0 && is_digit_c(str.ptr[str.len - 1])) {
+        str.len--;
+    }
+    return str;
+}
+
+// Create normalized uppercase string for hashing
+static void normalize_to_upper(char* dst, str_t src, bool strip_digits_flag) {
+    str_t s = strip_digits_flag ? strip_digits(src) : src;
+    for (size_t i = 0; i < s.len; ++i) {
+        dst[i] = to_upper_c(s.ptr[i]);
+    }
+    dst[s.len] = '\0';
+}
+
+// Initialize lookup tables
+static void init_lookup_tables(void) {
+    if (maps_initialized) return;
+    
+    md_allocator_i* alloc = md_get_heap_allocator();
+    residue_atom_map.allocator = alloc;
+    atom_only_map.allocator = alloc;
+    
+    md_hashmap_reserve(&residue_atom_map, 512);
+    md_hashmap_reserve(&atom_only_map, 256);
+    
+    // Water variants
+    struct { const char* res; const char* atom; md_atomic_number_t z; } water_entries[] = {
+        {"HOH", "O", MD_Z_O}, {"HOH", "OW", MD_Z_O}, {"HOH", "OH2", MD_Z_O},
+        {"HOH", "H", MD_Z_H}, {"HOH", "H1", MD_Z_H}, {"HOH", "H2", MD_Z_H}, {"HOH", "HW1", MD_Z_H}, {"HOH", "HW2", MD_Z_H}, {"HOH", "HW", MD_Z_H},
+        {"WAT", "O", MD_Z_O}, {"WAT", "OW", MD_Z_O}, {"WAT", "OH2", MD_Z_O},
+        {"WAT", "H", MD_Z_H}, {"WAT", "H1", MD_Z_H}, {"WAT", "H2", MD_Z_H}, {"WAT", "HW1", MD_Z_H}, {"WAT", "HW2", MD_Z_H}, {"WAT", "HW", MD_Z_H},
+        {"TIP3", "O", MD_Z_O}, {"TIP3", "OW", MD_Z_O}, {"TIP3", "OH2", MD_Z_O},
+        {"TIP3", "H", MD_Z_H}, {"TIP3", "H1", MD_Z_H}, {"TIP3", "H2", MD_Z_H}, {"TIP3", "HW1", MD_Z_H}, {"TIP3", "HW2", MD_Z_H}, {"TIP3", "HW", MD_Z_H},
+        {"TIP4", "O", MD_Z_O}, {"TIP4", "OW", MD_Z_O}, {"TIP4", "OH2", MD_Z_O},
+        {"TIP4", "H", MD_Z_H}, {"TIP4", "H1", MD_Z_H}, {"TIP4", "H2", MD_Z_H}, {"TIP4", "HW1", MD_Z_H}, {"TIP4", "HW2", MD_Z_H}, {"TIP4", "HW", MD_Z_H},
+        {"TIP5", "O", MD_Z_O}, {"TIP5", "OW", MD_Z_O}, {"TIP5", "OH2", MD_Z_O},
+        {"TIP5", "H", MD_Z_H}, {"TIP5", "H1", MD_Z_H}, {"TIP5", "H2", MD_Z_H}, {"TIP5", "HW1", MD_Z_H}, {"TIP5", "HW2", MD_Z_H}, {"TIP5", "HW", MD_Z_H},
+        {"SPC", "O", MD_Z_O}, {"SPC", "OW", MD_Z_O}, {"SPC", "OH2", MD_Z_O},
+        {"SPC", "H", MD_Z_H}, {"SPC", "H1", MD_Z_H}, {"SPC", "H2", MD_Z_H}, {"SPC", "HW1", MD_Z_H}, {"SPC", "HW2", MD_Z_H}, {"SPC", "HW", MD_Z_H},
+        {"SOL", "O", MD_Z_O}, {"SOL", "OW", MD_Z_O}, {"SOL", "OH2", MD_Z_O},
+        {"SOL", "H", MD_Z_H}, {"SOL", "H1", MD_Z_H}, {"SOL", "H2", MD_Z_H}, {"SOL", "HW1", MD_Z_H}, {"SOL", "HW2", MD_Z_H}, {"SOL", "HW", MD_Z_H},
+        {"H2O", "O", MD_Z_O}, {"H2O", "OW", MD_Z_O}, {"H2O", "OH2", MD_Z_O},
+        {"H2O", "H", MD_Z_H}, {"H2O", "H1", MD_Z_H}, {"H2O", "H2", MD_Z_H}, {"H2O", "HW1", MD_Z_H}, {"H2O", "HW2", MD_Z_H}, {"H2O", "HW", MD_Z_H},
+    };
+    
+    // Add water entries to residue+atom map
+    for (size_t i = 0; i < ARRAY_SIZE(water_entries); ++i) {
+        char key_str[64];
+        snprintf(key_str, sizeof(key_str), "%s\t%s", water_entries[i].res, water_entries[i].atom);
+        uint64_t key = md_hash64(key_str, strlen(key_str), 0);
+        md_hashmap_add(&residue_atom_map, key, water_entries[i].z);
+    }
+    
+    // Amino acid backbone and sidechain entries
+    struct { const char* res; const char* atom; md_atomic_number_t z; } amino_entries[] = {
+        // Common backbone atoms for all amino acids
+        {"ALA", "CA", MD_Z_C}, {"ALA", "N", MD_Z_N}, {"ALA", "C", MD_Z_C}, {"ALA", "O", MD_Z_O}, {"ALA", "OXT", MD_Z_O},
+        {"ARG", "CA", MD_Z_C}, {"ARG", "N", MD_Z_N}, {"ARG", "C", MD_Z_C}, {"ARG", "O", MD_Z_O}, {"ARG", "OXT", MD_Z_O},
+        {"ASN", "CA", MD_Z_C}, {"ASN", "N", MD_Z_N}, {"ASN", "C", MD_Z_C}, {"ASN", "O", MD_Z_O}, {"ASN", "OXT", MD_Z_O},
+        {"ASP", "CA", MD_Z_C}, {"ASP", "N", MD_Z_N}, {"ASP", "C", MD_Z_C}, {"ASP", "O", MD_Z_O}, {"ASP", "OXT", MD_Z_O},
+        {"CYS", "CA", MD_Z_C}, {"CYS", "N", MD_Z_N}, {"CYS", "C", MD_Z_C}, {"CYS", "O", MD_Z_O}, {"CYS", "OXT", MD_Z_O},
+        {"GLN", "CA", MD_Z_C}, {"GLN", "N", MD_Z_N}, {"GLN", "C", MD_Z_C}, {"GLN", "O", MD_Z_O}, {"GLN", "OXT", MD_Z_O},
+        {"GLU", "CA", MD_Z_C}, {"GLU", "N", MD_Z_N}, {"GLU", "C", MD_Z_C}, {"GLU", "O", MD_Z_O}, {"GLU", "OXT", MD_Z_O},
+        {"GLY", "CA", MD_Z_C}, {"GLY", "N", MD_Z_N}, {"GLY", "C", MD_Z_C}, {"GLY", "O", MD_Z_O}, {"GLY", "OXT", MD_Z_O},
+        {"HIS", "CA", MD_Z_C}, {"HIS", "N", MD_Z_N}, {"HIS", "C", MD_Z_C}, {"HIS", "O", MD_Z_O}, {"HIS", "OXT", MD_Z_O},
+        {"ILE", "CA", MD_Z_C}, {"ILE", "N", MD_Z_N}, {"ILE", "C", MD_Z_C}, {"ILE", "O", MD_Z_O}, {"ILE", "OXT", MD_Z_O},
+        {"LEU", "CA", MD_Z_C}, {"LEU", "N", MD_Z_N}, {"LEU", "C", MD_Z_C}, {"LEU", "O", MD_Z_O}, {"LEU", "OXT", MD_Z_O},
+        {"LYS", "CA", MD_Z_C}, {"LYS", "N", MD_Z_N}, {"LYS", "C", MD_Z_C}, {"LYS", "O", MD_Z_O}, {"LYS", "OXT", MD_Z_O},
+        {"MET", "CA", MD_Z_C}, {"MET", "N", MD_Z_N}, {"MET", "C", MD_Z_C}, {"MET", "O", MD_Z_O}, {"MET", "OXT", MD_Z_O},
+        {"PHE", "CA", MD_Z_C}, {"PHE", "N", MD_Z_N}, {"PHE", "C", MD_Z_C}, {"PHE", "O", MD_Z_O}, {"PHE", "OXT", MD_Z_O},
+        {"PRO", "CA", MD_Z_C}, {"PRO", "N", MD_Z_N}, {"PRO", "C", MD_Z_C}, {"PRO", "O", MD_Z_O}, {"PRO", "OXT", MD_Z_O},
+        {"SER", "CA", MD_Z_C}, {"SER", "N", MD_Z_N}, {"SER", "C", MD_Z_C}, {"SER", "O", MD_Z_O}, {"SER", "OXT", MD_Z_O},
+        {"THR", "CA", MD_Z_C}, {"THR", "N", MD_Z_N}, {"THR", "C", MD_Z_C}, {"THR", "O", MD_Z_O}, {"THR", "OXT", MD_Z_O},
+        {"TRP", "CA", MD_Z_C}, {"TRP", "N", MD_Z_N}, {"TRP", "C", MD_Z_C}, {"TRP", "O", MD_Z_O}, {"TRP", "OXT", MD_Z_O},
+        {"TYR", "CA", MD_Z_C}, {"TYR", "N", MD_Z_N}, {"TYR", "C", MD_Z_C}, {"TYR", "O", MD_Z_O}, {"TYR", "OXT", MD_Z_O},
+        {"VAL", "CA", MD_Z_C}, {"VAL", "N", MD_Z_N}, {"VAL", "C", MD_Z_C}, {"VAL", "O", MD_Z_O}, {"VAL", "OXT", MD_Z_O},
+        // Sidechain specific atoms
+        {"SER", "OG", MD_Z_O}, {"THR", "OG1", MD_Z_O}, {"TYR", "OH", MD_Z_O},
+        {"CYS", "SG", MD_Z_S}, {"MET", "SD", MD_Z_S},
+    };
+    
+    // Add amino acid entries to residue+atom map
+    for (size_t i = 0; i < ARRAY_SIZE(amino_entries); ++i) {
+        char key_str[64];
+        snprintf(key_str, sizeof(key_str), "%s\t%s", amino_entries[i].res, amino_entries[i].atom);
+        uint64_t key = md_hash64(key_str, strlen(key_str), 0);
+        md_hashmap_add(&residue_atom_map, key, amino_entries[i].z);
+    }
+    
+    // Nucleic acid entries
+    struct { const char* res; const char* atom; md_atomic_number_t z; } nucleic_entries[] = {
+        // DNA
+        {"DA", "P", MD_Z_P}, {"DA", "OP1", MD_Z_O}, {"DA", "OP2", MD_Z_O}, {"DA", "O1P", MD_Z_O}, {"DA", "O2P", MD_Z_O},
+        {"DC", "P", MD_Z_P}, {"DC", "OP1", MD_Z_O}, {"DC", "OP2", MD_Z_O}, {"DC", "O1P", MD_Z_O}, {"DC", "O2P", MD_Z_O},
+        {"DG", "P", MD_Z_P}, {"DG", "OP1", MD_Z_O}, {"DG", "OP2", MD_Z_O}, {"DG", "O1P", MD_Z_O}, {"DG", "O2P", MD_Z_O},
+        {"DT", "P", MD_Z_P}, {"DT", "OP1", MD_Z_O}, {"DT", "OP2", MD_Z_O}, {"DT", "O1P", MD_Z_O}, {"DT", "O2P", MD_Z_O},
+        // RNA
+        {"A", "P", MD_Z_P}, {"A", "OP1", MD_Z_O}, {"A", "OP2", MD_Z_O}, {"A", "O1P", MD_Z_O}, {"A", "O2P", MD_Z_O},
+        {"C", "P", MD_Z_P}, {"C", "OP1", MD_Z_O}, {"C", "OP2", MD_Z_O}, {"C", "O1P", MD_Z_O}, {"C", "O2P", MD_Z_O},
+        {"G", "P", MD_Z_P}, {"G", "OP1", MD_Z_O}, {"G", "OP2", MD_Z_O}, {"G", "O1P", MD_Z_O}, {"G", "O2P", MD_Z_O},
+        {"U", "P", MD_Z_P}, {"U", "OP1", MD_Z_O}, {"U", "OP2", MD_Z_O}, {"U", "O1P", MD_Z_O}, {"U", "O2P", MD_Z_O},
+    };
+    
+    // Add nucleic acid entries to residue+atom map
+    for (size_t i = 0; i < ARRAY_SIZE(nucleic_entries); ++i) {
+        char key_str[64];
+        snprintf(key_str, sizeof(key_str), "%s\t%s", nucleic_entries[i].res, nucleic_entries[i].atom);
+        uint64_t key = md_hash64(key_str, strlen(key_str), 0);
+        md_hashmap_add(&residue_atom_map, key, nucleic_entries[i].z);
+    }
+    
+    // Selenomethionine
+    char mse_key[64];
+    snprintf(mse_key, sizeof(mse_key), "MSE\tSE");
+    uint64_t mse_hash = md_hash64(mse_key, strlen(mse_key), 0);
+    md_hashmap_add(&residue_atom_map, mse_hash, MD_Z_SE);
+    
+    // Atom-only fallbacks
+    struct { const char* atom; md_atomic_number_t z; } atom_entries[] = {
+        {"H", MD_Z_H}, {"C", MD_Z_C}, {"N", MD_Z_N}, {"O", MD_Z_O}, {"S", MD_Z_S}, {"P", MD_Z_P},
+        {"F", MD_Z_F}, {"CL", MD_Z_CL}, {"BR", MD_Z_BR}, {"I", MD_Z_I},
+        {"OW", MD_Z_O}, {"OH", MD_Z_O}, {"HW", MD_Z_H},
+        {"CA", MD_Z_C}, {"CB", MD_Z_C}, {"CG", MD_Z_C}, {"CD", MD_Z_C}, {"CE", MD_Z_C}, {"CZ", MD_Z_C},
+        {"OXT", MD_Z_O},
+        // Common ions
+        {"NA", MD_Z_NA}, {"K", MD_Z_K}, {"MG", MD_Z_MG}, {"ZN", MD_Z_ZN}, {"FE", MD_Z_FE},
+        {"MN", MD_Z_MN}, {"CU", MD_Z_CU}, {"CO", MD_Z_CO}, {"NI", MD_Z_NI}, {"CD", MD_Z_CD},
+        {"SR", MD_Z_SR}, {"BA", MD_Z_BA}, {"LI", MD_Z_LI}, {"CS", MD_Z_CS}, {"RB", MD_Z_RB},
+        {"AL", MD_Z_AL}, {"TI", MD_Z_TI}, {"CR", MD_Z_CR}, {"HG", MD_Z_HG}, {"PB", MD_Z_PB},
+        {"AG", MD_Z_AG}, {"AU", MD_Z_AU}, {"PT", MD_Z_PT},
+    };
+    
+    // Add atom-only entries
+    for (size_t i = 0; i < ARRAY_SIZE(atom_entries); ++i) {
+        char norm_atom[16];
+        str_t atom_str = {atom_entries[i].atom, strlen(atom_entries[i].atom)};
+        normalize_to_upper(norm_atom, atom_str, true);
+        uint64_t key = md_hash64(norm_atom, strlen(norm_atom), 0);
+        md_hashmap_add(&atom_only_map, key, atom_entries[i].z);
+    }
+    
+    maps_initialized = true;
+}
+
+// Core atomic number functions using existing md_util tables
+md_atomic_number_t md_atomic_number_from_symbol(str_t sym) {
+    return md_util_element_lookup(sym);
+}
+
+md_atomic_number_t md_atomic_number_from_symbol_icase(str_t sym) {
+    return md_util_element_lookup_ignore_case(sym);
+}
+
+str_t md_symbol_from_atomic_number(md_atomic_number_t z) {
+    return md_util_element_symbol(z);
+}
+
+str_t md_name_from_atomic_number(md_atomic_number_t z) {
+    return md_util_element_name(z);
+}
+
+float md_atomic_mass(md_atomic_number_t z) {
+    return md_util_element_atomic_mass(z);
+}
+
+float md_vdw_radius(md_atomic_number_t z) {
+    return md_util_element_vdw_radius(z);
+}
+
+float md_covalent_radius(md_atomic_number_t z) {
+    return md_util_element_covalent_radius(z);
+}
+
+int md_max_valence(md_atomic_number_t z) {
+    return md_util_element_max_valence(z);
+}
+
+uint32_t md_cpk_color(md_atomic_number_t z) {
+    return md_util_element_cpk_color(z);
+}
+
+// Inference functions
+md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name) {
+    init_lookup_tables();
+    
+    // Special case: if atom name is empty but residue name is an element (ion case)
+    if (atom_name.len == 0 && res_name.len > 0) {
+        md_atomic_number_t res_element = md_atomic_number_from_symbol_icase(res_name);
+        if (res_element != MD_Z_X) {
+            return res_element;
+        }
+        return MD_Z_X;
+    }
+    
+    if (atom_name.len == 0) return MD_Z_X;
+    
+    // Normalize inputs
+    char norm_res[16] = {0};
+    char norm_atom[16] = {0};
+    char norm_atom_stripped[16] = {0};
+    
+    if (res_name.len > 0) {
+        normalize_to_upper(norm_res, res_name, false);
+    }
+    normalize_to_upper(norm_atom, atom_name, false);
+    normalize_to_upper(norm_atom_stripped, atom_name, true);
+    
+    // First try residue+atom combination
+    if (res_name.len > 0) {
+        char res_atom_key[64];
+        snprintf(res_atom_key, sizeof(res_atom_key), "%s\t%s", norm_res, norm_atom);
+        uint64_t key = md_hash64(res_atom_key, strlen(res_atom_key), 0);
+        uint32_t* result = md_hashmap_get(&residue_atom_map, key);
+        if (result) {
+            return (md_atomic_number_t)*result;
+        }
+        
+        // Special case: if residue is water
+        if (md_util_resname_water(res_name)) {
+            if (norm_atom[0] == 'O') return MD_Z_O;
+            if (norm_atom[0] == 'H') return MD_Z_H;
+        }
+        
+        // Special case: if residue is amino acid and atom is CA, return carbon
+        if (md_util_resname_amino_acid(res_name) && strcmp(norm_atom, "CA") == 0) {
+            return MD_Z_C;
+        }
+        
+        // If residue name itself is an element (ion case)
+        md_atomic_number_t res_element = md_atomic_number_from_symbol_icase(res_name);
+        if (res_element != MD_Z_X) {
+            // If atom name is empty or equals residue, return that element
+            if (atom_name.len == 0 || str_eq_ignore_case(atom_name, res_name)) {
+                return res_element;
+            }
+        }
+    }
+    
+    // Try atom-only map with digits stripped
+    uint64_t atom_key = md_hash64(norm_atom_stripped, strlen(norm_atom_stripped), 0);
+    uint32_t* atom_result = md_hashmap_get(&atom_only_map, atom_key);
+    if (atom_result) {
+        md_atomic_number_t z = (md_atomic_number_t)*atom_result;
+        // Override CA (Calcium) to Carbon if in amino acid context
+        if (z == MD_Z_CA && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
+            return MD_Z_C;
+        }
+        return z;
+    }
+    
+    // Try two-letter element heuristic (e.g., CL12 => Cl, BR1 => Br)
+    if (strlen(norm_atom_stripped) >= 2) {
+        char two_letter[3] = {norm_atom_stripped[0], norm_atom_stripped[1], '\0'};
+        str_t two_letter_str = {two_letter, 2};
+        md_atomic_number_t two_z = md_atomic_number_from_symbol_icase(two_letter_str);
+        if (two_z != MD_Z_X) {
+            // Override CA (Calcium) to Carbon if in amino acid context
+            if (two_z == MD_Z_CA && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
+                return MD_Z_C;
+            }
+            return two_z;
+        }
+    }
+    
+    // Final fallback: first-letter element mapping
+    char first_letter[2] = {norm_atom_stripped[0], '\0'};
+    str_t first_letter_str = {first_letter, 1};
+    return md_atomic_number_from_symbol_icase(first_letter_str);
+}
+
+bool md_atoms_infer_atomic_numbers(md_atomic_number_t out[], size_t n, const struct md_molecule_t* mol) {
+    if (!out || !mol || n == 0) return false;
+    
+    size_t count = MIN(n, mol->atom.count);
+    for (size_t i = 0; i < count; ++i) {
+        str_t atom_name = LBL_TO_STR(mol->atom.type[i]);
+        str_t res_name = mol->atom.resname ? LBL_TO_STR(mol->atom.resname[i]) : (str_t){0};
+        out[i] = md_atom_infer_atomic_number(atom_name, res_name);
+    }
+    
+    return true;
+}
\ No newline at end of file
diff --git a/src/md_util.c b/src/md_util.c
index 15a6129..4fb5ed4 100644
--- a/src/md_util.c
+++ b/src/md_util.c
@@ -1240,143 +1240,8 @@ static inline bool is_organic(char c) {
 }
 
 bool md_util_element_guess(md_element_t element[], size_t capacity, const struct md_molecule_t* mol) {
-    ASSERT(capacity > 0);
-    ASSERT(mol);
-    ASSERT(mol->atom.count > 0);
-
-    md_hashmap32_t map = { .allocator = md_get_temp_allocator() };
-    md_hashmap_reserve(&map, 256);
-
-    // Just for pure elements which have not been salted with resname
-    md_hashmap32_t elem_map = { .allocator = md_get_temp_allocator() };
-    md_hashmap_reserve(&elem_map, 256);
-
-    typedef struct {
-        str_t name;
-        md_element_t elem;
-    } entry_t;
-    
-    // Extra table for predefined atom types
-    entry_t entries[] = {
-        {STR_LIT("SOD"), Na},
-        {STR_LIT("OW"),  O},
-        {STR_LIT("HW"),  H},
-    };
-
-    for (size_t i = 0; i < ARRAY_SIZE(entries); ++i) {
-        md_hashmap_add(&elem_map, md_hash64(entries[i].name.ptr, entries[i].name.len, 0), entries[i].elem);
-    }
-
-    const size_t count = MIN(capacity, mol->atom.count);
-    for (size_t i = 0; i < count; ++i) {
-        if (element[i] != 0) continue;
-
-        str_t original = LBL_TO_STR(mol->atom.type[i]);
-
-        // Trim whitespace, digits and 'X's
-        str_t name = trim_label(original);
-
-        if (name.len > 0) {
-            md_element_t elem = 0;
-
-            str_t resname = STR_LIT("");
-            uint64_t res_key = 0;
-            if (mol->atom.resname) {
-                resname = LBL_TO_STR(mol->atom.resname[i]);
-                res_key = md_hash64_str(resname, 0);
-            }
-            uint64_t key = md_hash64_str(name, res_key);
-            uint32_t* ptr = md_hashmap_get(&map, key);
-            if (ptr) {
-                element[i] = (md_element_t)*ptr;
-                continue;
-            } else {
-                uint64_t elem_key = md_hash64(name.ptr, name.len, 0);
-                ptr = md_hashmap_get(&elem_map, elem_key);
-                if (ptr) {
-                    elem = (md_element_t)*ptr;
-                    goto done;
-                }
-            }
-
-            if ((elem = md_util_element_lookup(name)) != 0) goto done;
-
-            // If amino acid, try to deduce the element from that
-            if (mol->atom.flags) {
-                if (mol->atom.flags[i] & (MD_FLAG_AMINO_ACID | MD_FLAG_NUCLEOTIDE)) {
-                    // Try to match against the first character
-                    name.len = 1;
-                    elem = md_util_element_lookup_ignore_case(name);
-                    goto done;
-                }
-            }
-
-            // This is the same logic as above but more general, for the natural organic elements
-            if (is_organic(name.ptr[0]) && name.len > 1) {
-                if (name.ptr[1] - 'A' < 5) {
-                    if (mol->residue.count > 0 && mol->atom.res_idx) {
-                        int32_t res_idx = mol->atom.res_idx[i];
-                        uint32_t res_beg = mol->residue.atom_offset[res_idx];
-                        uint32_t res_end = mol->residue.atom_offset[res_idx+1];
-                        uint32_t res_len = res_end - res_beg;
-                        if (res_len > 3) {
-                            name.len = 1;
-                            elem = md_util_element_lookup_ignore_case(name);
-                            goto done;
-                        }
-                    }
-                }
-            }
-
-            // Heuristic cases
-
-            // This can be fishy...
-            if (str_eq_cstr(name, "HOH")) {
-                elem = H;
-                goto done;
-            }
-            if (str_eq_cstr(name, "HS")) {
-                elem = H;
-                goto done;
-            }
-
-            size_t num_alpha = 0;
-            while (num_alpha < str_len(original) && is_alpha(original.ptr[num_alpha])) ++num_alpha;
-            
-            size_t num_digits = 0;
-            str_t digits = str_substr(original, num_alpha, SIZE_MAX);
-            while (num_digits < str_len(digits) && is_digit(digits.ptr[num_digits])) ++num_digits;
-
-            // 2-3 letters + 1-2 digit (e.g. HO(H)[0-99]) usually means just look at the first letter
-            if ((num_alpha == 2 || num_alpha == 3) && (num_digits == 1 || num_digits == 2)) {
-                name.len = 1;
-                elem = md_util_element_lookup_ignore_case(name);
-                goto done;
-            }
-
-            // Try to match against several characters but ignore the case
-            if (name.len > 1) {
-                name.len = 2;
-                elem = md_util_element_lookup_ignore_case(name);
-            }
-
-            // Last resort, try to match against single first character
-            if (elem == 0) {
-                name.len = 1;
-                elem = md_util_element_lookup_ignore_case(name);
-            }
-
-        done:
-            element[i] = elem;
-            if (elem != 0) {
-                md_hashmap_add(&map, key, elem);
-            }
-        }
-    }
-
-    md_hashmap_free(&map);
-
-    return true;
+    // Delegate to the new hash-backed atomic number inference system
+    return md_atoms_infer_atomic_numbers(element, capacity, mol);
 }
 
 bool md_util_element_from_mass(md_element_t element[], const float mass[], size_t count) {
diff --git a/src/md_util.h b/src/md_util.h
index 2a98aa3..a98a107 100644
--- a/src/md_util.h
+++ b/src/md_util.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <md_molecule.h>
+#include <core/md_atomic.h>
 
 #include <core/md_str.h>
 #include <core/md_vec_math.h>
@@ -31,20 +32,16 @@ enum {
 
 typedef uint32_t md_util_postprocess_flags_t;
 
-// This assumes the string exactly matches the value within the look up table
-// The match is case sensitive and expects elements to be formatted with Big first letter and small second letter:
-// E.g. H, He, Fe, Na, C
-md_element_t md_util_element_lookup(str_t element_str);
-md_element_t md_util_element_lookup_ignore_case(str_t element_str);
-
-// Access to the static arrays
+// Access to the static arrays (preserved for direct access)
 const str_t* md_util_element_symbols(void);
 const str_t* md_util_element_names(void);
 const float* md_util_element_vdw_radii(void);
 
+// Element functions (now calling new atomic number API internally)
+md_element_t md_util_element_lookup(str_t element_str);
+md_element_t md_util_element_lookup_ignore_case(str_t element_str);
 str_t md_util_element_symbol(md_element_t element);
 str_t md_util_element_name(md_element_t element);
-
 float md_util_element_vdw_radius(md_element_t element);
 float md_util_element_covalent_radius(md_element_t element);
 float md_util_element_atomic_mass(md_element_t element);
@@ -63,9 +60,7 @@ static inline bool md_util_backbone_atoms_valid(md_protein_backbone_atoms_t prot
     return (prot.ca != prot.c) && (prot.ca != prot.o) && (prot.c != prot.o);
 }
 
-// This operation tries to deduce the element from the atom type/name which usually contains alot of cruft.
-// It also tries resolve some ambiguities: Such as CA, is that Carbon Alpha or is it calcium?
-// We can resolve that by looking at the residue name and in the case of Carbon Alpha, the residue name should be matched to an amino acid.
+// Element guess function - now delegates to the new inference system
 bool md_util_element_guess(md_element_t element[], size_t capacity, const struct md_molecule_t* mol);
 
 bool md_util_element_from_mass(md_element_t out_element[], const float in_mass[], size_t count);
diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt
index d78f2e3..6fabeb3 100644
--- a/unittest/CMakeLists.txt
+++ b/unittest/CMakeLists.txt
@@ -12,6 +12,7 @@ target_link_libraries(md_unittest PRIVATE mdlib ${MD_LIBS})
 set (SRC_FILES
     test_allocator.c
     test_array.c
+    test_atomic.c
     test_bitop.c
     test_bitfield.c
     test_str.c
diff --git a/unittest/test_atomic.c b/unittest/test_atomic.c
new file mode 100644
index 0000000..dfddc35
--- /dev/null
+++ b/unittest/test_atomic.c
@@ -0,0 +1,122 @@
+#include "utest.h"
+#include <core/md_atomic.h>
+#include <md_util.h>
+#include <core/md_allocator.h>
+#include <core/md_arena_allocator.h>
+
+UTEST(atomic, enum_constants) {
+    // Test that enum constants are correct
+    EXPECT_EQ(MD_Z_X, 0);   // Unknown
+    EXPECT_EQ(MD_Z_H, 1);   // Hydrogen
+    EXPECT_EQ(MD_Z_HE, 2);  // Helium
+    EXPECT_EQ(MD_Z_C, 6);   // Carbon
+    EXPECT_EQ(MD_Z_N, 7);   // Nitrogen
+    EXPECT_EQ(MD_Z_O, 8);   // Oxygen
+    EXPECT_EQ(MD_Z_P, 15);  // Phosphorus
+    EXPECT_EQ(MD_Z_S, 16);  // Sulfur
+    EXPECT_EQ(MD_Z_CA, 20); // Calcium
+    EXPECT_EQ(MD_Z_CL, 17); // Chlorine
+    EXPECT_EQ(MD_Z_BR, 35); // Bromine
+    EXPECT_EQ(MD_Z_NA, 11); // Sodium
+    EXPECT_EQ(MD_Z_FE, 26); // Iron
+    EXPECT_EQ(MD_Z_OG, 118); // Oganesson
+}
+
+UTEST(atomic, symbol_lookup) {
+    // Test symbol lookup functions
+    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("H")), MD_Z_H);
+    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("C")), MD_Z_C);
+    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("He")), MD_Z_HE);
+    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("Ca")), MD_Z_CA);
+    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("Unknown")), MD_Z_X);
+    
+    // Test case insensitive lookup
+    EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("h")), MD_Z_H);
+    EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("ca")), MD_Z_CA);
+    EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("HE")), MD_Z_HE);
+}
+
+UTEST(atomic, symbol_from_number) {
+    // Test reverse lookup
+    str_t h_symbol = md_symbol_from_atomic_number(MD_Z_H);
+    EXPECT_TRUE(str_eq_cstr(h_symbol, "H"));
+    
+    str_t c_symbol = md_symbol_from_atomic_number(MD_Z_C);
+    EXPECT_TRUE(str_eq_cstr(c_symbol, "C"));
+    
+    str_t ca_symbol = md_symbol_from_atomic_number(MD_Z_CA);
+    EXPECT_TRUE(str_eq_cstr(ca_symbol, "Ca"));
+}
+
+UTEST(atomic, element_properties) {
+    // Test that we can get basic properties
+    EXPECT_GT(md_atomic_mass(MD_Z_H), 0.0f);
+    EXPECT_GT(md_atomic_mass(MD_Z_C), 0.0f);
+    EXPECT_GT(md_vdw_radius(MD_Z_H), 0.0f);
+    EXPECT_GT(md_covalent_radius(MD_Z_C), 0.0f);
+    EXPECT_GT(md_max_valence(MD_Z_C), 0);
+    EXPECT_GT(md_cpk_color(MD_Z_C), 0);
+}
+
+UTEST(atomic, inference_water) {
+    // Test water atom inference
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("O"), STR_LIT("HOH")), MD_Z_O);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OW"), STR_LIT("HOH")), MD_Z_O);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OH2"), STR_LIT("WAT")), MD_Z_O);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("H"), STR_LIT("HOH")), MD_Z_H);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("H1"), STR_LIT("TIP3")), MD_Z_H);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("HW"), STR_LIT("SPC")), MD_Z_H);
+}
+
+UTEST(atomic, inference_amino_acid) {
+    // Test amino acid atom inference
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CA"), STR_LIT("ALA")), MD_Z_C);  // Alpha carbon, not calcium
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("N"), STR_LIT("GLY")), MD_Z_N);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("C"), STR_LIT("SER")), MD_Z_C);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("O"), STR_LIT("TRP")), MD_Z_O);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OXT"), STR_LIT("PHE")), MD_Z_O);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OG"), STR_LIT("SER")), MD_Z_O);  // Serine hydroxyl
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("SG"), STR_LIT("CYS")), MD_Z_S);  // Cysteine sulfur
+}
+
+UTEST(atomic, inference_nucleic_acid) {
+    // Test nucleic acid atom inference
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("P"), STR_LIT("DA")), MD_Z_P);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("OP1"), STR_LIT("DG")), MD_Z_O);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("O2P"), STR_LIT("A")), MD_Z_O);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("P"), STR_LIT("U")), MD_Z_P);
+}
+
+UTEST(atomic, inference_ions) {
+    // Test ion inference (residue name is element)
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("NA"), STR_LIT("NA")), MD_Z_NA);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT(""), STR_LIT("K")), MD_Z_K);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("MG"), STR_LIT("MG")), MD_Z_MG);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL"), STR_LIT("CL")), MD_Z_CL);
+}
+
+UTEST(atomic, inference_fallbacks) {
+    // Test fallback mechanisms
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL12"), STR_LIT("")), MD_Z_CL);  // Two-letter heuristic
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("BR1"), STR_LIT("")), MD_Z_BR);   // Two-letter heuristic
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("H123"), STR_LIT("")), MD_Z_H);   // First letter fallback
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("C99"), STR_LIT("")), MD_Z_C);    // First letter fallback
+}
+
+UTEST(atomic, backward_compatibility) {
+    // Test that old API still works through wrappers
+    EXPECT_EQ(md_util_element_lookup(STR_LIT("H")), MD_Z_H);
+    EXPECT_EQ(md_util_element_lookup_ignore_case(STR_LIT("ca")), MD_Z_CA);
+    
+    str_t symbol = md_util_element_symbol(MD_Z_C);
+    EXPECT_TRUE(str_eq_cstr(symbol, "C"));
+    
+    str_t name = md_util_element_name(MD_Z_O);
+    EXPECT_TRUE(str_eq_cstr(name, "Oxygen"));
+    
+    EXPECT_GT(md_util_element_atomic_mass(MD_Z_C), 0.0f);
+    EXPECT_GT(md_util_element_vdw_radius(MD_Z_H), 0.0f);
+    EXPECT_GT(md_util_element_covalent_radius(MD_Z_N), 0.0f);
+    EXPECT_GT(md_util_element_max_valence(MD_Z_C), 0);
+    EXPECT_GT(md_util_element_cpk_color(MD_Z_O), 0);
+}
\ No newline at end of file

From e723aefff6301cd25fb292b17f9633f562441d5a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Sep 2025 14:23:13 +0000
Subject: [PATCH 03/20] Add comprehensive API equivalence and compatibility
 tests

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 unittest/CMakeLists.txt              |  2 +
 unittest/test_api_equivalence.c      | 53 ++++++++++++++++++++
 unittest/test_element_guess_compat.c | 72 ++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+)
 create mode 100644 unittest/test_api_equivalence.c
 create mode 100644 unittest/test_element_guess_compat.c

diff --git a/unittest/CMakeLists.txt b/unittest/CMakeLists.txt
index 6fabeb3..3093144 100644
--- a/unittest/CMakeLists.txt
+++ b/unittest/CMakeLists.txt
@@ -11,10 +11,12 @@ target_link_libraries(md_unittest PRIVATE mdlib ${MD_LIBS})
 
 set (SRC_FILES
     test_allocator.c
+    test_api_equivalence.c
     test_array.c
     test_atomic.c
     test_bitop.c
     test_bitfield.c
+    test_element_guess_compat.c
     test_str.c
     test_hash.c
     test_edr.c
diff --git a/unittest/test_api_equivalence.c b/unittest/test_api_equivalence.c
new file mode 100644
index 0000000..1e227bc
--- /dev/null
+++ b/unittest/test_api_equivalence.c
@@ -0,0 +1,53 @@
+// Test to verify API equivalence between old and new systems
+#include "utest.h"
+#include <core/md_atomic.h>
+#include <md_util.h>
+
+UTEST(api_equivalence, symbol_lookup_consistency) {
+    // Test that old and new APIs return the same results
+    for (int i = 1; i <= 118; ++i) {
+        str_t symbol_old = md_util_element_symbol(i);
+        str_t symbol_new = md_symbol_from_atomic_number(i);
+        
+        EXPECT_TRUE(str_eq(symbol_old, symbol_new));
+        
+        // Test reverse lookup
+        md_atomic_number_t old_lookup = md_util_element_lookup(symbol_old);
+        md_atomic_number_t new_lookup = md_atomic_number_from_symbol(symbol_old);
+        
+        EXPECT_EQ(old_lookup, new_lookup);
+        EXPECT_EQ(old_lookup, i);
+    }
+}
+
+UTEST(api_equivalence, property_consistency) {
+    // Test a few key elements for property consistency
+    md_atomic_number_t test_elements[] = {MD_Z_H, MD_Z_C, MD_Z_N, MD_Z_O, MD_Z_CA, MD_Z_FE};
+    
+    for (size_t i = 0; i < ARRAY_SIZE(test_elements); ++i) {
+        md_atomic_number_t z = test_elements[i];
+        
+        // Test masses
+        float mass_old = md_util_element_atomic_mass(z);
+        float mass_new = md_atomic_mass(z);
+        EXPECT_EQ(mass_old, mass_new);
+        
+        // Test radii
+        float vdw_old = md_util_element_vdw_radius(z);
+        float vdw_new = md_vdw_radius(z);
+        EXPECT_EQ(vdw_old, vdw_new);
+        
+        float cov_old = md_util_element_covalent_radius(z);
+        float cov_new = md_covalent_radius(z);
+        EXPECT_EQ(cov_old, cov_new);
+        
+        // Test valence and color
+        int val_old = md_util_element_max_valence(z);
+        int val_new = md_max_valence(z);
+        EXPECT_EQ(val_old, val_new);
+        
+        uint32_t color_old = md_util_element_cpk_color(z);
+        uint32_t color_new = md_cpk_color(z);
+        EXPECT_EQ(color_old, color_new);
+    }
+}
\ No newline at end of file
diff --git a/unittest/test_element_guess_compat.c b/unittest/test_element_guess_compat.c
new file mode 100644
index 0000000..cdab95d
--- /dev/null
+++ b/unittest/test_element_guess_compat.c
@@ -0,0 +1,72 @@
+#include "utest.h"
+#include <md_util.h>
+#include <md_molecule.h>
+#include <core/md_allocator.h>
+#include <core/md_arena_allocator.h>
+#include <core/md_str.h>
+
+// Test backward compatibility with md_util_element_guess
+UTEST(element_guess_compat, basic_inference) {
+    md_allocator_i* alloc = md_vm_arena_create(MEGABYTES(1));
+    
+    // Create a simple molecule structure
+    md_molecule_t mol = {0};
+    
+    // Atom data
+    const size_t atom_count = 5;
+    mol.atom.count = atom_count;
+    
+    // Allocate arrays
+    mol.atom.type = md_alloc(alloc, sizeof(md_label_t) * atom_count);
+    mol.atom.resname = md_alloc(alloc, sizeof(md_label_t) * atom_count);
+    mol.atom.element = md_alloc(alloc, sizeof(md_element_t) * atom_count);
+    
+    // Set up atom types and residue names
+    // HOH water oxygen
+    strncpy(mol.atom.type[0].buf, "O", sizeof(mol.atom.type[0].buf));
+    mol.atom.type[0].len = 1;
+    strncpy(mol.atom.resname[0].buf, "HOH", sizeof(mol.atom.resname[0].buf));
+    mol.atom.resname[0].len = 3;
+    mol.atom.element[0] = 0; // Start unknown
+    
+    // HOH water hydrogen
+    strncpy(mol.atom.type[1].buf, "H1", sizeof(mol.atom.type[1].buf));
+    mol.atom.type[1].len = 2;
+    strncpy(mol.atom.resname[1].buf, "HOH", sizeof(mol.atom.resname[1].buf));
+    mol.atom.resname[1].len = 3;
+    mol.atom.element[1] = 0; // Start unknown
+    
+    // Alanine alpha carbon
+    strncpy(mol.atom.type[2].buf, "CA", sizeof(mol.atom.type[2].buf));
+    mol.atom.type[2].len = 2;
+    strncpy(mol.atom.resname[2].buf, "ALA", sizeof(mol.atom.resname[2].buf));
+    mol.atom.resname[2].len = 3;
+    mol.atom.element[2] = 0; // Start unknown
+    
+    // Sodium ion
+    strncpy(mol.atom.type[3].buf, "NA", sizeof(mol.atom.type[3].buf));
+    mol.atom.type[3].len = 2;
+    strncpy(mol.atom.resname[3].buf, "NA", sizeof(mol.atom.resname[3].buf));
+    mol.atom.resname[3].len = 2;
+    mol.atom.element[3] = 0; // Start unknown
+    
+    // Generic carbon
+    strncpy(mol.atom.type[4].buf, "C1", sizeof(mol.atom.type[4].buf));
+    mol.atom.type[4].len = 2;
+    strncpy(mol.atom.resname[4].buf, "", sizeof(mol.atom.resname[4].buf));
+    mol.atom.resname[4].len = 0;
+    mol.atom.element[4] = 0; // Start unknown
+    
+    // Call the element guess function
+    bool result = md_util_element_guess(mol.atom.element, atom_count, &mol);
+    
+    // Verify results
+    EXPECT_TRUE(result);
+    EXPECT_EQ(mol.atom.element[0], MD_Z_O);  // Water oxygen
+    EXPECT_EQ(mol.atom.element[1], MD_Z_H);  // Water hydrogen
+    EXPECT_EQ(mol.atom.element[2], MD_Z_C);  // Alanine alpha carbon (not calcium!)
+    EXPECT_EQ(mol.atom.element[3], MD_Z_NA); // Sodium ion
+    EXPECT_EQ(mol.atom.element[4], MD_Z_C);  // Generic carbon from C1
+    
+    md_vm_arena_destroy(alloc);
+}
\ No newline at end of file

From 71add641cbf8215ebe976e6c891ea11455c5e652 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Sep 2025 20:19:18 +0000
Subject: [PATCH 04/20] Initial plan


From 11adf566147796d5197c6d97c11e3fa6fcd873f5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Sep 2025 20:30:24 +0000
Subject: [PATCH 05/20] Implement parser updates for element inference and
 LAMMPS fixes

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 src/md_gro.c    |   5 ++
 src/md_lammps.c |  13 ++---
 src/md_mmcif.c  |  37 +++++++++++++-
 src/md_pdb.c    |  29 +++++++++++
 src/md_util.c   | 129 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/md_util.h   |   4 ++
 6 files changed, 209 insertions(+), 8 deletions(-)

diff --git a/src/md_gro.c b/src/md_gro.c
index 33c6114..8c405db 100644
--- a/src/md_gro.c
+++ b/src/md_gro.c
@@ -169,6 +169,7 @@ bool md_gro_molecule_init(struct md_molecule_t* mol, const md_gro_data_t* data,
     mol->atom.y       = md_array_create(float, capacity, alloc);
     mol->atom.z       = md_array_create(float, capacity, alloc);
     mol->atom.type    = md_array_create(md_label_t, capacity, alloc);
+    mol->atom.element = md_array_create(md_element_t, capacity, alloc);
 
     mol->atom.resid   = md_array_create(md_residue_id_t, capacity, alloc);
     mol->atom.resname = md_array_create(md_label_t, capacity, alloc);
@@ -198,6 +199,7 @@ bool md_gro_molecule_init(struct md_molecule_t* mol, const md_gro_data_t* data,
         mol->atom.y[i] = y;
         mol->atom.z[i] = z;
         mol->atom.type[i] = make_label(atom_name);
+        mol->atom.element[i] = 0; // Initialize to unknown, will be filled below
         mol->atom.resid[i] =  res_id;
         mol->atom.resname[i] = make_label(res_name);
         mol->atom.flags[i] = flags;
@@ -214,6 +216,9 @@ bool md_gro_molecule_init(struct md_molecule_t* mol, const md_gro_data_t* data,
 
     mol->unit_cell = md_util_unit_cell_from_matrix(box);
 
+    // Use hash-backed inference to assign elements (GRO typically lacks explicit element information)
+    md_util_element_guess(mol->atom.element, mol->atom.count, mol);
+
     return true;
 }
 
diff --git a/src/md_lammps.c b/src/md_lammps.c
index 514a35f..f2db116 100644
--- a/src/md_lammps.c
+++ b/src/md_lammps.c
@@ -453,8 +453,8 @@ static size_t parse_masses(float* mass_type_table, size_t mass_type_capacity, si
 		}
 		int   type = (int)parse_int(tok[0]);
 		float mass = (float)parse_float(tok[1]);
-		if (type >= (int)mass_type_capacity) {
-			MD_LOG_ERROR("Invalid atom type: %i", (int)num_tok);
+		if (type < 0 || (size_t)type >= mass_type_capacity) {
+			MD_LOG_ERROR("Invalid atom type index in Masses: %d (capacity %zu)", type, mass_type_capacity);
 			return 0;
 		}
 		mass_type_table[type] = mass;
@@ -715,7 +715,7 @@ bool md_lammps_molecule_init(md_molecule_t* mol, const md_lammps_data_t* data, m
 	}
 
 	for (size_t i = 0; i < data->num_atoms; ++i) {
-		mol->atom.type[i].len = (uint8_t)snprintf(mol->atom.type[i].buf, sizeof(mol->atom.type[i].buf), "%i", data->atoms[i].type);
+		mol->atom.type[i].len = (uint8_t)snprintf(mol->atom.type[i].buf, sizeof(mol->atom.type[i].buf), "lammps:%i", data->atoms[i].type);
 		mol->atom.x[i] = data->atoms[i].x - data->cell.xlo;
 		mol->atom.y[i] = data->atoms[i].y - data->cell.ylo;
 		mol->atom.z[i] = data->atoms[i].z - data->cell.zlo;
@@ -725,10 +725,11 @@ bool md_lammps_molecule_init(md_molecule_t* mol, const md_lammps_data_t* data, m
 		}
 	}
 
-	//Set elements
+	//Set elements using conservative mass→element mapping
 	md_array_resize(mol->atom.element, capacity, alloc);
-	if (!md_util_element_from_mass(mol->atom.element, mol->atom.mass, data->num_atoms)) {
-		MD_LOG_ERROR("One or more masses are missing matching element");
+	if (!md_util_lammps_element_from_mass(mol->atom.element, mol->atom.mass, data->num_atoms)) {
+		// CG/reduced-units detected or mapping failed, leave elements as 0
+		MD_LOG_DEBUG("LAMMPS data appears to be coarse-grained or reduced-units, elements left unassigned");
 	}
 
 	mol->atom.count = data->num_atoms;
diff --git a/src/md_mmcif.c b/src/md_mmcif.c
index 29df116..544ab61 100644
--- a/src/md_mmcif.c
+++ b/src/md_mmcif.c
@@ -148,8 +148,13 @@ static bool mmcif_parse_atom_site(md_atom_data_t* atom, md_buffered_reader_t* re
             int32_t entity_id = -1;
             
             str_t sym = tok[table[ATOM_SITE_TYPE_SYMBOL]];
-            uint32_t* entry = md_hashmap_get(&elem_map, md_hash64(sym.ptr, sym.len, 0));
-            md_element_t elem = entry ? (md_element_t)*entry : 0;
+            md_element_t elem = 0;
+            
+            // Prefer _atom_site.type_symbol if not '.'
+            if (sym.len > 0 && sym.ptr[0] != '.') {
+                uint32_t* entry = md_hashmap_get(&elem_map, md_hash64(sym.ptr, sym.len, 0));
+                elem = entry ? (md_element_t)*entry : 0;
+            }
 
             md_label_t type = make_label(tok[table[ATOM_SITE_LABEL_ATOM_ID]]);
 
@@ -226,6 +231,34 @@ static bool mmcif_parse_atom_site(md_atom_data_t* atom, md_buffered_reader_t* re
         md_array_ensure(atom->chainid,  capacity, alloc);
     }
 
+    // Fill in missing elements using hash-backed inference
+    for (size_t i = 0; i < (size_t)num_atoms; ++i) {
+        if (atom->element[i] == 0) {
+            // Use hash-backed inference for missing elements
+            str_t atom_label = LBL_TO_STR(atom->type[i]);
+            
+            // Try direct element lookup first
+            md_element_t elem = md_util_element_lookup_ignore_case(atom_label);
+            
+            // If that fails, use the hash-backed inference from md_util_element_guess
+            if (elem == 0) {
+                // Create a temporary molecule structure for the single atom to use element_guess
+                md_molecule_t temp_mol = {0};
+                temp_mol.atom.count = 1;
+                temp_mol.atom.type = &atom->type[i];
+                temp_mol.atom.resname = &atom->resname[i];
+                temp_mol.atom.flags = atom->flags ? &atom->flags[i] : NULL;
+                
+                md_element_t temp_element = 0;
+                if (md_util_element_guess(&temp_element, 1, &temp_mol)) {
+                    elem = temp_element;
+                }
+            }
+            
+            atom->element[i] = elem;
+        }
+    }
+
     atom->count = num_atoms;
 done:
     md_temp_set_pos_back(temp_pos);
diff --git a/src/md_pdb.c b/src/md_pdb.c
index f7aea57..4776d3f 100644
--- a/src/md_pdb.c
+++ b/src/md_pdb.c
@@ -666,6 +666,35 @@ bool md_pdb_molecule_init(md_molecule_t* mol, const md_pdb_data_t* data, md_pdb_
     ASSERT(md_array_size(mol->instance.label) == mol->instance.count);
     ASSERT(md_array_size(mol->instance.atom_range) == mol->instance.count);
 
+    // Fill in missing elements using hash-backed inference
+    for (size_t i = 0; i < mol->atom.count; ++i) {
+        if (mol->atom.element[i] == 0) {
+            // Use hash-backed inference for missing elements
+            str_t atom_name = LBL_TO_STR(mol->atom.type[i]);
+            
+            // Try direct element lookup first
+            md_element_t elem = md_util_element_lookup_ignore_case(atom_name);
+            
+            // If that fails, use the hash-backed inference from md_util_element_guess
+            // This handles cases like CA (carbon alpha vs calcium) using residue context
+            if (elem == 0) {
+                // Create a temporary molecule structure for the single atom to use element_guess
+                md_molecule_t temp_mol = {0};
+                temp_mol.atom.count = 1;
+                temp_mol.atom.type = &mol->atom.type[i];
+                temp_mol.atom.resname = &mol->atom.resname[i];
+                temp_mol.atom.flags = mol->atom.flags ? &mol->atom.flags[i] : NULL;
+                
+                md_element_t temp_element = 0;
+                if (md_util_element_guess(&temp_element, 1, &temp_mol)) {
+                    elem = temp_element;
+                }
+            }
+            
+            mol->atom.element[i] = elem;
+        }
+    }
+
     result = true;
 done:
     md_vm_arena_destroy(temp_alloc);
diff --git a/src/md_util.c b/src/md_util.c
index 15a6129..c003a7c 100644
--- a/src/md_util.c
+++ b/src/md_util.c
@@ -1424,6 +1424,135 @@ bool md_util_element_from_mass(md_element_t element[], const float mass[], size_
     }
 }
 
+bool md_util_lammps_element_from_mass(md_element_t out_element[], const float in_mass[], size_t count) {
+    if (!out_element) {
+        MD_LOG_ERROR("out_element is null");
+        return false;
+    }
+    if (!in_mass) {
+        MD_LOG_ERROR("in_mass is null");
+        return false;
+    }
+    if (count == 0) {
+        return true;
+    }
+
+    // Initialize output to zero (unknown element)
+    for (size_t i = 0; i < count; ++i) {
+        out_element[i] = 0;
+    }
+
+    // Standard atomic masses for common elements used in all-atom simulations
+    typedef struct {
+        md_element_t element;
+        float mass;
+        float tolerance;
+    } element_mass_entry_t;
+
+    static const element_mass_entry_t standard_masses[] = {
+        {H,  1.008f,  0.2f},   // Hydrogen: Z <= 10, tolerance 0.2 amu
+        {C,  12.011f, 0.2f},   // Carbon
+        {N,  14.007f, 0.2f},   // Nitrogen  
+        {O,  15.999f, 0.2f},   // Oxygen
+        {F,  18.998f, 0.2f},   // Fluorine
+        {Na, 22.990f, 0.3f},   // Sodium: Z > 10, <= 20, tolerance 0.3 amu
+        {Mg, 24.305f, 0.3f},   // Magnesium
+        {P,  30.974f, 0.3f},   // Phosphorus
+        {S,  32.06f,  0.3f},   // Sulfur
+        {Cl, 35.45f,  0.3f},   // Chlorine
+        {K,  39.098f, 0.3f},   // Potassium
+        {Ca, 40.078f, 0.3f},   // Calcium
+        {Fe, 55.845f, 0.5f},   // Iron: Z > 20, tolerance 0.5 amu
+        {Zn, 65.38f,  0.5f},   // Zinc
+        {Br, 79.904f, 0.5f},   // Bromine
+        {I,  126.904f, 0.5f},  // Iodine
+    };
+    static const size_t num_standard_masses = sizeof(standard_masses) / sizeof(standard_masses[0]);
+
+    // CG/reduced-units detection heuristics
+    
+    // Count how many unique masses we have
+    float unique_masses[256];
+    size_t unique_count = 0;
+    
+    for (size_t i = 0; i < count; ++i) {
+        const float mass = in_mass[i];
+        if (mass <= 0.0f) continue; // Skip invalid masses
+        
+        // Check if this mass is already in our unique list
+        bool found = false;
+        for (size_t j = 0; j < unique_count; ++j) {
+            if (fabsf(mass - unique_masses[j]) < 1e-6f) {
+                found = true;
+                break;
+            }
+        }
+        
+        if (!found && unique_count < sizeof(unique_masses)/sizeof(unique_masses[0])) {
+            unique_masses[unique_count++] = mass;
+        }
+    }
+    
+    // Heuristic 1: Too few unique masses for the number of atoms suggests CG
+    if (unique_count < 3 && count > 10) {
+        return false; // Likely CG, skip mapping
+    }
+    
+    // Heuristic 2: Check for CG-like mass values (e.g., 72.0, 1.0, etc.)
+    for (size_t i = 0; i < unique_count; ++i) {
+        const float mass = unique_masses[i];
+        // Check for typical CG mass values
+        if (fabsf(mass - 1.0f) < 0.001f ||    // Reduced units
+            fabsf(mass - 72.0f) < 0.001f ||   // Common CG mass
+            fabsf(mass - 36.0f) < 0.001f ||   // Common CG mass
+            mass > 200.0f) {                   // Unrealistically heavy for typical atoms
+            return false; // Likely CG/reduced units, skip mapping
+        }
+    }
+
+    // Perform conservative mass→element mapping
+    size_t successful_mappings = 0;
+    
+    for (size_t i = 0; i < count; ++i) {
+        const float mass = in_mass[i];
+        if (mass <= 0.0f) continue;
+        
+        md_element_t best_element = 0;
+        float best_diff = FLT_MAX;
+        size_t num_candidates = 0;
+        
+        // Find the best matching element
+        for (size_t j = 0; j < num_standard_masses; ++j) {
+            const float diff = fabsf(mass - standard_masses[j].mass);
+            if (diff <= standard_masses[j].tolerance) {
+                num_candidates++;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    best_element = standard_masses[j].element;
+                }
+            }
+        }
+        
+        // Only assign if we have exactly one candidate (no ambiguity)
+        if (num_candidates == 1) {
+            out_element[i] = best_element;
+            successful_mappings++;
+        }
+        // If multiple elements match within tolerance, leave as 0 (ambiguous)
+    }
+    
+    // If we couldn't map a reasonable fraction, might be CG
+    if (successful_mappings < count / 2) {
+        // Reset all to 0 and return false
+        for (size_t i = 0; i < count; ++i) {
+            out_element[i] = 0;
+        }
+        return false;
+    }
+    
+    return true;
+}
+
 const str_t* md_util_element_symbols(void) {
     return element_symbols;
 }
diff --git a/src/md_util.h b/src/md_util.h
index 2a98aa3..c423ad0 100644
--- a/src/md_util.h
+++ b/src/md_util.h
@@ -70,6 +70,10 @@ bool md_util_element_guess(md_element_t element[], size_t capacity, const struct
 
 bool md_util_element_from_mass(md_element_t out_element[], const float in_mass[], size_t count);
 
+// Conservative mass→element mapping for LAMMPS with CG/reduced-units detection
+// Returns false if data appears to be CG or reduced-units (skips mapping)
+bool md_util_lammps_element_from_mass(md_element_t out_element[], const float in_mass[], size_t count);
+
 // Computes secondary structures from backbone atoms
 // Does not allocate any data, it assumes that secondary_structures has the same length as mol.backbone.count
 bool md_util_backbone_secondary_structure_compute(md_secondary_structure_t secondary_structures[], size_t capacity, const struct md_molecule_t* mol);

From b40290240b4d7fdde386803d10f546167012b979 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 11 Sep 2025 20:37:43 +0000
Subject: [PATCH 06/20] =?UTF-8?q?Complete=20implementation=20with=20workin?=
 =?UTF-8?q?g=20unit=20tests=20for=20mass=E2=86=92element=20mapping?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 src/md_util.c        |  13 +++---
 unittest/test_util.c | 102 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+), 5 deletions(-)

diff --git a/src/md_util.c b/src/md_util.c
index c003a7c..e12fdb5 100644
--- a/src/md_util.c
+++ b/src/md_util.c
@@ -1429,7 +1429,7 @@ bool md_util_lammps_element_from_mass(md_element_t out_element[], const float in
         MD_LOG_ERROR("out_element is null");
         return false;
     }
-    if (!in_mass) {
+    if (count > 0 && !in_mass) {
         MD_LOG_ERROR("in_mass is null");
         return false;
     }
@@ -1501,13 +1501,16 @@ bool md_util_lammps_element_from_mass(md_element_t out_element[], const float in
     // Heuristic 2: Check for CG-like mass values (e.g., 72.0, 1.0, etc.)
     for (size_t i = 0; i < unique_count; ++i) {
         const float mass = unique_masses[i];
-        // Check for typical CG mass values
-        if (fabsf(mass - 1.0f) < 0.001f ||    // Reduced units
-            fabsf(mass - 72.0f) < 0.001f ||   // Common CG mass
+        // Check for typical CG mass values - be careful not to catch legitimate hydrogen masses
+        if (fabsf(mass - 72.0f) < 0.001f ||   // Common CG mass
             fabsf(mass - 36.0f) < 0.001f ||   // Common CG mass
             mass > 200.0f) {                   // Unrealistically heavy for typical atoms
             return false; // Likely CG/reduced units, skip mapping
         }
+        // Check for reduced units (exactly 1.0, not hydrogen mass around 1.008)
+        if (fabsf(mass - 1.0f) < 1e-6f) {  // Much stricter tolerance for exactly 1.0
+            return false; // Likely reduced units
+        }
     }
 
     // Perform conservative mass→element mapping
@@ -1542,7 +1545,7 @@ bool md_util_lammps_element_from_mass(md_element_t out_element[], const float in
     }
     
     // If we couldn't map a reasonable fraction, might be CG
-    if (successful_mappings < count / 2) {
+    if (successful_mappings == 0 || successful_mappings < (count + 2) / 3) { // At least 33% success rate
         // Reset all to 0 and return false
         for (size_t i = 0; i < count; ++i) {
             out_element[i] = 0;
diff --git a/unittest/test_util.c b/unittest/test_util.c
index ac70f98..5e9976e 100644
--- a/unittest/test_util.c
+++ b/unittest/test_util.c
@@ -814,3 +814,105 @@ UTEST(util, radix_sort) {
     	EXPECT_LE(arr[i], arr[i+1]);
     }
 }
+
+UTEST(util, lammps_mass_element_mapping) {
+    // Test conservative mass→element mapping for LAMMPS with various scenarios
+    
+    {
+        // Test all-atom case with standard elements
+        float masses[] = {1.008f, 12.011f, 14.007f, 15.999f, 30.974f, 32.06f};
+        md_element_t elements[6] = {0};
+        size_t count = ARRAY_SIZE(masses);
+        
+        EXPECT_TRUE(md_util_lammps_element_from_mass(elements, masses, count));
+        EXPECT_EQ(elements[0], 1);  // H
+        EXPECT_EQ(elements[1], 6);  // C  
+        EXPECT_EQ(elements[2], 7);  // N
+        EXPECT_EQ(elements[3], 8);  // O
+        EXPECT_EQ(elements[4], 15); // P
+        EXPECT_EQ(elements[5], 16); // S
+    }
+    
+    {
+        // Test with small deviations within tolerance
+        float masses[] = {1.01f, 12.1f, 14.1f, 15.9f}; // Within tolerance, but 1.01 instead of 1.0 to avoid reduced units filter
+        md_element_t elements[4] = {0};
+        size_t count = ARRAY_SIZE(masses);
+        
+        EXPECT_TRUE(md_util_lammps_element_from_mass(elements, masses, count));
+        EXPECT_EQ(elements[0], 1);  // H
+        EXPECT_EQ(elements[1], 6);  // C  
+        EXPECT_EQ(elements[2], 7);  // N
+        EXPECT_EQ(elements[3], 8);  // O
+    }
+    
+    {
+        // Test CG-like masses (should fail and return false)
+        float masses[] = {72.0f, 72.0f, 72.0f, 72.0f, 72.0f}; // Typical CG mass
+        md_element_t elements[5] = {0};
+        size_t count = ARRAY_SIZE(masses);
+        
+        EXPECT_FALSE(md_util_lammps_element_from_mass(elements, masses, count));
+        // All elements should remain 0 (unassigned)
+        for (size_t i = 0; i < count; ++i) {
+            EXPECT_EQ(elements[i], 0);
+        }
+    }
+    
+    {
+        // Test reduced units (should fail and return false)
+        float masses[] = {1.0f, 1.0f, 1.0f, 1.0f}; // Reduced units
+        md_element_t elements[4] = {0};
+        size_t count = ARRAY_SIZE(masses);
+        
+        EXPECT_FALSE(md_util_lammps_element_from_mass(elements, masses, count));
+        // All elements should remain 0 (unassigned)
+        for (size_t i = 0; i < count; ++i) {
+            EXPECT_EQ(elements[i], 0);
+        }
+    }
+    
+    {
+        // Test partial mapping case (some masses match, some don't)
+        float masses[] = {12.011f, 45.0f}; // Carbon matches, 45.0f doesn't match any element well
+        md_element_t elements[2] = {0};
+        size_t count = ARRAY_SIZE(masses);
+        
+        bool result = md_util_lammps_element_from_mass(elements, masses, count);
+        // Should succeed because at least one element can be mapped
+        // The first should map to Carbon, second should remain 0
+        EXPECT_TRUE(result);
+        EXPECT_EQ(elements[0], 6);  // C
+        EXPECT_EQ(elements[1], 0);  // Unknown/unmatched
+    }
+    
+    {
+        // Test too few unique masses for the number of atoms (CG heuristic)
+        float masses[20];
+        for (int i = 0; i < 20; ++i) {
+            masses[i] = (i < 10) ? 36.0f : 72.0f; // Only 2 unique masses for 20 atoms
+        }
+        md_element_t elements[20] = {0};
+        size_t count = ARRAY_SIZE(masses);
+        
+        EXPECT_FALSE(md_util_lammps_element_from_mass(elements, masses, count));
+        // All elements should remain 0 (unassigned)
+        for (size_t i = 0; i < count; ++i) {
+            EXPECT_EQ(elements[i], 0);
+        }
+    }
+    
+    {
+        // Test empty case
+        md_element_t elements[1] = {0};
+        EXPECT_TRUE(md_util_lammps_element_from_mass(elements, NULL, 0));
+    }
+    
+    {
+        // Test invalid input
+        float masses[] = {12.011f};
+        EXPECT_FALSE(md_util_lammps_element_from_mass(NULL, masses, 1));
+        md_element_t elements[1] = {0};
+        EXPECT_FALSE(md_util_lammps_element_from_mass(elements, NULL, 1));
+    }
+}

From 47f9ae3e51ab01625394a408a8fdd6302e96f5ce Mon Sep 17 00:00:00 2001
From: scanberg <scanberg@gmail.com>
Date: Fri, 12 Sep 2025 09:48:44 +0200
Subject: [PATCH 07/20] MD_Z_XX how has the form MD_Z_Xx. inlined string
 comparison between str and cstr added water resname types

---
 src/core/md_atomic.h                 | 210 +++++++++++++--------------
 src/core/md_str.h                    |   9 +-
 src/md_atomic_infer.c                |  20 +--
 src/md_lammps.c                      |  26 ++--
 src/md_lammps.h                      |   2 +-
 src/md_util.c                        |   2 +-
 unittest/test_api_equivalence.c      |   4 +-
 unittest/test_atomic.c               |  38 ++---
 unittest/test_element_guess_compat.c |   4 +-
 9 files changed, 159 insertions(+), 156 deletions(-)

diff --git a/src/core/md_atomic.h b/src/core/md_atomic.h
index af63843..19c8a6e 100644
--- a/src/core/md_atomic.h
+++ b/src/core/md_atomic.h
@@ -1,4 +1,4 @@
-#pragma once
+﻿#pragma once
 
 #include <core/md_str.h>
 #include <stdint.h>
@@ -21,123 +21,123 @@ struct md_molecule_t;
 enum {
     MD_Z_X  = 0,   // Unknown
     MD_Z_H  = 1,   // Hydrogen
-    MD_Z_HE = 2,   // Helium
-    MD_Z_LI = 3,   // Lithium
-    MD_Z_BE = 4,   // Beryllium
+    MD_Z_He = 2,   // Helium
+    MD_Z_Li = 3,   // Lithium
+    MD_Z_Be = 4,   // Beryllium
     MD_Z_B  = 5,   // Boron
     MD_Z_C  = 6,   // Carbon
     MD_Z_N  = 7,   // Nitrogen
     MD_Z_O  = 8,   // Oxygen
     MD_Z_F  = 9,   // Fluorine
-    MD_Z_NE = 10,  // Neon
-    MD_Z_NA = 11,  // Sodium
-    MD_Z_MG = 12,  // Magnesium
-    MD_Z_AL = 13,  // Aluminium
-    MD_Z_SI = 14,  // Silicon
+    MD_Z_Ne = 10,  // Neon
+    MD_Z_Na = 11,  // Sodium
+    MD_Z_Mg = 12,  // Magnesium
+    MD_Z_Al = 13,  // Aluminium
+    MD_Z_Si = 14,  // Silicon
     MD_Z_P  = 15,  // Phosphorus
     MD_Z_S  = 16,  // Sulfur
-    MD_Z_CL = 17,  // Chlorine
-    MD_Z_AR = 18,  // Argon
+    MD_Z_Cl = 17,  // Chlorine
+    MD_Z_Ar = 18,  // Argon
     MD_Z_K  = 19,  // Potassium
-    MD_Z_CA = 20,  // Calcium
-    MD_Z_SC = 21,  // Scandium
-    MD_Z_TI = 22,  // Titanium
+    MD_Z_Ca = 20,  // Calcium
+    MD_Z_Sc = 21,  // Scandium
+    MD_Z_Ti = 22,  // Titanium
     MD_Z_V  = 23,  // Vanadium
-    MD_Z_CR = 24,  // Chromium
-    MD_Z_MN = 25,  // Manganese
-    MD_Z_FE = 26,  // Iron
-    MD_Z_CO = 27,  // Cobalt
-    MD_Z_NI = 28,  // Nickel
-    MD_Z_CU = 29,  // Copper
-    MD_Z_ZN = 30,  // Zinc
-    MD_Z_GA = 31,  // Gallium
-    MD_Z_GE = 32,  // Germanium
-    MD_Z_AS = 33,  // Arsenic
-    MD_Z_SE = 34,  // Selenium
-    MD_Z_BR = 35,  // Bromine
-    MD_Z_KR = 36,  // Krypton
-    MD_Z_RB = 37,  // Rubidium
-    MD_Z_SR = 38,  // Strontium
+    MD_Z_Cr = 24,  // Chromium
+    MD_Z_Mn = 25,  // Manganese
+    MD_Z_Fe = 26,  // Iron
+    MD_Z_Co = 27,  // Cobalt
+    MD_Z_Ni = 28,  // Nickel
+    MD_Z_Cu = 29,  // Copper
+    MD_Z_Zn = 30,  // Zinc
+    MD_Z_Ga = 31,  // Gallium
+    MD_Z_Ge = 32,  // Germanium
+    MD_Z_As = 33,  // Arsenic
+    MD_Z_Se = 34,  // Selenium
+    MD_Z_Br = 35,  // Bromine
+    MD_Z_Kr = 36,  // Krypton
+    MD_Z_Rb = 37,  // Rubidium
+    MD_Z_Sr = 38,  // Strontium
     MD_Z_Y  = 39,  // Yttrium
-    MD_Z_ZR = 40,  // Zirconium
-    MD_Z_NB = 41,  // Niobium
-    MD_Z_MO = 42,  // Molybdenum
-    MD_Z_TC = 43,  // Technetium
-    MD_Z_RU = 44,  // Ruthenium
-    MD_Z_RH = 45,  // Rhodium
-    MD_Z_PD = 46,  // Palladium
-    MD_Z_AG = 47,  // Silver
-    MD_Z_CD = 48,  // Cadmium
-    MD_Z_IN = 49,  // Indium
-    MD_Z_SN = 50,  // Tin
-    MD_Z_SB = 51,  // Antimony
-    MD_Z_TE = 52,  // Tellurium
+    MD_Z_Zr = 40,  // Zirconium
+    MD_Z_Nb = 41,  // Niobium
+    MD_Z_Mo = 42,  // Molybdenum
+    MD_Z_Tc = 43,  // Technetium
+    MD_Z_Ru = 44,  // Ruthenium
+    MD_Z_Rh = 45,  // Rhodium
+    MD_Z_Pd = 46,  // Palladium
+    MD_Z_Ag = 47,  // Silver
+    MD_Z_Cd = 48,  // Cadmium
+    MD_Z_In = 49,  // Indium
+    MD_Z_Sn = 50,  // Tin
+    MD_Z_Sb = 51,  // Antimony
+    MD_Z_Te = 52,  // Tellurium
     MD_Z_I  = 53,  // Iodine
-    MD_Z_XE = 54,  // Xenon
-    MD_Z_CS = 55,  // Caesium
-    MD_Z_BA = 56,  // Barium
-    MD_Z_LA = 57,  // Lanthanum
-    MD_Z_CE = 58,  // Cerium
-    MD_Z_PR = 59,  // Praseodymium
-    MD_Z_ND = 60,  // Neodymium
-    MD_Z_PM = 61,  // Promethium
-    MD_Z_SM = 62,  // Samarium
-    MD_Z_EU = 63,  // Europium
-    MD_Z_GD = 64,  // Gadolinium
-    MD_Z_TB = 65,  // Terbium
-    MD_Z_DY = 66,  // Dysprosium
-    MD_Z_HO = 67,  // Holmium
-    MD_Z_ER = 68,  // Erbium
-    MD_Z_TM = 69,  // Thulium
-    MD_Z_YB = 70,  // Ytterbium
-    MD_Z_LU = 71,  // Lutetium
-    MD_Z_HF = 72,  // Hafnium
-    MD_Z_TA = 73,  // Tantalum
+    MD_Z_Xe = 54,  // Xenon
+    MD_Z_Cs = 55,  // Caesium
+    MD_Z_Ba = 56,  // Barium
+    MD_Z_La = 57,  // Lanthanum
+    MD_Z_Ce = 58,  // Cerium
+    MD_Z_Pr = 59,  // Praseodymium
+    MD_Z_Nd = 60,  // Neodymium
+    MD_Z_Pm = 61,  // Promethium
+    MD_Z_Sm = 62,  // Samarium
+    MD_Z_Eu = 63,  // Europium
+    MD_Z_Gd = 64,  // Gadolinium
+    MD_Z_Tb = 65,  // Terbium
+    MD_Z_Dy = 66,  // Dysprosium
+    MD_Z_Ho = 67,  // Holmium
+    MD_Z_Er = 68,  // Erbium
+    MD_Z_Tm = 69,  // Thulium
+    MD_Z_Yb = 70,  // Ytterbium
+    MD_Z_Lu = 71,  // Lutetium
+    MD_Z_Hf = 72,  // Hafnium
+    MD_Z_Ta = 73,  // Tantalum
     MD_Z_W  = 74,  // Tungsten
-    MD_Z_RE = 75,  // Rhenium
-    MD_Z_OS = 76,  // Osmium
-    MD_Z_IR = 77,  // Iridium
-    MD_Z_PT = 78,  // Platinum
-    MD_Z_AU = 79,  // Gold
-    MD_Z_HG = 80,  // Mercury
-    MD_Z_TL = 81,  // Thallium
-    MD_Z_PB = 82,  // Lead
-    MD_Z_BI = 83,  // Bismuth
-    MD_Z_PO = 84,  // Polonium
-    MD_Z_AT = 85,  // Astatine
-    MD_Z_RN = 86,  // Radon
-    MD_Z_FR = 87,  // Francium
-    MD_Z_RA = 88,  // Radium
-    MD_Z_AC = 89,  // Actinium
-    MD_Z_TH = 90,  // Thorium
-    MD_Z_PA = 91,  // Protactinium
+    MD_Z_Re = 75,  // Rhenium
+    MD_Z_Os = 76,  // Osmium
+    MD_Z_Ir = 77,  // Iridium
+    MD_Z_Pt = 78,  // Platinum
+    MD_Z_Au = 79,  // Gold
+    MD_Z_Hg = 80,  // Mercury
+    MD_Z_Tl = 81,  // Thallium
+    MD_Z_Pb = 82,  // Lead
+    MD_Z_Bi = 83,  // Bismuth
+    MD_Z_Po = 84,  // Polonium
+    MD_Z_At = 85,  // Astatine
+    MD_Z_Rn = 86,  // Radon
+    MD_Z_Fr = 87,  // Francium
+    MD_Z_Ra = 88,  // Radium
+    MD_Z_Ac = 89,  // Actinium
+    MD_Z_Th = 90,  // Thorium
+    MD_Z_Pa = 91,  // Protactinium
     MD_Z_U  = 92,  // Uranium
-    MD_Z_NP = 93,  // Neptunium
-    MD_Z_PU = 94,  // Plutonium
-    MD_Z_AM = 95,  // Americium
-    MD_Z_CM = 96,  // Curium
-    MD_Z_BK = 97,  // Berkelium
-    MD_Z_CF = 98,  // Californium
-    MD_Z_ES = 99,  // Einsteinium
-    MD_Z_FM = 100, // Fermium
-    MD_Z_MD = 101, // Mendelevium
-    MD_Z_NO = 102, // Nobelium
-    MD_Z_LR = 103, // Lawrencium
-    MD_Z_RF = 104, // Rutherfordium
-    MD_Z_DB = 105, // Dubnium
-    MD_Z_SG = 106, // Seaborgium
-    MD_Z_BH = 107, // Bohrium
-    MD_Z_HS = 108, // Hassium
-    MD_Z_MT = 109, // Meitnerium
-    MD_Z_DS = 110, // Darmstadtium
-    MD_Z_RG = 111, // Roentgenium
-    MD_Z_CN = 112, // Copernicium
-    MD_Z_NH = 113, // Nihonium
-    MD_Z_FL = 114, // Flerovium
-    MD_Z_MC = 115, // Moscovium
-    MD_Z_LV = 116, // Livermorium
-    MD_Z_TS = 117, // Tennessine
-    MD_Z_OG = 118, // Oganesson
+    MD_Z_Np = 93,  // Neptunium
+    MD_Z_Pu = 94,  // Plutonium
+    MD_Z_Am = 95,  // Americium
+    MD_Z_Cm = 96,  // Curium
+    MD_Z_Bk = 97,  // Berkelium
+    MD_Z_Cf = 98,  // Californium
+    MD_Z_Es = 99,  // Einsteinium
+    MD_Z_Fm = 100, // Fermium
+    MD_Z_Md = 101, // Mendelevium
+    MD_Z_No = 102, // Nobelium
+    MD_Z_Lr = 103, // Lawrencium
+    MD_Z_Rf = 104, // Rutherfordium
+    MD_Z_Db = 105, // Dubnium
+    MD_Z_Sg = 106, // Seaborgium
+    MD_Z_Bh = 107, // Bohrium
+    MD_Z_Hs = 108, // Hassium
+    MD_Z_Mt = 109, // Meitnerium
+    MD_Z_Ds = 110, // Darmstadtium
+    MD_Z_Rg = 111, // Roentgenium
+    MD_Z_Cn = 112, // Copernicium
+    MD_Z_Nh = 113, // Nihonium
+    MD_Z_Fl = 114, // Flerovium
+    MD_Z_Mc = 115, // Moscovium
+    MD_Z_Lv = 116, // Livermorium
+    MD_Z_Ts = 117, // Tennessine
+    MD_Z_Og = 118, // Oganesson
 };
 
 // New preferred API names
diff --git a/src/core/md_str.h b/src/core/md_str.h
index 232a738..68c11ef 100644
--- a/src/core/md_str.h
+++ b/src/core/md_str.h
@@ -1,4 +1,4 @@
-#pragma once
+﻿#pragma once
 
 #include <stddef.h>
 #include <stdint.h>
@@ -111,7 +111,12 @@ bool str_eq_n_ignore_case(const str_t str_a, const str_t str_b, size_t n);
 // Lexicographical comparison
 int str_cmp_lex(str_t a, str_t b);
 
-bool str_eq_cstr(str_t str, const char* cstr);
+static inline bool str_eq_cstr(str_t str, const char* cstr) {
+    if (!str.ptr || !str.len || !cstr) return false;
+    if (str.ptr[0] != cstr[0]) return false;
+    return (strncmp(str.ptr, cstr, str.len) == 0) && cstr[str.len] == '\0';
+}
+
 bool str_eq_cstr_n(str_t str, const char* cstr, size_t n);
 bool str_eq_cstr_ignore_case(str_t str, const char* cstr);
 bool str_eq_cstr_n_ignore_case(str_t str, const char* cstr, size_t n);
diff --git a/src/md_atomic_infer.c b/src/md_atomic_infer.c
index 386acc6..2b6f0a1 100644
--- a/src/md_atomic_infer.c
+++ b/src/md_atomic_infer.c
@@ -1,4 +1,4 @@
-#include <core/md_atomic.h>
+﻿#include <core/md_atomic.h>
 #include <md_molecule.h>
 #include <md_util.h>
 
@@ -146,21 +146,21 @@ static void init_lookup_tables(void) {
     char mse_key[64];
     snprintf(mse_key, sizeof(mse_key), "MSE\tSE");
     uint64_t mse_hash = md_hash64(mse_key, strlen(mse_key), 0);
-    md_hashmap_add(&residue_atom_map, mse_hash, MD_Z_SE);
+    md_hashmap_add(&residue_atom_map, mse_hash, MD_Z_Se);
     
     // Atom-only fallbacks
     struct { const char* atom; md_atomic_number_t z; } atom_entries[] = {
         {"H", MD_Z_H}, {"C", MD_Z_C}, {"N", MD_Z_N}, {"O", MD_Z_O}, {"S", MD_Z_S}, {"P", MD_Z_P},
-        {"F", MD_Z_F}, {"CL", MD_Z_CL}, {"BR", MD_Z_BR}, {"I", MD_Z_I},
+        {"F", MD_Z_F}, {"CL", MD_Z_Cl}, {"BR", MD_Z_Br}, {"I", MD_Z_I},
         {"OW", MD_Z_O}, {"OH", MD_Z_O}, {"HW", MD_Z_H},
         {"CA", MD_Z_C}, {"CB", MD_Z_C}, {"CG", MD_Z_C}, {"CD", MD_Z_C}, {"CE", MD_Z_C}, {"CZ", MD_Z_C},
         {"OXT", MD_Z_O},
         // Common ions
-        {"NA", MD_Z_NA}, {"K", MD_Z_K}, {"MG", MD_Z_MG}, {"ZN", MD_Z_ZN}, {"FE", MD_Z_FE},
-        {"MN", MD_Z_MN}, {"CU", MD_Z_CU}, {"CO", MD_Z_CO}, {"NI", MD_Z_NI}, {"CD", MD_Z_CD},
-        {"SR", MD_Z_SR}, {"BA", MD_Z_BA}, {"LI", MD_Z_LI}, {"CS", MD_Z_CS}, {"RB", MD_Z_RB},
-        {"AL", MD_Z_AL}, {"TI", MD_Z_TI}, {"CR", MD_Z_CR}, {"HG", MD_Z_HG}, {"PB", MD_Z_PB},
-        {"AG", MD_Z_AG}, {"AU", MD_Z_AU}, {"PT", MD_Z_PT},
+        {"NA", MD_Z_Na}, {"K", MD_Z_K}, {"MG", MD_Z_Mg}, {"ZN", MD_Z_Zn}, {"FE", MD_Z_Fe},
+        {"MN", MD_Z_Mn}, {"CU", MD_Z_Cu}, {"CO", MD_Z_Co}, {"NI", MD_Z_Ni}, {"CD", MD_Z_Cd},
+        {"SR", MD_Z_Sr}, {"BA", MD_Z_Ba}, {"LI", MD_Z_Li}, {"CS", MD_Z_Cs}, {"RB", MD_Z_Rb},
+        {"AL", MD_Z_Al}, {"TI", MD_Z_Ti}, {"CR", MD_Z_Cr}, {"HG", MD_Z_Hg}, {"PB", MD_Z_Pb},
+        {"AG", MD_Z_Ag}, {"AU", MD_Z_Au}, {"PT", MD_Z_Pt},
     };
     
     // Add atom-only entries
@@ -275,7 +275,7 @@ md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name)
     if (atom_result) {
         md_atomic_number_t z = (md_atomic_number_t)*atom_result;
         // Override CA (Calcium) to Carbon if in amino acid context
-        if (z == MD_Z_CA && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
+        if (z == MD_Z_Ca && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
             return MD_Z_C;
         }
         return z;
@@ -288,7 +288,7 @@ md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name)
         md_atomic_number_t two_z = md_atomic_number_from_symbol_icase(two_letter_str);
         if (two_z != MD_Z_X) {
             // Override CA (Calcium) to Carbon if in amino acid context
-            if (two_z == MD_Z_CA && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
+            if (two_z == MD_Z_Ca && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
                 return MD_Z_C;
             }
             return two_z;
diff --git a/src/md_lammps.c b/src/md_lammps.c
index f2db116..fc1125d 100644
--- a/src/md_lammps.c
+++ b/src/md_lammps.c
@@ -1,4 +1,4 @@
-#include <md_lammps.h>
+﻿#include <md_lammps.h>
 
 #include <core/md_common.h>
 #include <core/md_str.h>
@@ -448,7 +448,7 @@ static size_t parse_masses(float* mass_type_table, size_t mass_type_capacity, si
 		}
 		const size_t num_tok = extract_tokens(tok, ARRAY_SIZE(tok), &line);
 		if (num_tok < 2) {
-			MD_LOG_ERROR("Failed to parse mass line, expected 2 tokens, got %i", (int)num_tok);
+			MD_LOG_ERROR("Failed to parse mass line, expected 2 tokens, got %zu", num_tok);
 			return 0;
 		}
 		int   type = (int)parse_int(tok[0]);
@@ -527,7 +527,7 @@ static bool md_lammps_data_parse(md_lammps_data_t* data, md_buffered_reader_t* r
 
 	MEMSET(data, 0, sizeof(md_lammps_data_t));
 
-	float mass_table[256] = {0};
+	double atom_type_mass_table[512] = {0};
 
 	str_copy_to_char_buf(data->title, sizeof(data->title), str_trim(line));
 
@@ -547,11 +547,9 @@ static bool md_lammps_data_parse(md_lammps_data_t* data, md_buffered_reader_t* r
 			// Sort atoms by id
 			qsort(data->atoms, data->num_atoms, sizeof(md_lammps_atom_t), compare_atom);
 
-			if (mass_table) {
-				for (size_t i = 0; i < data->num_atoms; ++i) {
-					int32_t type = data->atoms[i].type;
-					data->atoms[i].mass = type < (int)ARRAY_SIZE(mass_table) ? mass_table[type] : 0.0f;
-				}
+			for (size_t i = 0; i < data->num_atoms; ++i) {
+				int32_t type = data->atoms[i].type;
+				data->atoms[i].mass = type < (int)ARRAY_SIZE(atom_type_mass_table) ? atom_type_mass_table[type] : 0.0f;
 			}
 		} else if (num_tok > 0 && str_eq(tok[0], STR_LIT("Bonds"))) {
 			if (!data->num_bonds) {
@@ -589,7 +587,7 @@ static bool md_lammps_data_parse(md_lammps_data_t* data, md_buffered_reader_t* r
 				return false;
 			}
 			md_buffered_reader_skip_line(reader);
-			if (parse_masses(mass_table, ARRAY_SIZE(mass_table), data->num_atom_types, reader) != data->num_atom_types) {
+			if (parse_masses(atom_type_mass_table, ARRAY_SIZE(atom_type_mass_table), data->num_atom_types, reader) != data->num_atom_types) {
 				MD_LOG_ERROR("Number of masses in table did not match the number of atom types");
 				return false;
 			}
@@ -715,7 +713,7 @@ bool md_lammps_molecule_init(md_molecule_t* mol, const md_lammps_data_t* data, m
 	}
 
 	for (size_t i = 0; i < data->num_atoms; ++i) {
-		mol->atom.type[i].len = (uint8_t)snprintf(mol->atom.type[i].buf, sizeof(mol->atom.type[i].buf), "lammps:%i", data->atoms[i].type);
+		mol->atom.type[i].len = (uint8_t)snprintf(mol->atom.type[i].buf, sizeof(mol->atom.type[i].buf), "%i", data->atoms[i].type);
 		mol->atom.x[i] = data->atoms[i].x - data->cell.xlo;
 		mol->atom.y[i] = data->atoms[i].y - data->cell.ylo;
 		mol->atom.z[i] = data->atoms[i].z - data->cell.zlo;
@@ -815,12 +813,12 @@ md_molecule_loader_i* md_lammps_molecule_api(void) {
 
 //Reads data that is useful later when we want to parse a frame from the trajectory
 bool lammps_get_header(struct md_trajectory_o* inst, md_trajectory_header_t* header) {
-	lammps_trajectory_t* dataPtr = (lammps_trajectory_t*)inst;
-	ASSERT(dataPtr);
-	ASSERT(dataPtr->magic == MD_LAMMPS_TRAJ_MAGIC);
+	lammps_trajectory_t* traj = (lammps_trajectory_t*)inst;
+	ASSERT(traj);
+	ASSERT(traj->magic == MD_LAMMPS_TRAJ_MAGIC);
 	ASSERT(header);
 
-	*header = dataPtr->header;
+	*header = traj->header;
 	return true;
 }
 
diff --git a/src/md_lammps.h b/src/md_lammps.h
index 9fe640c..9324a68 100644
--- a/src/md_lammps.h
+++ b/src/md_lammps.h
@@ -1,4 +1,4 @@
-#pragma once
+﻿#pragma once
 
 #include <stdint.h>
 #include <stdbool.h>
diff --git a/src/md_util.c b/src/md_util.c
index 8f4bec5..53e73fa 100644
--- a/src/md_util.c
+++ b/src/md_util.c
@@ -357,7 +357,7 @@ static const char* acidic[] = { "ASP", "GLU" };
 static const char* basic[] = { "ARG", "HIS", "LYS" };
 
 static const char* neutral[] = { "VAL", "PHE", "GLN", "TYR", "HIS", "CYS", "MET", "TRP", "ASX", "GLX", "PCA", "HYP" };
-static const char* water[] = { "H2O", "HHO", "OHH", "HOH", "OH2", "SOL", "WAT", "TIP", "TIP2", "TIP3", "TIP4", "W", "DOD", "D30" };
+static const char* water[] = { "H2O", "HHO", "OHH", "HOH", "OH2", "SOL", "WAT", "TIP", "TIP2", "TIP3", "TIP4", "TIP5", "W", "DOD", "D30", "SPC" };
 static const char* hydrophobic[] = { "ALA", "VAL", "ILE", "LEU", "MET", "PHE", "TYR", "TRP", "CYX" };
 
 // Taken from here
diff --git a/unittest/test_api_equivalence.c b/unittest/test_api_equivalence.c
index 1e227bc..8d48d3a 100644
--- a/unittest/test_api_equivalence.c
+++ b/unittest/test_api_equivalence.c
@@ -1,4 +1,4 @@
-// Test to verify API equivalence between old and new systems
+﻿// Test to verify API equivalence between old and new systems
 #include "utest.h"
 #include <core/md_atomic.h>
 #include <md_util.h>
@@ -22,7 +22,7 @@ UTEST(api_equivalence, symbol_lookup_consistency) {
 
 UTEST(api_equivalence, property_consistency) {
     // Test a few key elements for property consistency
-    md_atomic_number_t test_elements[] = {MD_Z_H, MD_Z_C, MD_Z_N, MD_Z_O, MD_Z_CA, MD_Z_FE};
+    md_atomic_number_t test_elements[] = {MD_Z_H, MD_Z_C, MD_Z_N, MD_Z_O, MD_Z_Ca, MD_Z_Fe};
     
     for (size_t i = 0; i < ARRAY_SIZE(test_elements); ++i) {
         md_atomic_number_t z = test_elements[i];
diff --git a/unittest/test_atomic.c b/unittest/test_atomic.c
index dfddc35..ffe9e99 100644
--- a/unittest/test_atomic.c
+++ b/unittest/test_atomic.c
@@ -1,4 +1,4 @@
-#include "utest.h"
+﻿#include "utest.h"
 #include <core/md_atomic.h>
 #include <md_util.h>
 #include <core/md_allocator.h>
@@ -8,32 +8,32 @@ UTEST(atomic, enum_constants) {
     // Test that enum constants are correct
     EXPECT_EQ(MD_Z_X, 0);   // Unknown
     EXPECT_EQ(MD_Z_H, 1);   // Hydrogen
-    EXPECT_EQ(MD_Z_HE, 2);  // Helium
+    EXPECT_EQ(MD_Z_He, 2);  // Helium
     EXPECT_EQ(MD_Z_C, 6);   // Carbon
     EXPECT_EQ(MD_Z_N, 7);   // Nitrogen
     EXPECT_EQ(MD_Z_O, 8);   // Oxygen
     EXPECT_EQ(MD_Z_P, 15);  // Phosphorus
     EXPECT_EQ(MD_Z_S, 16);  // Sulfur
-    EXPECT_EQ(MD_Z_CA, 20); // Calcium
-    EXPECT_EQ(MD_Z_CL, 17); // Chlorine
-    EXPECT_EQ(MD_Z_BR, 35); // Bromine
-    EXPECT_EQ(MD_Z_NA, 11); // Sodium
-    EXPECT_EQ(MD_Z_FE, 26); // Iron
-    EXPECT_EQ(MD_Z_OG, 118); // Oganesson
+    EXPECT_EQ(MD_Z_Ca, 20); // Calcium
+    EXPECT_EQ(MD_Z_Cl, 17); // Chlorine
+    EXPECT_EQ(MD_Z_Br, 35); // Bromine
+    EXPECT_EQ(MD_Z_Na, 11); // Sodium
+    EXPECT_EQ(MD_Z_Fe, 26); // Iron
+    EXPECT_EQ(MD_Z_Og, 118); // Oganesson
 }
 
 UTEST(atomic, symbol_lookup) {
     // Test symbol lookup functions
     EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("H")), MD_Z_H);
     EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("C")), MD_Z_C);
-    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("He")), MD_Z_HE);
-    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("Ca")), MD_Z_CA);
+    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("He")), MD_Z_He);
+    EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("Ca")), MD_Z_Ca);
     EXPECT_EQ(md_atomic_number_from_symbol(STR_LIT("Unknown")), MD_Z_X);
     
     // Test case insensitive lookup
     EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("h")), MD_Z_H);
-    EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("ca")), MD_Z_CA);
-    EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("HE")), MD_Z_HE);
+    EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("ca")), MD_Z_Ca);
+    EXPECT_EQ(md_atomic_number_from_symbol_icase(STR_LIT("HE")), MD_Z_He);
 }
 
 UTEST(atomic, symbol_from_number) {
@@ -44,7 +44,7 @@ UTEST(atomic, symbol_from_number) {
     str_t c_symbol = md_symbol_from_atomic_number(MD_Z_C);
     EXPECT_TRUE(str_eq_cstr(c_symbol, "C"));
     
-    str_t ca_symbol = md_symbol_from_atomic_number(MD_Z_CA);
+    str_t ca_symbol = md_symbol_from_atomic_number(MD_Z_Ca);
     EXPECT_TRUE(str_eq_cstr(ca_symbol, "Ca"));
 }
 
@@ -89,16 +89,16 @@ UTEST(atomic, inference_nucleic_acid) {
 
 UTEST(atomic, inference_ions) {
     // Test ion inference (residue name is element)
-    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("NA"), STR_LIT("NA")), MD_Z_NA);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("NA"), STR_LIT("NA")), MD_Z_Na);
     EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT(""), STR_LIT("K")), MD_Z_K);
-    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("MG"), STR_LIT("MG")), MD_Z_MG);
-    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL"), STR_LIT("CL")), MD_Z_CL);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("MG"), STR_LIT("MG")), MD_Z_Mg);
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL"), STR_LIT("CL")), MD_Z_Cl);
 }
 
 UTEST(atomic, inference_fallbacks) {
     // Test fallback mechanisms
-    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL12"), STR_LIT("")), MD_Z_CL);  // Two-letter heuristic
-    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("BR1"), STR_LIT("")), MD_Z_BR);   // Two-letter heuristic
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("CL12"), STR_LIT("")), MD_Z_Cl);  // Two-letter heuristic
+    EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("BR1"), STR_LIT("")), MD_Z_Br);   // Two-letter heuristic
     EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("H123"), STR_LIT("")), MD_Z_H);   // First letter fallback
     EXPECT_EQ(md_atom_infer_atomic_number(STR_LIT("C99"), STR_LIT("")), MD_Z_C);    // First letter fallback
 }
@@ -106,7 +106,7 @@ UTEST(atomic, inference_fallbacks) {
 UTEST(atomic, backward_compatibility) {
     // Test that old API still works through wrappers
     EXPECT_EQ(md_util_element_lookup(STR_LIT("H")), MD_Z_H);
-    EXPECT_EQ(md_util_element_lookup_ignore_case(STR_LIT("ca")), MD_Z_CA);
+    EXPECT_EQ(md_util_element_lookup_ignore_case(STR_LIT("ca")), MD_Z_Ca);
     
     str_t symbol = md_util_element_symbol(MD_Z_C);
     EXPECT_TRUE(str_eq_cstr(symbol, "C"));
diff --git a/unittest/test_element_guess_compat.c b/unittest/test_element_guess_compat.c
index cdab95d..6e3acdd 100644
--- a/unittest/test_element_guess_compat.c
+++ b/unittest/test_element_guess_compat.c
@@ -1,4 +1,4 @@
-#include "utest.h"
+﻿#include "utest.h"
 #include <md_util.h>
 #include <md_molecule.h>
 #include <core/md_allocator.h>
@@ -65,7 +65,7 @@ UTEST(element_guess_compat, basic_inference) {
     EXPECT_EQ(mol.atom.element[0], MD_Z_O);  // Water oxygen
     EXPECT_EQ(mol.atom.element[1], MD_Z_H);  // Water hydrogen
     EXPECT_EQ(mol.atom.element[2], MD_Z_C);  // Alanine alpha carbon (not calcium!)
-    EXPECT_EQ(mol.atom.element[3], MD_Z_NA); // Sodium ion
+    EXPECT_EQ(mol.atom.element[3], MD_Z_Na); // Sodium ion
     EXPECT_EQ(mol.atom.element[4], MD_Z_C);  // Generic carbon from C1
     
     md_vm_arena_destroy(alloc);

From 6cd762c5dc956d2b3e49854fbd2e58245238a199 Mon Sep 17 00:00:00 2001
From: scanberg <scanberg@gmail.com>
Date: Fri, 12 Sep 2025 09:53:32 +0200
Subject: [PATCH 08/20] removed conflicting str_eq_cstr body

---
 src/core/md_str.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/core/md_str.c b/src/core/md_str.c
index 7f7342c..5d5395c 100644
--- a/src/core/md_str.c
+++ b/src/core/md_str.c
@@ -1,4 +1,4 @@
-#include <core/md_str.h>
+﻿#include <core/md_str.h>
 
 #include <core/md_common.h>
 #include <core/md_os.h>
@@ -42,12 +42,6 @@ bool str_eq_n_ignore_case(const str_t str_a, const str_t str_b, size_t n) {
 	return true;
 }
 
-bool str_eq_cstr(str_t str, const char* cstr) {
-    if (!str.ptr || !str.len || !cstr) return false;
-    if (str.ptr[0] != cstr[0]) return false;
-    return (strncmp(str.ptr, cstr, str.len) == 0) && cstr[str.len] == '\0';
-}
-
 // Compare str and cstr only up to n characters
 bool str_eq_cstr_n(str_t str, const char* cstr, size_t n) {
     if (!n) return false;

From 75a28f898b2c0b6fb67c49595b40a1c856ec9a4f Mon Sep 17 00:00:00 2001
From: scanberg <scanberg@gmail.com>
Date: Fri, 12 Sep 2025 10:11:20 +0200
Subject: [PATCH 09/20] Updated testdata

---
 test_data | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_data b/test_data
index ea52f2c..1c7b296 160000
--- a/test_data
+++ b/test_data
@@ -1 +1 @@
-Subproject commit ea52f2cbcf57ed6b52aacc53219bcc5dcf817b6a
+Subproject commit 1c7b296307ac33a895f3c4d2fe68364ec9096365

From 7143dd316942563b9fc489877f40a3588b8b1308 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 12 Sep 2025 08:24:51 +0000
Subject: [PATCH 10/20] Add comprehensive PDB element inference validation
 tests

- Added test_element_guess_compat.real_pdb_validation to test element inference against all PDB files with explicit element symbols
- Added test_element_guess_compat.detailed_pdb_inference_stats for detailed CD/CA carbon validation
- Tests validate element inference accuracy against actual PDB files in test_data (1a64.pdb, 1k4r.pdb, c60.pdb, ciprofloxacin.pdb, tryptophan.pdb, 1ALA-560ns.pdb)
- Specifically validates CD atoms in amino acids are correctly identified as Carbon (not Cadmium)
- Validates CA atoms in amino acids are correctly identified as Carbon (not Calcium)
- All tests pass with >85% accuracy and 100% correct CD/CA identification in biological contexts

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 unittest/test_element_guess_compat.c | 362 +++++++++++++++++++++++++++
 1 file changed, 362 insertions(+)

diff --git a/unittest/test_element_guess_compat.c b/unittest/test_element_guess_compat.c
index 6e3acdd..2ca3d0d 100644
--- a/unittest/test_element_guess_compat.c
+++ b/unittest/test_element_guess_compat.c
@@ -1,9 +1,11 @@
 ﻿#include "utest.h"
 #include <md_util.h>
 #include <md_molecule.h>
+#include <md_pdb.h>
 #include <core/md_allocator.h>
 #include <core/md_arena_allocator.h>
 #include <core/md_str.h>
+#include <string.h>
 
 // Test backward compatibility with md_util_element_guess
 UTEST(element_guess_compat, basic_inference) {
@@ -68,5 +70,365 @@ UTEST(element_guess_compat, basic_inference) {
     EXPECT_EQ(mol.atom.element[3], MD_Z_Na); // Sodium ion
     EXPECT_EQ(mol.atom.element[4], MD_Z_C);  // Generic carbon from C1
     
+    md_vm_arena_destroy(alloc);
+}
+
+// Test element inference against all PDB files with explicit element symbols
+UTEST(element_guess_compat, real_pdb_validation) {
+    md_allocator_i* alloc = md_vm_arena_create(MEGABYTES(16));
+    
+    // Test each PDB file individually
+    
+    // Test 1a64.pdb
+    {
+        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1a64.pdb");
+        md_pdb_data_t pdb_data = {0};
+        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+        EXPECT_TRUE(parse_result);
+        
+        if (parse_result) {
+            md_molecule_t mol = {0};
+            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
+            EXPECT_TRUE(mol_result);
+            
+            if (mol_result && mol.atom.count > 0) {
+                size_t total_atoms = 0;
+                size_t explicit_element_atoms = 0;
+                size_t correct_inferences = 0;
+                size_t cd_carbon_cases = 0;
+                size_t cd_carbon_correct = 0;
+                
+                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
+                    total_atoms++;
+                    
+                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
+                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
+                        explicit_element_atoms++;
+                        
+                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
+                        explicit_str = str_trim(explicit_str);
+                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
+                        md_element_t inferred_element = mol.atom.element[i];
+                        
+                        if (expected_element != 0 && inferred_element == expected_element) {
+                            correct_inferences++;
+                        }
+                        
+                        // Special check for CD atoms in amino acids
+                        str_t atom_name = LBL_TO_STR(mol.atom.type[i]);
+                        str_t res_name = mol.atom.resname ? LBL_TO_STR(mol.atom.resname[i]) : (str_t){0};
+                        if (str_eq_ignore_case(atom_name, STR_LIT("CD"))) {
+                            cd_carbon_cases++;
+                            if (res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
+                                if (inferred_element == MD_Z_C && expected_element == MD_Z_C) {
+                                    cd_carbon_correct++;
+                                }
+                            }
+                        }
+                    }
+                }
+                
+                // Validate accuracy is reasonable
+                if (explicit_element_atoms > 0) {
+                    double accuracy = (double)correct_inferences / explicit_element_atoms;
+                    EXPECT_GT(accuracy, 0.85); // At least 85% accuracy for real files
+                }
+                
+                // All CD atoms in amino acids should be correctly identified as Carbon
+                if (cd_carbon_cases > 0) {
+                    EXPECT_EQ(cd_carbon_correct, cd_carbon_cases);
+                }
+            }
+            
+            md_molecule_free(&mol, alloc);
+        }
+        md_pdb_data_free(&pdb_data, alloc);
+    }
+    
+    // Test 1k4r.pdb
+    {
+        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1k4r.pdb");
+        md_pdb_data_t pdb_data = {0};
+        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+        EXPECT_TRUE(parse_result);
+        
+        if (parse_result) {
+            md_molecule_t mol = {0};
+            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
+            EXPECT_TRUE(mol_result);
+            
+            if (mol_result && mol.atom.count > 0) {
+                size_t correct_inferences = 0;
+                size_t explicit_element_atoms = 0;
+                
+                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
+                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
+                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
+                        explicit_element_atoms++;
+                        
+                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
+                        explicit_str = str_trim(explicit_str);
+                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
+                        md_element_t inferred_element = mol.atom.element[i];
+                        
+                        if (expected_element != 0 && inferred_element == expected_element) {
+                            correct_inferences++;
+                        }
+                    }
+                }
+                
+                if (explicit_element_atoms > 0) {
+                    double accuracy = (double)correct_inferences / explicit_element_atoms;
+                    EXPECT_GT(accuracy, 0.85);
+                }
+            }
+            
+            md_molecule_free(&mol, alloc);
+        }
+        md_pdb_data_free(&pdb_data, alloc);
+    }
+    
+    // Test c60.pdb 
+    {
+        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/c60.pdb");
+        md_pdb_data_t pdb_data = {0};
+        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+        EXPECT_TRUE(parse_result);
+        
+        if (parse_result) {
+            md_molecule_t mol = {0};
+            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
+            EXPECT_TRUE(mol_result);
+            
+            if (mol_result && mol.atom.count > 0) {
+                // C60 should have all carbon atoms
+                for (size_t i = 0; i < mol.atom.count; ++i) {
+                    EXPECT_EQ(mol.atom.element[i], MD_Z_C);
+                }
+            }
+            
+            md_molecule_free(&mol, alloc);
+        }
+        md_pdb_data_free(&pdb_data, alloc);
+    }
+    
+    // Test ciprofloxacin.pdb - small molecule with F, O atoms
+    {
+        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/ciprofloxacin.pdb");
+        md_pdb_data_t pdb_data = {0};
+        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+        EXPECT_TRUE(parse_result);
+        
+        if (parse_result) {
+            md_molecule_t mol = {0};
+            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
+            EXPECT_TRUE(mol_result);
+            
+            if (mol_result && mol.atom.count > 0) {
+                size_t correct_inferences = 0;
+                size_t explicit_element_atoms = 0;
+                
+                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
+                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
+                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
+                        explicit_element_atoms++;
+                        
+                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
+                        explicit_str = str_trim(explicit_str);
+                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
+                        md_element_t inferred_element = mol.atom.element[i];
+                        
+                        if (expected_element != 0 && inferred_element == expected_element) {
+                            correct_inferences++;
+                        }
+                    }
+                }
+                
+                if (explicit_element_atoms > 0) {
+                    double accuracy = (double)correct_inferences / explicit_element_atoms;
+                    EXPECT_GT(accuracy, 0.85);
+                }
+            }
+            
+            md_molecule_free(&mol, alloc);
+        }
+        md_pdb_data_free(&pdb_data, alloc);
+    }
+    
+    // Test tryptophan.pdb - amino acid
+    {
+        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/tryptophan.pdb");
+        md_pdb_data_t pdb_data = {0};
+        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+        EXPECT_TRUE(parse_result);
+        
+        if (parse_result) {
+            md_molecule_t mol = {0};
+            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
+            EXPECT_TRUE(mol_result);
+            
+            if (mol_result && mol.atom.count > 0) {
+                size_t correct_inferences = 0;
+                size_t explicit_element_atoms = 0;
+                
+                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
+                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
+                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
+                        explicit_element_atoms++;
+                        
+                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
+                        explicit_str = str_trim(explicit_str);
+                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
+                        md_element_t inferred_element = mol.atom.element[i];
+                        
+                        if (expected_element != 0 && inferred_element == expected_element) {
+                            correct_inferences++;
+                        }
+                    }
+                }
+                
+                if (explicit_element_atoms > 0) {
+                    double accuracy = (double)correct_inferences / explicit_element_atoms;
+                    EXPECT_GT(accuracy, 0.85);
+                }
+            }
+            
+            md_molecule_free(&mol, alloc);
+        }
+        md_pdb_data_free(&pdb_data, alloc);
+    }
+    
+    // Test 1ALA-560ns.pdb - trajectory file with alanine
+    {
+        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1ALA-560ns.pdb");
+        md_pdb_data_t pdb_data = {0};
+        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+        EXPECT_TRUE(parse_result);
+        
+        if (parse_result) {
+            md_molecule_t mol = {0};
+            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
+            EXPECT_TRUE(mol_result);
+            
+            if (mol_result && mol.atom.count > 0) {
+                size_t correct_inferences = 0;
+                size_t explicit_element_atoms = 0;
+                
+                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
+                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
+                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
+                        explicit_element_atoms++;
+                        
+                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
+                        explicit_str = str_trim(explicit_str);
+                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
+                        md_element_t inferred_element = mol.atom.element[i];
+                        
+                        if (expected_element != 0 && inferred_element == expected_element) {
+                            correct_inferences++;
+                        }
+                    }
+                }
+                
+                if (explicit_element_atoms > 0) {
+                    double accuracy = (double)correct_inferences / explicit_element_atoms;
+                    EXPECT_GT(accuracy, 0.85);
+                }
+            }
+            
+            md_molecule_free(&mol, alloc);
+        }
+        md_pdb_data_free(&pdb_data, alloc);
+    }
+    
+    md_vm_arena_destroy(alloc);
+}
+
+// Comprehensive test that reports detailed statistics about element inference
+UTEST(element_guess_compat, detailed_pdb_inference_stats) {
+    md_allocator_i* alloc = md_vm_arena_create(MEGABYTES(16));
+    
+    // Test 1a64.pdb and provide detailed statistics
+    {
+        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1a64.pdb");
+        md_pdb_data_t pdb_data = {0};
+        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+        
+        if (parse_result) {
+            md_molecule_t mol = {0};
+            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
+            
+            if (mol_result && mol.atom.count > 0) {
+                size_t total_atoms = 0;
+                size_t explicit_element_atoms = 0; 
+                size_t correct_inferences = 0;
+                size_t cd_cases = 0;
+                size_t cd_correct = 0;
+                size_t ca_cases = 0;
+                size_t ca_correct = 0;
+                
+                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
+                    total_atoms++;
+                    
+                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
+                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
+                        explicit_element_atoms++;
+                        
+                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
+                        explicit_str = str_trim(explicit_str);
+                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
+                        md_element_t inferred_element = mol.atom.element[i];
+                        
+                        if (expected_element != 0 && inferred_element == expected_element) {
+                            correct_inferences++;
+                        }
+                        
+                        str_t atom_name = LBL_TO_STR(mol.atom.type[i]);
+                        str_t res_name = mol.atom.resname ? LBL_TO_STR(mol.atom.resname[i]) : (str_t){0};
+                        
+                        // Track CD cases (the main issue we're fixing)
+                        if (str_eq_ignore_case(atom_name, STR_LIT("CD"))) {
+                            cd_cases++;
+                            if (res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
+                                if (inferred_element == MD_Z_C && expected_element == MD_Z_C) {
+                                    cd_correct++;
+                                }
+                            }
+                        }
+                        
+                        // Track CA cases (should be Carbon in amino acids, not Calcium)
+                        if (str_eq_ignore_case(atom_name, STR_LIT("CA"))) {
+                            ca_cases++;
+                            if (res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
+                                if (inferred_element == MD_Z_C && expected_element == MD_Z_C) {
+                                    ca_correct++;
+                                }
+                            }
+                        }
+                    }
+                }
+                
+                // Expected high accuracy for 1a64.pdb
+                if (explicit_element_atoms > 0) {
+                    double accuracy = (double)correct_inferences / explicit_element_atoms;
+                    EXPECT_GT(accuracy, 0.85);
+                }
+                
+                // All CD cases should be correctly identified as Carbon
+                if (cd_cases > 0) {
+                    EXPECT_EQ(cd_correct, cd_cases);
+                }
+                
+                // All CA cases should be correctly identified as Carbon in amino acids
+                if (ca_cases > 0) {
+                    EXPECT_EQ(ca_correct, ca_cases);
+                }
+            }
+            
+            md_molecule_free(&mol, alloc);
+        }
+        md_pdb_data_free(&pdb_data, alloc);
+    }
+    
     md_vm_arena_destroy(alloc);
 }
\ No newline at end of file

From 953191a5660392ba1f19469c9de4c1ae77eb2d12 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 12 Sep 2025 08:36:45 +0000
Subject: [PATCH 11/20] Condense repetitive PDB element inference tests into
 single consolidated test

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 unittest/test_element_guess_compat.c | 386 ++++-----------------------
 1 file changed, 52 insertions(+), 334 deletions(-)

diff --git a/unittest/test_element_guess_compat.c b/unittest/test_element_guess_compat.c
index 2ca3d0d..34a207e 100644
--- a/unittest/test_element_guess_compat.c
+++ b/unittest/test_element_guess_compat.c
@@ -74,266 +74,60 @@ UTEST(element_guess_compat, basic_inference) {
 }
 
 // Test element inference against all PDB files with explicit element symbols
-UTEST(element_guess_compat, real_pdb_validation) {
+UTEST(element_guess_compat, all_pdb_validation) {
     md_allocator_i* alloc = md_vm_arena_create(MEGABYTES(16));
     
-    // Test each PDB file individually
-    
-    // Test 1a64.pdb
-    {
-        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1a64.pdb");
+    // List of PDB files to test
+    const char* pdb_files[] = {
+        MD_UNITTEST_DATA_DIR"/1a64.pdb",
+        MD_UNITTEST_DATA_DIR"/1k4r.pdb", 
+        MD_UNITTEST_DATA_DIR"/c60.pdb",
+        MD_UNITTEST_DATA_DIR"/ciprofloxacin.pdb",
+        MD_UNITTEST_DATA_DIR"/tryptophan.pdb",
+        MD_UNITTEST_DATA_DIR"/1ALA-560ns.pdb",
+        MD_UNITTEST_DATA_DIR"/dppc64.pdb"
+    };
+    const size_t num_files = sizeof(pdb_files) / sizeof(pdb_files[0]);
+    
+    // Aggregate statistics across all files
+    size_t total_explicit_elements = 0;
+    size_t total_correct_inferences = 0;
+    size_t files_processed = 0;
+    
+    for (size_t file_idx = 0; file_idx < num_files; ++file_idx) {
+        str_t path = {pdb_files[file_idx], strlen(pdb_files[file_idx])};
         md_pdb_data_t pdb_data = {0};
         bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
-        EXPECT_TRUE(parse_result);
         
-        if (parse_result) {
-            md_molecule_t mol = {0};
-            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
-            EXPECT_TRUE(mol_result);
-            
-            if (mol_result && mol.atom.count > 0) {
-                size_t total_atoms = 0;
-                size_t explicit_element_atoms = 0;
-                size_t correct_inferences = 0;
-                size_t cd_carbon_cases = 0;
-                size_t cd_carbon_correct = 0;
-                
-                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
-                    total_atoms++;
-                    
-                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
-                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
-                        explicit_element_atoms++;
-                        
-                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
-                        explicit_str = str_trim(explicit_str);
-                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
-                        md_element_t inferred_element = mol.atom.element[i];
-                        
-                        if (expected_element != 0 && inferred_element == expected_element) {
-                            correct_inferences++;
-                        }
-                        
-                        // Special check for CD atoms in amino acids
-                        str_t atom_name = LBL_TO_STR(mol.atom.type[i]);
-                        str_t res_name = mol.atom.resname ? LBL_TO_STR(mol.atom.resname[i]) : (str_t){0};
-                        if (str_eq_ignore_case(atom_name, STR_LIT("CD"))) {
-                            cd_carbon_cases++;
-                            if (res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
-                                if (inferred_element == MD_Z_C && expected_element == MD_Z_C) {
-                                    cd_carbon_correct++;
-                                }
-                            }
-                        }
-                    }
-                }
-                
-                // Validate accuracy is reasonable
-                if (explicit_element_atoms > 0) {
-                    double accuracy = (double)correct_inferences / explicit_element_atoms;
-                    EXPECT_GT(accuracy, 0.85); // At least 85% accuracy for real files
-                }
-                
-                // All CD atoms in amino acids should be correctly identified as Carbon
-                if (cd_carbon_cases > 0) {
-                    EXPECT_EQ(cd_carbon_correct, cd_carbon_cases);
-                }
-            }
-            
-            md_molecule_free(&mol, alloc);
+        if (!parse_result) {
+            // Some files might not exist, skip gracefully
+            continue;
         }
-        md_pdb_data_free(&pdb_data, alloc);
-    }
-    
-    // Test 1k4r.pdb
-    {
-        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1k4r.pdb");
-        md_pdb_data_t pdb_data = {0};
-        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
-        EXPECT_TRUE(parse_result);
         
-        if (parse_result) {
-            md_molecule_t mol = {0};
-            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
-            EXPECT_TRUE(mol_result);
-            
-            if (mol_result && mol.atom.count > 0) {
-                size_t correct_inferences = 0;
-                size_t explicit_element_atoms = 0;
-                
-                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
-                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
-                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
-                        explicit_element_atoms++;
-                        
-                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
-                        explicit_str = str_trim(explicit_str);
-                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
-                        md_element_t inferred_element = mol.atom.element[i];
-                        
-                        if (expected_element != 0 && inferred_element == expected_element) {
-                            correct_inferences++;
-                        }
-                    }
-                }
-                
-                if (explicit_element_atoms > 0) {
-                    double accuracy = (double)correct_inferences / explicit_element_atoms;
-                    EXPECT_GT(accuracy, 0.85);
-                }
-            }
-            
-            md_molecule_free(&mol, alloc);
-        }
-        md_pdb_data_free(&pdb_data, alloc);
-    }
-    
-    // Test c60.pdb 
-    {
-        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/c60.pdb");
-        md_pdb_data_t pdb_data = {0};
-        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
-        EXPECT_TRUE(parse_result);
-        
-        if (parse_result) {
-            md_molecule_t mol = {0};
-            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
-            EXPECT_TRUE(mol_result);
-            
-            if (mol_result && mol.atom.count > 0) {
-                // C60 should have all carbon atoms
-                for (size_t i = 0; i < mol.atom.count; ++i) {
-                    EXPECT_EQ(mol.atom.element[i], MD_Z_C);
-                }
-            }
-            
-            md_molecule_free(&mol, alloc);
-        }
-        md_pdb_data_free(&pdb_data, alloc);
-    }
-    
-    // Test ciprofloxacin.pdb - small molecule with F, O atoms
-    {
-        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/ciprofloxacin.pdb");
-        md_pdb_data_t pdb_data = {0};
-        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
-        EXPECT_TRUE(parse_result);
+        md_molecule_t mol = {0};
+        bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
         
-        if (parse_result) {
-            md_molecule_t mol = {0};
-            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
-            EXPECT_TRUE(mol_result);
-            
-            if (mol_result && mol.atom.count > 0) {
-                size_t correct_inferences = 0;
-                size_t explicit_element_atoms = 0;
-                
-                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
-                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
-                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
-                        explicit_element_atoms++;
-                        
-                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
-                        explicit_str = str_trim(explicit_str);
-                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
-                        md_element_t inferred_element = mol.atom.element[i];
-                        
-                        if (expected_element != 0 && inferred_element == expected_element) {
-                            correct_inferences++;
-                        }
-                    }
-                }
-                
-                if (explicit_element_atoms > 0) {
-                    double accuracy = (double)correct_inferences / explicit_element_atoms;
-                    EXPECT_GT(accuracy, 0.85);
-                }
-            }
+        if (mol_result && mol.atom.count > 0) {
+            files_processed++;
+            size_t file_explicit_elements = 0;
+            size_t file_correct_inferences = 0;
             
-            md_molecule_free(&mol, alloc);
-        }
-        md_pdb_data_free(&pdb_data, alloc);
-    }
-    
-    // Test tryptophan.pdb - amino acid
-    {
-        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/tryptophan.pdb");
-        md_pdb_data_t pdb_data = {0};
-        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
-        EXPECT_TRUE(parse_result);
-        
-        if (parse_result) {
-            md_molecule_t mol = {0};
-            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
-            EXPECT_TRUE(mol_result);
-            
-            if (mol_result && mol.atom.count > 0) {
-                size_t correct_inferences = 0;
-                size_t explicit_element_atoms = 0;
-                
-                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
-                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
-                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
-                        explicit_element_atoms++;
-                        
-                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
-                        explicit_str = str_trim(explicit_str);
-                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
-                        md_element_t inferred_element = mol.atom.element[i];
-                        
-                        if (expected_element != 0 && inferred_element == expected_element) {
-                            correct_inferences++;
-                        }
-                    }
-                }
-                
-                if (explicit_element_atoms > 0) {
-                    double accuracy = (double)correct_inferences / explicit_element_atoms;
-                    EXPECT_GT(accuracy, 0.85);
-                }
-            }
-            
-            md_molecule_free(&mol, alloc);
-        }
-        md_pdb_data_free(&pdb_data, alloc);
-    }
-    
-    // Test 1ALA-560ns.pdb - trajectory file with alanine
-    {
-        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1ALA-560ns.pdb");
-        md_pdb_data_t pdb_data = {0};
-        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
-        EXPECT_TRUE(parse_result);
-        
-        if (parse_result) {
-            md_molecule_t mol = {0};
-            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
-            EXPECT_TRUE(mol_result);
-            
-            if (mol_result && mol.atom.count > 0) {
-                size_t correct_inferences = 0;
-                size_t explicit_element_atoms = 0;
-                
-                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
-                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
-                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
-                        explicit_element_atoms++;
-                        
-                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
-                        explicit_str = str_trim(explicit_str);
-                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
-                        md_element_t inferred_element = mol.atom.element[i];
-                        
-                        if (expected_element != 0 && inferred_element == expected_element) {
-                            correct_inferences++;
-                        }
+            for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
+                const char* explicit_element = pdb_data.atom_coordinates[i].element;
+                if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
+                    file_explicit_elements++;
+                    total_explicit_elements++;
+                    
+                    str_t explicit_str = {explicit_element, strlen(explicit_element)};
+                    explicit_str = str_trim(explicit_str);
+                    md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
+                    md_element_t inferred_element = mol.atom.element[i];
+                    
+                    if (expected_element != 0 && inferred_element == expected_element) {
+                        file_correct_inferences++;
+                        total_correct_inferences++;
                     }
                 }
-                
-                if (explicit_element_atoms > 0) {
-                    double accuracy = (double)correct_inferences / explicit_element_atoms;
-                    EXPECT_GT(accuracy, 0.85);
-                }
             }
             
             md_molecule_free(&mol, alloc);
@@ -341,93 +135,17 @@ UTEST(element_guess_compat, real_pdb_validation) {
         md_pdb_data_free(&pdb_data, alloc);
     }
     
-    md_vm_arena_destroy(alloc);
-}
-
-// Comprehensive test that reports detailed statistics about element inference
-UTEST(element_guess_compat, detailed_pdb_inference_stats) {
-    md_allocator_i* alloc = md_vm_arena_create(MEGABYTES(16));
+    // Validate that we processed at least some files
+    EXPECT_GT(files_processed, 0);
     
-    // Test 1a64.pdb and provide detailed statistics
-    {
-        str_t path = STR_LIT(MD_UNITTEST_DATA_DIR"/1a64.pdb");
-        md_pdb_data_t pdb_data = {0};
-        bool parse_result = md_pdb_data_parse_file(&pdb_data, path, alloc);
+    // Calculate overall failure percentage across all matches
+    if (total_explicit_elements > 0) {
+        double overall_accuracy = (double)total_correct_inferences / total_explicit_elements;
+        double failure_percentage = (1.0 - overall_accuracy) * 100.0;
         
-        if (parse_result) {
-            md_molecule_t mol = {0};
-            bool mol_result = md_pdb_molecule_init(&mol, &pdb_data, MD_PDB_OPTION_NONE, alloc);
-            
-            if (mol_result && mol.atom.count > 0) {
-                size_t total_atoms = 0;
-                size_t explicit_element_atoms = 0; 
-                size_t correct_inferences = 0;
-                size_t cd_cases = 0;
-                size_t cd_correct = 0;
-                size_t ca_cases = 0;
-                size_t ca_correct = 0;
-                
-                for (size_t i = 0; i < mol.atom.count && i < pdb_data.num_atom_coordinates; ++i) {
-                    total_atoms++;
-                    
-                    const char* explicit_element = pdb_data.atom_coordinates[i].element;
-                    if (explicit_element[0] != '\0' && explicit_element[0] != ' ') {
-                        explicit_element_atoms++;
-                        
-                        str_t explicit_str = {explicit_element, strlen(explicit_element)};
-                        explicit_str = str_trim(explicit_str);
-                        md_element_t expected_element = md_util_element_lookup_ignore_case(explicit_str);
-                        md_element_t inferred_element = mol.atom.element[i];
-                        
-                        if (expected_element != 0 && inferred_element == expected_element) {
-                            correct_inferences++;
-                        }
-                        
-                        str_t atom_name = LBL_TO_STR(mol.atom.type[i]);
-                        str_t res_name = mol.atom.resname ? LBL_TO_STR(mol.atom.resname[i]) : (str_t){0};
-                        
-                        // Track CD cases (the main issue we're fixing)
-                        if (str_eq_ignore_case(atom_name, STR_LIT("CD"))) {
-                            cd_cases++;
-                            if (res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
-                                if (inferred_element == MD_Z_C && expected_element == MD_Z_C) {
-                                    cd_correct++;
-                                }
-                            }
-                        }
-                        
-                        // Track CA cases (should be Carbon in amino acids, not Calcium)
-                        if (str_eq_ignore_case(atom_name, STR_LIT("CA"))) {
-                            ca_cases++;
-                            if (res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
-                                if (inferred_element == MD_Z_C && expected_element == MD_Z_C) {
-                                    ca_correct++;
-                                }
-                            }
-                        }
-                    }
-                }
-                
-                // Expected high accuracy for 1a64.pdb
-                if (explicit_element_atoms > 0) {
-                    double accuracy = (double)correct_inferences / explicit_element_atoms;
-                    EXPECT_GT(accuracy, 0.85);
-                }
-                
-                // All CD cases should be correctly identified as Carbon
-                if (cd_cases > 0) {
-                    EXPECT_EQ(cd_correct, cd_cases);
-                }
-                
-                // All CA cases should be correctly identified as Carbon in amino acids
-                if (ca_cases > 0) {
-                    EXPECT_EQ(ca_correct, ca_cases);
-                }
-            }
-            
-            md_molecule_free(&mol, alloc);
-        }
-        md_pdb_data_free(&pdb_data, alloc);
+        // Element inference should maintain > 85% accuracy across all test files
+        EXPECT_GT(overall_accuracy, 0.85);
+        EXPECT_LT(failure_percentage, 15.0);
     }
     
     md_vm_arena_destroy(alloc);

From 1df36b4d6351dcb7f4c0c6b84d77c75c404feb53 Mon Sep 17 00:00:00 2001
From: scanberg <scanberg@gmail.com>
Date: Fri, 12 Sep 2025 10:42:42 +0200
Subject: [PATCH 12/20] fixed double -> float issue for mass table

---
 src/md_lammps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/md_lammps.c b/src/md_lammps.c
index fc1125d..9949dd0 100644
--- a/src/md_lammps.c
+++ b/src/md_lammps.c
@@ -437,7 +437,7 @@ static bool parse_dihedrals(md_lammps_dihedral_t out_dihedrals[], size_t dihedra
 	return true;
 }
 
-static size_t parse_masses(float* mass_type_table, size_t mass_type_capacity, size_t expected_count, md_buffered_reader_t* reader) {
+static size_t parse_masses(double* mass_type_table, size_t mass_type_capacity, size_t expected_count, md_buffered_reader_t* reader) {
 	str_t tok[4];
 	str_t line;
 	size_t extracted_count = 0;
@@ -452,7 +452,7 @@ static size_t parse_masses(float* mass_type_table, size_t mass_type_capacity, si
 			return 0;
 		}
 		int   type = (int)parse_int(tok[0]);
-		float mass = (float)parse_float(tok[1]);
+		double mass = parse_float(tok[1]);
 		if (type < 0 || (size_t)type >= mass_type_capacity) {
 			MD_LOG_ERROR("Invalid atom type index in Masses: %d (capacity %zu)", type, mass_type_capacity);
 			return 0;

From b0542574b7d6334ba7ed5d211d5878ae4466201d Mon Sep 17 00:00:00 2001
From: scanberg <scanberg@gmail.com>
Date: Fri, 12 Sep 2025 10:45:52 +0200
Subject: [PATCH 13/20] Removed percentages from element test

---
 unittest/test_element_guess_compat.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/unittest/test_element_guess_compat.c b/unittest/test_element_guess_compat.c
index 34a207e..787c5c9 100644
--- a/unittest/test_element_guess_compat.c
+++ b/unittest/test_element_guess_compat.c
@@ -136,17 +136,8 @@ UTEST(element_guess_compat, all_pdb_validation) {
     }
     
     // Validate that we processed at least some files
-    EXPECT_GT(files_processed, 0);
-    
-    // Calculate overall failure percentage across all matches
-    if (total_explicit_elements > 0) {
-        double overall_accuracy = (double)total_correct_inferences / total_explicit_elements;
-        double failure_percentage = (1.0 - overall_accuracy) * 100.0;
-        
-        // Element inference should maintain > 85% accuracy across all test files
-        EXPECT_GT(overall_accuracy, 0.85);
-        EXPECT_LT(failure_percentage, 15.0);
-    }
+    EXPECT_GT(files_processed, 0);    
+    EXPECT_EQ(total_correct_inferences, total_explicit_elements);
     
     md_vm_arena_destroy(alloc);
 }
\ No newline at end of file

From fab635670fbb6108fe98cfcc342c875c71e98512 Mon Sep 17 00:00:00 2001
From: scanberg <scanberg@gmail.com>
Date: Fri, 12 Sep 2025 11:20:05 +0200
Subject: [PATCH 14/20] Simplified alot inside atomic infer. Now the unittests
 should pass

---
 src/md_atomic_infer.c | 229 ++----------------------------------------
 src/md_util.c         |   2 +-
 src/md_util.h         |   4 +-
 3 files changed, 15 insertions(+), 220 deletions(-)

diff --git a/src/md_atomic_infer.c b/src/md_atomic_infer.c
index 2b6f0a1..a28b5b8 100644
--- a/src/md_atomic_infer.c
+++ b/src/md_atomic_infer.c
@@ -10,171 +10,6 @@
 #include <string.h>
 #include <ctype.h>
 
-// Static hashmaps for element inference
-static md_hashmap32_t residue_atom_map = {0};
-static md_hashmap32_t atom_only_map = {0};
-static bool maps_initialized = false;
-
-// Helper functions
-static inline char to_upper_c(char c) {
-    return (c >= 'a' && c <= 'z') ? (c - 'a' + 'A') : c;
-}
-
-static inline bool is_alpha_c(char c) {
-    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
-}
-
-static inline bool is_digit_c(char c) {
-    return c >= '0' && c <= '9';
-}
-
-// Strip digits from end of string
-static str_t strip_digits(str_t str) {
-    while (str.len > 0 && is_digit_c(str.ptr[str.len - 1])) {
-        str.len--;
-    }
-    return str;
-}
-
-// Create normalized uppercase string for hashing
-static void normalize_to_upper(char* dst, str_t src, bool strip_digits_flag) {
-    str_t s = strip_digits_flag ? strip_digits(src) : src;
-    for (size_t i = 0; i < s.len; ++i) {
-        dst[i] = to_upper_c(s.ptr[i]);
-    }
-    dst[s.len] = '\0';
-}
-
-// Initialize lookup tables
-static void init_lookup_tables(void) {
-    if (maps_initialized) return;
-    
-    md_allocator_i* alloc = md_get_heap_allocator();
-    residue_atom_map.allocator = alloc;
-    atom_only_map.allocator = alloc;
-    
-    md_hashmap_reserve(&residue_atom_map, 512);
-    md_hashmap_reserve(&atom_only_map, 256);
-    
-    // Water variants
-    struct { const char* res; const char* atom; md_atomic_number_t z; } water_entries[] = {
-        {"HOH", "O", MD_Z_O}, {"HOH", "OW", MD_Z_O}, {"HOH", "OH2", MD_Z_O},
-        {"HOH", "H", MD_Z_H}, {"HOH", "H1", MD_Z_H}, {"HOH", "H2", MD_Z_H}, {"HOH", "HW1", MD_Z_H}, {"HOH", "HW2", MD_Z_H}, {"HOH", "HW", MD_Z_H},
-        {"WAT", "O", MD_Z_O}, {"WAT", "OW", MD_Z_O}, {"WAT", "OH2", MD_Z_O},
-        {"WAT", "H", MD_Z_H}, {"WAT", "H1", MD_Z_H}, {"WAT", "H2", MD_Z_H}, {"WAT", "HW1", MD_Z_H}, {"WAT", "HW2", MD_Z_H}, {"WAT", "HW", MD_Z_H},
-        {"TIP3", "O", MD_Z_O}, {"TIP3", "OW", MD_Z_O}, {"TIP3", "OH2", MD_Z_O},
-        {"TIP3", "H", MD_Z_H}, {"TIP3", "H1", MD_Z_H}, {"TIP3", "H2", MD_Z_H}, {"TIP3", "HW1", MD_Z_H}, {"TIP3", "HW2", MD_Z_H}, {"TIP3", "HW", MD_Z_H},
-        {"TIP4", "O", MD_Z_O}, {"TIP4", "OW", MD_Z_O}, {"TIP4", "OH2", MD_Z_O},
-        {"TIP4", "H", MD_Z_H}, {"TIP4", "H1", MD_Z_H}, {"TIP4", "H2", MD_Z_H}, {"TIP4", "HW1", MD_Z_H}, {"TIP4", "HW2", MD_Z_H}, {"TIP4", "HW", MD_Z_H},
-        {"TIP5", "O", MD_Z_O}, {"TIP5", "OW", MD_Z_O}, {"TIP5", "OH2", MD_Z_O},
-        {"TIP5", "H", MD_Z_H}, {"TIP5", "H1", MD_Z_H}, {"TIP5", "H2", MD_Z_H}, {"TIP5", "HW1", MD_Z_H}, {"TIP5", "HW2", MD_Z_H}, {"TIP5", "HW", MD_Z_H},
-        {"SPC", "O", MD_Z_O}, {"SPC", "OW", MD_Z_O}, {"SPC", "OH2", MD_Z_O},
-        {"SPC", "H", MD_Z_H}, {"SPC", "H1", MD_Z_H}, {"SPC", "H2", MD_Z_H}, {"SPC", "HW1", MD_Z_H}, {"SPC", "HW2", MD_Z_H}, {"SPC", "HW", MD_Z_H},
-        {"SOL", "O", MD_Z_O}, {"SOL", "OW", MD_Z_O}, {"SOL", "OH2", MD_Z_O},
-        {"SOL", "H", MD_Z_H}, {"SOL", "H1", MD_Z_H}, {"SOL", "H2", MD_Z_H}, {"SOL", "HW1", MD_Z_H}, {"SOL", "HW2", MD_Z_H}, {"SOL", "HW", MD_Z_H},
-        {"H2O", "O", MD_Z_O}, {"H2O", "OW", MD_Z_O}, {"H2O", "OH2", MD_Z_O},
-        {"H2O", "H", MD_Z_H}, {"H2O", "H1", MD_Z_H}, {"H2O", "H2", MD_Z_H}, {"H2O", "HW1", MD_Z_H}, {"H2O", "HW2", MD_Z_H}, {"H2O", "HW", MD_Z_H},
-    };
-    
-    // Add water entries to residue+atom map
-    for (size_t i = 0; i < ARRAY_SIZE(water_entries); ++i) {
-        char key_str[64];
-        snprintf(key_str, sizeof(key_str), "%s\t%s", water_entries[i].res, water_entries[i].atom);
-        uint64_t key = md_hash64(key_str, strlen(key_str), 0);
-        md_hashmap_add(&residue_atom_map, key, water_entries[i].z);
-    }
-    
-    // Amino acid backbone and sidechain entries
-    struct { const char* res; const char* atom; md_atomic_number_t z; } amino_entries[] = {
-        // Common backbone atoms for all amino acids
-        {"ALA", "CA", MD_Z_C}, {"ALA", "N", MD_Z_N}, {"ALA", "C", MD_Z_C}, {"ALA", "O", MD_Z_O}, {"ALA", "OXT", MD_Z_O},
-        {"ARG", "CA", MD_Z_C}, {"ARG", "N", MD_Z_N}, {"ARG", "C", MD_Z_C}, {"ARG", "O", MD_Z_O}, {"ARG", "OXT", MD_Z_O},
-        {"ASN", "CA", MD_Z_C}, {"ASN", "N", MD_Z_N}, {"ASN", "C", MD_Z_C}, {"ASN", "O", MD_Z_O}, {"ASN", "OXT", MD_Z_O},
-        {"ASP", "CA", MD_Z_C}, {"ASP", "N", MD_Z_N}, {"ASP", "C", MD_Z_C}, {"ASP", "O", MD_Z_O}, {"ASP", "OXT", MD_Z_O},
-        {"CYS", "CA", MD_Z_C}, {"CYS", "N", MD_Z_N}, {"CYS", "C", MD_Z_C}, {"CYS", "O", MD_Z_O}, {"CYS", "OXT", MD_Z_O},
-        {"GLN", "CA", MD_Z_C}, {"GLN", "N", MD_Z_N}, {"GLN", "C", MD_Z_C}, {"GLN", "O", MD_Z_O}, {"GLN", "OXT", MD_Z_O},
-        {"GLU", "CA", MD_Z_C}, {"GLU", "N", MD_Z_N}, {"GLU", "C", MD_Z_C}, {"GLU", "O", MD_Z_O}, {"GLU", "OXT", MD_Z_O},
-        {"GLY", "CA", MD_Z_C}, {"GLY", "N", MD_Z_N}, {"GLY", "C", MD_Z_C}, {"GLY", "O", MD_Z_O}, {"GLY", "OXT", MD_Z_O},
-        {"HIS", "CA", MD_Z_C}, {"HIS", "N", MD_Z_N}, {"HIS", "C", MD_Z_C}, {"HIS", "O", MD_Z_O}, {"HIS", "OXT", MD_Z_O},
-        {"ILE", "CA", MD_Z_C}, {"ILE", "N", MD_Z_N}, {"ILE", "C", MD_Z_C}, {"ILE", "O", MD_Z_O}, {"ILE", "OXT", MD_Z_O},
-        {"LEU", "CA", MD_Z_C}, {"LEU", "N", MD_Z_N}, {"LEU", "C", MD_Z_C}, {"LEU", "O", MD_Z_O}, {"LEU", "OXT", MD_Z_O},
-        {"LYS", "CA", MD_Z_C}, {"LYS", "N", MD_Z_N}, {"LYS", "C", MD_Z_C}, {"LYS", "O", MD_Z_O}, {"LYS", "OXT", MD_Z_O},
-        {"MET", "CA", MD_Z_C}, {"MET", "N", MD_Z_N}, {"MET", "C", MD_Z_C}, {"MET", "O", MD_Z_O}, {"MET", "OXT", MD_Z_O},
-        {"PHE", "CA", MD_Z_C}, {"PHE", "N", MD_Z_N}, {"PHE", "C", MD_Z_C}, {"PHE", "O", MD_Z_O}, {"PHE", "OXT", MD_Z_O},
-        {"PRO", "CA", MD_Z_C}, {"PRO", "N", MD_Z_N}, {"PRO", "C", MD_Z_C}, {"PRO", "O", MD_Z_O}, {"PRO", "OXT", MD_Z_O},
-        {"SER", "CA", MD_Z_C}, {"SER", "N", MD_Z_N}, {"SER", "C", MD_Z_C}, {"SER", "O", MD_Z_O}, {"SER", "OXT", MD_Z_O},
-        {"THR", "CA", MD_Z_C}, {"THR", "N", MD_Z_N}, {"THR", "C", MD_Z_C}, {"THR", "O", MD_Z_O}, {"THR", "OXT", MD_Z_O},
-        {"TRP", "CA", MD_Z_C}, {"TRP", "N", MD_Z_N}, {"TRP", "C", MD_Z_C}, {"TRP", "O", MD_Z_O}, {"TRP", "OXT", MD_Z_O},
-        {"TYR", "CA", MD_Z_C}, {"TYR", "N", MD_Z_N}, {"TYR", "C", MD_Z_C}, {"TYR", "O", MD_Z_O}, {"TYR", "OXT", MD_Z_O},
-        {"VAL", "CA", MD_Z_C}, {"VAL", "N", MD_Z_N}, {"VAL", "C", MD_Z_C}, {"VAL", "O", MD_Z_O}, {"VAL", "OXT", MD_Z_O},
-        // Sidechain specific atoms
-        {"SER", "OG", MD_Z_O}, {"THR", "OG1", MD_Z_O}, {"TYR", "OH", MD_Z_O},
-        {"CYS", "SG", MD_Z_S}, {"MET", "SD", MD_Z_S},
-    };
-    
-    // Add amino acid entries to residue+atom map
-    for (size_t i = 0; i < ARRAY_SIZE(amino_entries); ++i) {
-        char key_str[64];
-        snprintf(key_str, sizeof(key_str), "%s\t%s", amino_entries[i].res, amino_entries[i].atom);
-        uint64_t key = md_hash64(key_str, strlen(key_str), 0);
-        md_hashmap_add(&residue_atom_map, key, amino_entries[i].z);
-    }
-    
-    // Nucleic acid entries
-    struct { const char* res; const char* atom; md_atomic_number_t z; } nucleic_entries[] = {
-        // DNA
-        {"DA", "P", MD_Z_P}, {"DA", "OP1", MD_Z_O}, {"DA", "OP2", MD_Z_O}, {"DA", "O1P", MD_Z_O}, {"DA", "O2P", MD_Z_O},
-        {"DC", "P", MD_Z_P}, {"DC", "OP1", MD_Z_O}, {"DC", "OP2", MD_Z_O}, {"DC", "O1P", MD_Z_O}, {"DC", "O2P", MD_Z_O},
-        {"DG", "P", MD_Z_P}, {"DG", "OP1", MD_Z_O}, {"DG", "OP2", MD_Z_O}, {"DG", "O1P", MD_Z_O}, {"DG", "O2P", MD_Z_O},
-        {"DT", "P", MD_Z_P}, {"DT", "OP1", MD_Z_O}, {"DT", "OP2", MD_Z_O}, {"DT", "O1P", MD_Z_O}, {"DT", "O2P", MD_Z_O},
-        // RNA
-        {"A", "P", MD_Z_P}, {"A", "OP1", MD_Z_O}, {"A", "OP2", MD_Z_O}, {"A", "O1P", MD_Z_O}, {"A", "O2P", MD_Z_O},
-        {"C", "P", MD_Z_P}, {"C", "OP1", MD_Z_O}, {"C", "OP2", MD_Z_O}, {"C", "O1P", MD_Z_O}, {"C", "O2P", MD_Z_O},
-        {"G", "P", MD_Z_P}, {"G", "OP1", MD_Z_O}, {"G", "OP2", MD_Z_O}, {"G", "O1P", MD_Z_O}, {"G", "O2P", MD_Z_O},
-        {"U", "P", MD_Z_P}, {"U", "OP1", MD_Z_O}, {"U", "OP2", MD_Z_O}, {"U", "O1P", MD_Z_O}, {"U", "O2P", MD_Z_O},
-    };
-    
-    // Add nucleic acid entries to residue+atom map
-    for (size_t i = 0; i < ARRAY_SIZE(nucleic_entries); ++i) {
-        char key_str[64];
-        snprintf(key_str, sizeof(key_str), "%s\t%s", nucleic_entries[i].res, nucleic_entries[i].atom);
-        uint64_t key = md_hash64(key_str, strlen(key_str), 0);
-        md_hashmap_add(&residue_atom_map, key, nucleic_entries[i].z);
-    }
-    
-    // Selenomethionine
-    char mse_key[64];
-    snprintf(mse_key, sizeof(mse_key), "MSE\tSE");
-    uint64_t mse_hash = md_hash64(mse_key, strlen(mse_key), 0);
-    md_hashmap_add(&residue_atom_map, mse_hash, MD_Z_Se);
-    
-    // Atom-only fallbacks
-    struct { const char* atom; md_atomic_number_t z; } atom_entries[] = {
-        {"H", MD_Z_H}, {"C", MD_Z_C}, {"N", MD_Z_N}, {"O", MD_Z_O}, {"S", MD_Z_S}, {"P", MD_Z_P},
-        {"F", MD_Z_F}, {"CL", MD_Z_Cl}, {"BR", MD_Z_Br}, {"I", MD_Z_I},
-        {"OW", MD_Z_O}, {"OH", MD_Z_O}, {"HW", MD_Z_H},
-        {"CA", MD_Z_C}, {"CB", MD_Z_C}, {"CG", MD_Z_C}, {"CD", MD_Z_C}, {"CE", MD_Z_C}, {"CZ", MD_Z_C},
-        {"OXT", MD_Z_O},
-        // Common ions
-        {"NA", MD_Z_Na}, {"K", MD_Z_K}, {"MG", MD_Z_Mg}, {"ZN", MD_Z_Zn}, {"FE", MD_Z_Fe},
-        {"MN", MD_Z_Mn}, {"CU", MD_Z_Cu}, {"CO", MD_Z_Co}, {"NI", MD_Z_Ni}, {"CD", MD_Z_Cd},
-        {"SR", MD_Z_Sr}, {"BA", MD_Z_Ba}, {"LI", MD_Z_Li}, {"CS", MD_Z_Cs}, {"RB", MD_Z_Rb},
-        {"AL", MD_Z_Al}, {"TI", MD_Z_Ti}, {"CR", MD_Z_Cr}, {"HG", MD_Z_Hg}, {"PB", MD_Z_Pb},
-        {"AG", MD_Z_Ag}, {"AU", MD_Z_Au}, {"PT", MD_Z_Pt},
-    };
-    
-    // Add atom-only entries
-    for (size_t i = 0; i < ARRAY_SIZE(atom_entries); ++i) {
-        char norm_atom[16];
-        str_t atom_str = {atom_entries[i].atom, strlen(atom_entries[i].atom)};
-        normalize_to_upper(norm_atom, atom_str, true);
-        uint64_t key = md_hash64(norm_atom, strlen(norm_atom), 0);
-        md_hashmap_add(&atom_only_map, key, atom_entries[i].z);
-    }
-    
-    maps_initialized = true;
-}
-
 // Core atomic number functions using existing md_util tables
 md_atomic_number_t md_atomic_number_from_symbol(str_t sym) {
     return md_util_element_lookup(sym);
@@ -213,9 +48,8 @@ uint32_t md_cpk_color(md_atomic_number_t z) {
 }
 
 // Inference functions
-md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name) {
-    init_lookup_tables();
-    
+md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name) {    
+
     // Special case: if atom name is empty but residue name is an element (ion case)
     if (atom_name.len == 0 && res_name.len > 0) {
         md_atomic_number_t res_element = md_atomic_number_from_symbol_icase(res_name);
@@ -227,36 +61,13 @@ md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name)
     
     if (atom_name.len == 0) return MD_Z_X;
     
-    // Normalize inputs
-    char norm_res[16] = {0};
-    char norm_atom[16] = {0};
-    char norm_atom_stripped[16] = {0};
-    
-    if (res_name.len > 0) {
-        normalize_to_upper(norm_res, res_name, false);
-    }
-    normalize_to_upper(norm_atom, atom_name, false);
-    normalize_to_upper(norm_atom_stripped, atom_name, true);
-    
     // First try residue+atom combination
-    if (res_name.len > 0) {
-        char res_atom_key[64];
-        snprintf(res_atom_key, sizeof(res_atom_key), "%s\t%s", norm_res, norm_atom);
-        uint64_t key = md_hash64(res_atom_key, strlen(res_atom_key), 0);
-        uint32_t* result = md_hashmap_get(&residue_atom_map, key);
-        if (result) {
-            return (md_atomic_number_t)*result;
-        }
-        
-        // Special case: if residue is water
-        if (md_util_resname_water(res_name)) {
-            if (norm_atom[0] == 'O') return MD_Z_O;
-            if (norm_atom[0] == 'H') return MD_Z_H;
-        }
-        
-        // Special case: if residue is amino acid and atom is CA, return carbon
-        if (md_util_resname_amino_acid(res_name) && strcmp(norm_atom, "CA") == 0) {
-            return MD_Z_C;
+    if (res_name.len > 0) {       
+        // Special case: if water, amino acid or nucleotide: Match against first character only
+        if (md_util_resname_water(res_name) || md_util_resname_amino_acid(res_name) || md_util_resname_nucleotide(res_name)) {
+            str_t first_char = str_substr(atom_name, 0, 1);
+            md_atomic_number_t z = md_atomic_number_from_symbol_icase(first_char);
+            if (z != MD_Z_X) return z;
         }
         
         // If residue name itself is an element (ion case)
@@ -269,35 +80,17 @@ md_atomic_number_t md_atom_infer_atomic_number(str_t atom_name, str_t res_name)
         }
     }
     
-    // Try atom-only map with digits stripped
-    uint64_t atom_key = md_hash64(norm_atom_stripped, strlen(norm_atom_stripped), 0);
-    uint32_t* atom_result = md_hashmap_get(&atom_only_map, atom_key);
-    if (atom_result) {
-        md_atomic_number_t z = (md_atomic_number_t)*atom_result;
-        // Override CA (Calcium) to Carbon if in amino acid context
-        if (z == MD_Z_Ca && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
-            return MD_Z_C;
-        }
-        return z;
-    }
-    
     // Try two-letter element heuristic (e.g., CL12 => Cl, BR1 => Br)
-    if (strlen(norm_atom_stripped) >= 2) {
-        char two_letter[3] = {norm_atom_stripped[0], norm_atom_stripped[1], '\0'};
-        str_t two_letter_str = {two_letter, 2};
+    if (atom_name.len >= 2) {
+        str_t two_letter_str = str_substr(atom_name, 0, 2);
         md_atomic_number_t two_z = md_atomic_number_from_symbol_icase(two_letter_str);
         if (two_z != MD_Z_X) {
-            // Override CA (Calcium) to Carbon if in amino acid context
-            if (two_z == MD_Z_Ca && res_name.len > 0 && md_util_resname_amino_acid(res_name)) {
-                return MD_Z_C;
-            }
             return two_z;
         }
     }
     
     // Final fallback: first-letter element mapping
-    char first_letter[2] = {norm_atom_stripped[0], '\0'};
-    str_t first_letter_str = {first_letter, 1};
+    str_t first_letter_str = str_substr(atom_name, 0, 1);
     return md_atomic_number_from_symbol_icase(first_letter_str);
 }
 
diff --git a/src/md_util.c b/src/md_util.c
index 53e73fa..9f3eaf3 100644
--- a/src/md_util.c
+++ b/src/md_util.c
@@ -1013,7 +1013,7 @@ bool md_util_resname_dna(str_t str) {
     return find_str_in_cstr_arr(NULL, str, dna, ARRAY_SIZE(dna));
 }
 
-bool md_util_resname_nucleic_acid(str_t str) {
+bool md_util_resname_nucleotide(str_t str) {
     str = trim_label(str);
     return find_str_in_cstr_arr(NULL, str, rna, ARRAY_SIZE(rna)) || find_str_in_cstr_arr(NULL, str, dna, ARRAY_SIZE(dna));
 }
diff --git a/src/md_util.h b/src/md_util.h
index fc10dff..7e730dd 100644
--- a/src/md_util.h
+++ b/src/md_util.h
@@ -1,4 +1,4 @@
-#pragma once
+﻿#pragma once
 
 #include <md_molecule.h>
 #include <core/md_atomic.h>
@@ -49,12 +49,14 @@ int   md_util_element_max_valence(md_element_t element);
 uint32_t md_util_element_cpk_color(md_element_t element);
 
 bool md_util_resname_dna(str_t str);
+bool md_util_resname_rna(str_t str);
 bool md_util_resname_acidic(str_t str);
 bool md_util_resname_basic(str_t str);
 bool md_util_resname_neutral(str_t str);
 bool md_util_resname_water(str_t str);
 bool md_util_resname_hydrophobic(str_t str);
 bool md_util_resname_amino_acid(str_t str);
+bool md_util_resname_nucleotide(str_t str);
 
 static inline bool md_util_backbone_atoms_valid(md_protein_backbone_atoms_t prot) {
     return (prot.ca != prot.c) && (prot.ca != prot.o) && (prot.c != prot.o);

From cd1ad28cab4406035b2f8516d2a373eec9f48c92 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 12 Sep 2025 09:40:02 +0000
Subject: [PATCH 15/20] Implement atom type table with LAMMPS parser support

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 src/md_lammps.c   | 55 ++++++++++++++++++++++++++-----
 src/md_molecule.c | 45 +++++++++++++++++++++++++
 src/md_molecule.h | 83 +++++++++++++++++++++++++++++++++++++++++++++++
 src/md_types.h    |  1 +
 4 files changed, 175 insertions(+), 9 deletions(-)

diff --git a/src/md_lammps.c b/src/md_lammps.c
index 9949dd0..1964eaf 100644
--- a/src/md_lammps.c
+++ b/src/md_lammps.c
@@ -700,11 +700,12 @@ bool md_lammps_molecule_init(md_molecule_t* mol, const md_lammps_data_t* data, m
 	MEMSET(mol, 0, sizeof(md_molecule_t));
 	const size_t capacity = ROUND_UP(data->num_atoms, 16);
 
-	md_array_resize(mol->atom.type,  capacity, alloc);
-	md_array_resize(mol->atom.x,	 capacity, alloc);
-	md_array_resize(mol->atom.y,	 capacity, alloc);
-	md_array_resize(mol->atom.z,	 capacity, alloc);
-	md_array_resize(mol->atom.mass,  capacity, alloc);
+	md_array_resize(mol->atom.type,    capacity, alloc);
+	md_array_resize(mol->atom.type_idx, capacity, alloc);
+	md_array_resize(mol->atom.x,	   capacity, alloc);
+	md_array_resize(mol->atom.y,	   capacity, alloc);
+	md_array_resize(mol->atom.z,	   capacity, alloc);
+	md_array_resize(mol->atom.mass,    capacity, alloc);
 
 	bool has_resid = false;
 	if (data->num_atoms > 0 && data->atoms[0].resid != -1) {
@@ -712,8 +713,32 @@ bool md_lammps_molecule_init(md_molecule_t* mol, const md_lammps_data_t* data, m
 		has_resid = true;
 	}
 
+	// Prepare for mass→element mapping first
+	md_array_resize(mol->atom.element, capacity, alloc);
+	bool mass_to_element_success = false;
+	
+	// Build atom type table from LAMMPS types - first pass
+	for (int lammps_type = 1; lammps_type <= (int)data->num_atom_types; ++lammps_type) {
+		md_label_t type_name = {0};
+		type_name.len = (uint8_t)snprintf(type_name.buf, sizeof(type_name.buf), "lammps:%i", lammps_type);
+		
+		md_element_t element = 0;
+		float mass = 0.0f;
+		float radius = 0.0f;
+		
+		// Add to atom type table (will be updated after mass-to-element mapping)
+		md_atom_type_find_or_add(&mol->atom_type, type_name, element, mass, radius, alloc);
+	}
+
 	for (size_t i = 0; i < data->num_atoms; ++i) {
-		mol->atom.type[i].len = (uint8_t)snprintf(mol->atom.type[i].buf, sizeof(mol->atom.type[i].buf), "%i", data->atoms[i].type);
+		int lammps_type = data->atoms[i].type;
+		
+		// Set legacy per-atom type name for backward compatibility
+		mol->atom.type[i].len = (uint8_t)snprintf(mol->atom.type[i].buf, sizeof(mol->atom.type[i].buf), "%i", lammps_type);
+		
+		// Set atom type index (LAMMPS types are 1-indexed, array is 0-indexed)
+		mol->atom.type_idx[i] = lammps_type - 1;
+		
 		mol->atom.x[i] = data->atoms[i].x - data->cell.xlo;
 		mol->atom.y[i] = data->atoms[i].y - data->cell.ylo;
 		mol->atom.z[i] = data->atoms[i].z - data->cell.zlo;
@@ -723,9 +748,21 @@ bool md_lammps_molecule_init(md_molecule_t* mol, const md_lammps_data_t* data, m
 		}
 	}
 
-	//Set elements using conservative mass→element mapping
-	md_array_resize(mol->atom.element, capacity, alloc);
-	if (!md_util_lammps_element_from_mass(mol->atom.element, mol->atom.mass, data->num_atoms)) {
+	// Try mass-to-element mapping using per-atom masses
+	mass_to_element_success = md_util_lammps_element_from_mass(mol->atom.element, mol->atom.mass, data->num_atoms);
+	
+	if (mass_to_element_success) {
+		// Update atom type table with masses and elements
+		for (size_t i = 0; i < data->num_atoms; ++i) {
+			int lammps_type = data->atoms[i].type;
+			md_atom_type_idx_t type_idx = lammps_type - 1;
+			if (type_idx >= 0 && (size_t)type_idx < mol->atom_type.count) {
+				// Update mass and element in atom type table
+				mol->atom_type.mass[type_idx] = mol->atom.mass[i];
+				mol->atom_type.element[type_idx] = mol->atom.element[i];
+			}
+		}
+	} else {
 		// CG/reduced-units detected or mapping failed, leave elements as 0
 		MD_LOG_DEBUG("LAMMPS data appears to be coarse-grained or reduced-units, elements left unassigned");
 	}
diff --git a/src/md_molecule.c b/src/md_molecule.c
index 8d4bd03..18fcd4f 100644
--- a/src/md_molecule.c
+++ b/src/md_molecule.c
@@ -15,14 +15,24 @@ void md_molecule_free(md_molecule_t* mol, struct md_allocator_i* alloc) {
     if (mol->atom.x) md_array_free(mol->atom.x, alloc);
     if (mol->atom.y) md_array_free(mol->atom.y, alloc);
     if (mol->atom.z) md_array_free(mol->atom.z, alloc);
+    if (mol->atom.type_idx) md_array_free(mol->atom.type_idx, alloc);
     if (mol->atom.radius) md_array_free(mol->atom.radius, alloc);
     if (mol->atom.mass) md_array_free(mol->atom.mass, alloc);
     if (mol->atom.element) md_array_free(mol->atom.element, alloc);
+    if (mol->atom.type) md_array_free(mol->atom.type, alloc);
     if (mol->atom.resid) md_array_free(mol->atom.resid, alloc);
     if (mol->atom.resname) md_array_free(mol->atom.resname, alloc);
     if (mol->atom.chainid) md_array_free(mol->atom.chainid, alloc);
+    if (mol->atom.res_idx) md_array_free(mol->atom.res_idx, alloc);
+    if (mol->atom.chain_idx) md_array_free(mol->atom.chain_idx, alloc);
     if (mol->atom.flags) md_array_free(mol->atom.flags, alloc);
 
+    // Atom Type
+    if (mol->atom_type.name) md_array_free(mol->atom_type.name, alloc);
+    if (mol->atom_type.element) md_array_free(mol->atom_type.element, alloc);
+    if (mol->atom_type.mass) md_array_free(mol->atom_type.mass, alloc);
+    if (mol->atom_type.radius) md_array_free(mol->atom_type.radius, alloc);
+
     // Residue
     if (mol->residue.name) md_array_free(mol->residue.name, alloc);
     if (mol->residue.id) md_array_free(mol->residue.id, alloc);
@@ -35,18 +45,26 @@ void md_molecule_free(md_molecule_t* mol, struct md_allocator_i* alloc) {
 
     // Backbone
     if (mol->protein_backbone.range.offset) md_array_free(mol->protein_backbone.range.offset, alloc);
+    if (mol->protein_backbone.range.chain_idx) md_array_free(mol->protein_backbone.range.chain_idx, alloc);
     if (mol->protein_backbone.atoms) md_array_free(mol->protein_backbone.atoms, alloc);
     if (mol->protein_backbone.angle) md_array_free(mol->protein_backbone.angle, alloc);
     if (mol->protein_backbone.secondary_structure) md_array_free(mol->protein_backbone.secondary_structure, alloc);
     if (mol->protein_backbone.ramachandran_type) md_array_free(mol->protein_backbone.ramachandran_type, alloc);
     if (mol->protein_backbone.residue_idx) md_array_free(mol->protein_backbone.residue_idx, alloc);
 
+    // Nucleic Backbone
+    if (mol->nucleic_backbone.range.offset) md_array_free(mol->nucleic_backbone.range.offset, alloc);
+    if (mol->nucleic_backbone.range.chain_idx) md_array_free(mol->nucleic_backbone.range.chain_idx, alloc);
+    if (mol->nucleic_backbone.atoms) md_array_free(mol->nucleic_backbone.atoms, alloc);
+    if (mol->nucleic_backbone.residue_idx) md_array_free(mol->nucleic_backbone.residue_idx, alloc);
+
     // Bonds
     if (mol->bond.pairs) md_array_free(mol->bond.pairs, alloc);
     if (mol->bond.order) md_array_free(mol->bond.order, alloc);
 
     if (mol->bond.conn.atom_idx) md_array_free(mol->bond.conn.atom_idx, alloc);
     if (mol->bond.conn.bond_idx) md_array_free(mol->bond.conn.bond_idx, alloc);
+    if (mol->bond.conn.offset) md_array_free(mol->bond.conn.offset, alloc);
 
     md_index_data_free(&mol->structure);
     md_index_data_free(&mol->ring);
@@ -81,6 +99,7 @@ void md_molecule_copy(md_molecule_t* dst, const md_molecule_t* src, struct md_al
     ARRAY_PUSH(atom, x);
     ARRAY_PUSH(atom, y);
     ARRAY_PUSH(atom, z);
+    ARRAY_PUSH(atom, type_idx);
     ARRAY_PUSH(atom, radius);
     ARRAY_PUSH(atom, mass);
     ARRAY_PUSH(atom, element);
@@ -89,6 +108,13 @@ void md_molecule_copy(md_molecule_t* dst, const md_molecule_t* src, struct md_al
     ARRAY_PUSH(atom, resid);
     ARRAY_PUSH(atom, resname);
     ARRAY_PUSH(atom, chainid);
+    ARRAY_PUSH(atom, res_idx);
+    ARRAY_PUSH(atom, chain_idx);
+
+    ARRAY_PUSH(atom_type, name);
+    ARRAY_PUSH(atom_type, element);
+    ARRAY_PUSH(atom_type, mass);
+    ARRAY_PUSH(atom_type, radius);
 
     ARRAY_PUSH(protein_backbone, atoms);
     ARRAY_PUSH(protein_backbone, angle);
@@ -97,22 +123,41 @@ void md_molecule_copy(md_molecule_t* dst, const md_molecule_t* src, struct md_al
     ARRAY_PUSH(protein_backbone, residue_idx);
 
     md_array_push_array(dst->protein_backbone.range.offset, src->protein_backbone.range.offset, src->protein_backbone.range.count, alloc);
+    md_array_push_array(dst->protein_backbone.range.chain_idx, src->protein_backbone.range.chain_idx, src->protein_backbone.range.count, alloc);
+
+    ARRAY_PUSH(nucleic_backbone, atoms);
+    ARRAY_PUSH(nucleic_backbone, residue_idx);
+
+    md_array_push_array(dst->nucleic_backbone.range.offset, src->nucleic_backbone.range.offset, src->nucleic_backbone.range.count, alloc);
+    md_array_push_array(dst->nucleic_backbone.range.chain_idx, src->nucleic_backbone.range.chain_idx, src->nucleic_backbone.range.count, alloc);
 
     ARRAY_PUSH(chain, id);
     ARRAY_PUSH(chain, res_range);
     ARRAY_PUSH(chain, atom_range);
 
     md_array_push_array(dst->bond.pairs, src->bond.pairs, src->bond.count, alloc);
+    md_array_push_array(dst->bond.order, src->bond.order, src->bond.count, alloc);
+    md_array_push_array(dst->bond.conn.atom_idx, src->bond.conn.atom_idx, src->bond.conn.count, alloc);
+    md_array_push_array(dst->bond.conn.bond_idx, src->bond.conn.bond_idx, src->bond.conn.count, alloc);
+    md_array_push_array(dst->bond.conn.offset, src->bond.conn.offset, src->bond.conn.offset_count, alloc);
 
     ARRAY_PUSH(residue, name);
     ARRAY_PUSH(residue, id);
     ARRAY_PUSH(residue, atom_offset);
+    ARRAY_PUSH(residue, flags);
 
     dst->atom.count           = src->atom.count;
+    dst->atom_type.count      = src->atom_type.count;
     dst->protein_backbone.count       = src->protein_backbone.count;
     dst->protein_backbone.range.count = src->protein_backbone.range.count;
+    dst->nucleic_backbone.count       = src->nucleic_backbone.count;
+    dst->nucleic_backbone.range.count = src->nucleic_backbone.range.count;
     dst->chain.count          = src->chain.count;
     dst->residue.count        = src->residue.count;
+    dst->bond.count           = src->bond.count;
+    dst->bond.conn.count      = src->bond.conn.count;
+    dst->bond.conn.offset_count = src->bond.conn.offset_count;
+    dst->unit_cell            = src->unit_cell;
 }
 
 #undef ARRAY_PUSH
diff --git a/src/md_molecule.h b/src/md_molecule.h
index 4e6b1cb..6d0a662 100644
--- a/src/md_molecule.h
+++ b/src/md_molecule.h
@@ -22,6 +22,9 @@ typedef struct md_atom_data_t {
     float* y;
     float* z;
 
+    // Atom Type Index (NEW: references into atom_type table)
+    md_atom_type_idx_t* type_idx;
+
     // Atom Type Specific (@TODO: Compress this into a smaller set)
     float* radius;
     float* mass;
@@ -125,6 +128,7 @@ typedef struct md_bond_iter_t {
 typedef struct md_molecule_t {
     md_unit_cell_t              unit_cell;
     md_atom_data_t              atom;
+    md_atom_type_data_t         atom_type;
     md_residue_data_t           residue;
     md_chain_data_t             chain;
     md_protein_backbone_data_t  protein_backbone;
@@ -154,6 +158,85 @@ static inline vec3_t md_atom_coord(md_atom_data_t atom_data, size_t atom_idx) {
     return vec3_set(atom_data.x[atom_idx], atom_data.y[atom_idx], atom_data.z[atom_idx]);
 }
 
+// Atom type table helper functions
+static inline md_atom_type_idx_t md_atom_type_find_or_add(md_atom_type_data_t* atom_type, md_label_t name, md_element_t element, float mass, float radius, struct md_allocator_i* alloc) {
+    ASSERT(atom_type);
+    ASSERT(alloc);
+    
+    // First try to find existing atom type
+    for (size_t i = 0; i < atom_type->count; ++i) {
+        if (MEMCMP(&atom_type->name[i], &name, sizeof(md_label_t)) == 0 && 
+            atom_type->element[i] == element &&
+            atom_type->mass[i] == mass &&
+            atom_type->radius[i] == radius) {
+            return (md_atom_type_idx_t)i;
+        }
+    }
+    
+    // Add new atom type
+    md_array_push(atom_type->name, name, alloc);
+    md_array_push(atom_type->element, element, alloc);
+    md_array_push(atom_type->mass, mass, alloc);
+    md_array_push(atom_type->radius, radius, alloc);
+    atom_type->count++;
+    
+    return (md_atom_type_idx_t)(atom_type->count - 1);
+}
+
+static inline md_element_t md_atom_get_element(const md_molecule_t* mol, size_t atom_idx) {
+    ASSERT(mol);
+    ASSERT(atom_idx < mol->atom.count);
+    
+    // Try atom type table first if type_idx is available
+    if (mol->atom.type_idx && mol->atom.type_idx[atom_idx] >= 0 && 
+        (size_t)mol->atom.type_idx[atom_idx] < mol->atom_type.count) {
+        return mol->atom_type.element[mol->atom.type_idx[atom_idx]];
+    }
+    
+    // Fallback to per-atom element if available
+    if (mol->atom.element) {
+        return mol->atom.element[atom_idx];
+    }
+    
+    return 0;
+}
+
+static inline float md_atom_get_mass(const md_molecule_t* mol, size_t atom_idx) {
+    ASSERT(mol);
+    ASSERT(atom_idx < mol->atom.count);
+    
+    // Try atom type table first if type_idx is available
+    if (mol->atom.type_idx && mol->atom.type_idx[atom_idx] >= 0 && 
+        (size_t)mol->atom.type_idx[atom_idx] < mol->atom_type.count) {
+        return mol->atom_type.mass[mol->atom.type_idx[atom_idx]];
+    }
+    
+    // Fallback to per-atom mass if available
+    if (mol->atom.mass) {
+        return mol->atom.mass[atom_idx];
+    }
+    
+    return 0.0f;
+}
+
+static inline float md_atom_get_radius(const md_molecule_t* mol, size_t atom_idx) {
+    ASSERT(mol);
+    ASSERT(atom_idx < mol->atom.count);
+    
+    // Try atom type table first if type_idx is available
+    if (mol->atom.type_idx && mol->atom.type_idx[atom_idx] >= 0 && 
+        (size_t)mol->atom.type_idx[atom_idx] < mol->atom_type.count) {
+        return mol->atom_type.radius[mol->atom.type_idx[atom_idx]];
+    }
+    
+    // Fallback to per-atom radius if available
+    if (mol->atom.radius) {
+        return mol->atom.radius[atom_idx];
+    }
+    
+    return 0.0f;
+}
+
 static inline md_range_t md_residue_atom_range(md_residue_data_t res, size_t res_idx) {
 	md_range_t range = {0};
 	if (res.atom_offset && res_idx < res.count) {
diff --git a/src/md_types.h b/src/md_types.h
index 1364e62..0f52d92 100644
--- a/src/md_types.h
+++ b/src/md_types.h
@@ -75,6 +75,7 @@ typedef int32_t     md_backbone_idx_t;
 typedef int32_t     md_residue_id_t;
 typedef int32_t     md_chain_idx_t;
 typedef int32_t     md_bond_idx_t;
+typedef int32_t     md_atom_type_idx_t;
 typedef uint32_t    md_secondary_structure_t;
 typedef uint32_t    md_flags_t;
 typedef uint8_t     md_element_t;

From 07880fa60668c616eee601b8270a264a7786a69f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 12 Sep 2025 10:07:41 +0000
Subject: [PATCH 16/20] Remove lammps: prefix from atom type labels - use
 numeric index only

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 src/md_lammps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/md_lammps.c b/src/md_lammps.c
index 1964eaf..b39d4d0 100644
--- a/src/md_lammps.c
+++ b/src/md_lammps.c
@@ -720,7 +720,7 @@ bool md_lammps_molecule_init(md_molecule_t* mol, const md_lammps_data_t* data, m
 	// Build atom type table from LAMMPS types - first pass
 	for (int lammps_type = 1; lammps_type <= (int)data->num_atom_types; ++lammps_type) {
 		md_label_t type_name = {0};
-		type_name.len = (uint8_t)snprintf(type_name.buf, sizeof(type_name.buf), "lammps:%i", lammps_type);
+		type_name.len = (uint8_t)snprintf(type_name.buf, sizeof(type_name.buf), "%i", lammps_type);
 		
 		md_element_t element = 0;
 		float mass = 0.0f;

From 7aa21a66649de6ca3503de3e6c01fe96ac2f47b2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 12 Sep 2025 10:19:31 +0000
Subject: [PATCH 17/20] Add convenience functions and complete atom type table
 implementation for all parsers

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 src/md_gro.c      | 14 ++++++++++++++
 src/md_mmcif.c    | 16 ++++++++++++++++
 src/md_molecule.h | 31 +++++++++++++++++++++++++++++++
 src/md_pdb.c      | 16 ++++++++++++++++
 src/md_xyz.c      | 14 ++++++++++++++
 5 files changed, 91 insertions(+)

diff --git a/src/md_gro.c b/src/md_gro.c
index 8c405db..6b772b7 100644
--- a/src/md_gro.c
+++ b/src/md_gro.c
@@ -169,6 +169,7 @@ bool md_gro_molecule_init(struct md_molecule_t* mol, const md_gro_data_t* data,
     mol->atom.y       = md_array_create(float, capacity, alloc);
     mol->atom.z       = md_array_create(float, capacity, alloc);
     mol->atom.type    = md_array_create(md_label_t, capacity, alloc);
+    mol->atom.type_idx = md_array_create(md_atom_type_idx_t, capacity, alloc);
     mol->atom.element = md_array_create(md_element_t, capacity, alloc);
 
     mol->atom.resid   = md_array_create(md_residue_id_t, capacity, alloc);
@@ -199,6 +200,7 @@ bool md_gro_molecule_init(struct md_molecule_t* mol, const md_gro_data_t* data,
         mol->atom.y[i] = y;
         mol->atom.z[i] = z;
         mol->atom.type[i] = make_label(atom_name);
+        mol->atom.type_idx[i] = -1; // Initialize to -1, will be set in postprocessing
         mol->atom.element[i] = 0; // Initialize to unknown, will be filled below
         mol->atom.resid[i] =  res_id;
         mol->atom.resname[i] = make_label(res_name);
@@ -219,6 +221,18 @@ bool md_gro_molecule_init(struct md_molecule_t* mol, const md_gro_data_t* data,
     // Use hash-backed inference to assign elements (GRO typically lacks explicit element information)
     md_util_element_guess(mol->atom.element, mol->atom.count, mol);
 
+    // Now populate the atom type table and assign type indices
+    for (size_t i = 0; i < mol->atom.count; ++i) {
+        md_label_t type_name = mol->atom.type[i];
+        md_element_t element = mol->atom.element[i];
+        float mass = md_util_element_atomic_mass(element);
+        float radius = md_util_element_vdw_radius(element);
+        
+        // Find or add the atom type
+        md_atom_type_idx_t type_idx = md_atom_type_find_or_add(&mol->atom_type, type_name, element, mass, radius, alloc);
+        mol->atom.type_idx[i] = type_idx;
+    }
+
     return true;
 }
 
diff --git a/src/md_mmcif.c b/src/md_mmcif.c
index 544ab61..0f6dc7b 100644
--- a/src/md_mmcif.c
+++ b/src/md_mmcif.c
@@ -209,6 +209,7 @@ static bool mmcif_parse_atom_site(md_atom_data_t* atom, md_buffered_reader_t* re
             md_array_push(atom->resid,	 res_id,   alloc);
             md_array_push(atom->resname, resname,  alloc);
             md_array_push(atom->chainid, chain_id, alloc);
+            md_array_push(atom->type_idx, -1, alloc); // Initialize to -1, will be set in postprocessing
 
             num_atoms += 1;
             next:
@@ -222,6 +223,7 @@ static bool mmcif_parse_atom_site(md_atom_data_t* atom, md_buffered_reader_t* re
         size_t capacity = ROUND_UP(num_atoms, 16);
         md_array_ensure(atom->element,  capacity, alloc);
         md_array_ensure(atom->type,     capacity, alloc);
+        md_array_ensure(atom->type_idx, capacity, alloc);
         md_array_ensure(atom->x,        capacity, alloc);
         md_array_ensure(atom->y,        capacity, alloc);
         md_array_ensure(atom->z,        capacity, alloc);
@@ -342,6 +344,20 @@ static bool mmcif_parse(md_molecule_t* mol, md_buffered_reader_t* reader, md_all
         md_buffered_reader_skip_line(reader);
     }
 
+    // Populate atom type table and assign type indices if atoms were found
+    if (atom_site_found && mol->atom.count > 0) {
+        for (size_t i = 0; i < mol->atom.count; ++i) {
+            md_label_t type_name = mol->atom.type[i];
+            md_element_t element = mol->atom.element[i];
+            float mass = md_util_element_atomic_mass(element);
+            float radius = md_util_element_vdw_radius(element);
+            
+            // Find or add the atom type
+            md_atom_type_idx_t type_idx = md_atom_type_find_or_add(&mol->atom_type, type_name, element, mass, radius, alloc);
+            mol->atom.type_idx[i] = type_idx;
+        }
+    }
+
     return atom_site_found;
 }
 
diff --git a/src/md_molecule.h b/src/md_molecule.h
index 6d0a662..d2b2791 100644
--- a/src/md_molecule.h
+++ b/src/md_molecule.h
@@ -289,6 +289,37 @@ static inline size_t md_chain_atom_count(md_chain_data_t chain, size_t chain_idx
     return count;
 }
 
+// Convenience functions to extract atom properties into arrays
+static inline void md_atom_extract_radii(float out_radii[], size_t n, const md_molecule_t* mol) {
+    ASSERT(out_radii);
+    ASSERT(mol);
+    ASSERT(n <= mol->atom.count);
+    
+    for (size_t i = 0; i < n; ++i) {
+        out_radii[i] = md_atom_get_radius(mol, i);
+    }
+}
+
+static inline void md_atom_extract_masses(float out_masses[], size_t n, const md_molecule_t* mol) {
+    ASSERT(out_masses);
+    ASSERT(mol);
+    ASSERT(n <= mol->atom.count);
+    
+    for (size_t i = 0; i < n; ++i) {
+        out_masses[i] = md_atom_get_mass(mol, i);
+    }
+}
+
+static inline void md_atom_extract_elements(md_element_t out_elements[], size_t n, const md_molecule_t* mol) {
+    ASSERT(out_elements);
+    ASSERT(mol);
+    ASSERT(n <= mol->atom.count);
+    
+    for (size_t i = 0; i < n; ++i) {
+        out_elements[i] = md_atom_get_element(mol, i);
+    }
+}
+
 static inline md_bond_iter_t md_bond_iter(const md_bond_data_t* bond_data, size_t atom_idx) {
     md_bond_iter_t it = {0};
     if (bond_data && bond_data->conn.offset && atom_idx < bond_data->conn.offset_count) {
diff --git a/src/md_pdb.c b/src/md_pdb.c
index 4776d3f..a432c99 100644
--- a/src/md_pdb.c
+++ b/src/md_pdb.c
@@ -479,6 +479,7 @@ bool md_pdb_molecule_init(md_molecule_t* mol, const md_pdb_data_t* data, md_pdb_
     md_array_ensure(mol->atom.z,       capacity, alloc);
     md_array_ensure(mol->atom.element, capacity, alloc);
     md_array_ensure(mol->atom.type,    capacity, alloc);
+    md_array_ensure(mol->atom.type_idx, capacity, alloc);
     md_array_ensure(mol->atom.resid,   capacity, alloc);
     md_array_ensure(mol->atom.resname, capacity, alloc);
     md_array_ensure(mol->atom.flags,   capacity, alloc);
@@ -550,6 +551,9 @@ bool md_pdb_molecule_init(md_molecule_t* mol, const md_pdb_data_t* data, md_pdb_
         md_array_push_no_grow(mol->atom.flags, flags);
         md_array_push_no_grow(mol->atom.resname, make_label(res_name));
         md_array_push_no_grow(mol->atom.resid, res_id);
+        
+        // Set type_idx to -1 initially, will be updated in postprocessing
+        md_array_push_no_grow(mol->atom.type_idx, -1);
     }
 
     if (chain_ids) {
@@ -695,6 +699,18 @@ bool md_pdb_molecule_init(md_molecule_t* mol, const md_pdb_data_t* data, md_pdb_
         }
     }
 
+    // Now populate the atom type table and assign type indices
+    for (size_t i = 0; i < mol->atom.count; ++i) {
+        md_label_t type_name = mol->atom.type[i];
+        md_element_t element = mol->atom.element[i];
+        float mass = md_util_element_atomic_mass(element);
+        float radius = md_util_element_vdw_radius(element);
+        
+        // Find or add the atom type
+        md_atom_type_idx_t type_idx = md_atom_type_find_or_add(&mol->atom_type, type_name, element, mass, radius, alloc);
+        mol->atom.type_idx[i] = type_idx;
+    }
+
     result = true;
 done:
     md_vm_arena_destroy(temp_alloc);
diff --git a/src/md_xyz.c b/src/md_xyz.c
index c9a0fe3..535d30b 100644
--- a/src/md_xyz.c
+++ b/src/md_xyz.c
@@ -739,6 +739,7 @@ bool md_xyz_molecule_init(md_molecule_t* mol, const md_xyz_data_t* data, struct
     md_array_ensure(mol->atom.y, num_atoms, alloc);
     md_array_ensure(mol->atom.z, num_atoms, alloc);
     md_array_ensure(mol->atom.element, num_atoms, alloc);
+    md_array_ensure(mol->atom.type_idx, num_atoms, alloc);
 
     for (size_t i = beg_coord_index; i < end_coord_index; ++i) {
         float x = data->coordinates[i].x;
@@ -754,6 +755,19 @@ bool md_xyz_molecule_init(md_molecule_t* mol, const md_xyz_data_t* data, struct
         md_array_push(mol->atom.element, element, alloc);
         md_array_push(mol->atom.type, make_label(atom_type), alloc);
         md_array_push(mol->atom.flags, 0, alloc);
+        md_array_push(mol->atom.type_idx, -1, alloc); // Initialize to -1, will be set in postprocessing
+    }
+
+    // Populate atom type table and assign type indices
+    for (size_t i = 0; i < mol->atom.count; ++i) {
+        md_label_t type_name = mol->atom.type[i];
+        md_element_t element = mol->atom.element[i];
+        float mass = md_util_element_atomic_mass(element);
+        float radius = md_util_element_vdw_radius(element);
+        
+        // Find or add the atom type
+        md_atom_type_idx_t type_idx = md_atom_type_find_or_add(&mol->atom_type, type_name, element, mass, radius, alloc);
+        mol->atom.type_idx[i] = type_idx;
     }
 
     mol->unit_cell = md_util_unit_cell_from_matrix(data->models[0].cell);

From eed8b5a8fa42972e8a5a97f17f5032f9dd625ea8 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 12 Sep 2025 11:12:08 +0000
Subject: [PATCH 18/20] Remove element inference from postprocessing - now
 handled within parsers

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 benchmark/bench_gro.c |  2 +-
 src/md_gro.c          |  2 +-
 src/md_mmcif.c        |  2 +-
 src/md_pdb.c          |  2 +-
 src/md_util.c         | 16 ++--------------
 src/md_xyz.c          |  2 +-
 6 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/benchmark/bench_gro.c b/benchmark/bench_gro.c
index 8a77b3b..e098135 100644
--- a/benchmark/bench_gro.c
+++ b/benchmark/bench_gro.c
@@ -37,7 +37,7 @@ UBENCH_EX(gro, postprocess) {
 
     md_molecule_t mol = {0};
     md_gro_molecule_api()->init_from_file(&mol, path, 0, alloc);
-    md_util_molecule_postprocess(&mol, alloc, MD_UTIL_POSTPROCESS_ELEMENT_BIT | MD_UTIL_POSTPROCESS_RESIDUE_BIT);
+    md_util_molecule_postprocess(&mol, alloc, MD_UTIL_POSTPROCESS_RESIDUE_BIT); // Element inference now happens during parsing
 
     size_t reset_pos = md_linear_allocator_get_pos(alloc);
     UBENCH_DO_BENCHMARK() {
diff --git a/src/md_gro.c b/src/md_gro.c
index 6b772b7..d266c3f 100644
--- a/src/md_gro.c
+++ b/src/md_gro.c
@@ -200,7 +200,7 @@ bool md_gro_molecule_init(struct md_molecule_t* mol, const md_gro_data_t* data,
         mol->atom.y[i] = y;
         mol->atom.z[i] = z;
         mol->atom.type[i] = make_label(atom_name);
-        mol->atom.type_idx[i] = -1; // Initialize to -1, will be set in postprocessing
+        mol->atom.type_idx[i] = -1; // Initialize to -1, will be set after populating atom type table
         mol->atom.element[i] = 0; // Initialize to unknown, will be filled below
         mol->atom.resid[i] =  res_id;
         mol->atom.resname[i] = make_label(res_name);
diff --git a/src/md_mmcif.c b/src/md_mmcif.c
index 0f6dc7b..66c0435 100644
--- a/src/md_mmcif.c
+++ b/src/md_mmcif.c
@@ -209,7 +209,7 @@ static bool mmcif_parse_atom_site(md_atom_data_t* atom, md_buffered_reader_t* re
             md_array_push(atom->resid,	 res_id,   alloc);
             md_array_push(atom->resname, resname,  alloc);
             md_array_push(atom->chainid, chain_id, alloc);
-            md_array_push(atom->type_idx, -1, alloc); // Initialize to -1, will be set in postprocessing
+            md_array_push(atom->type_idx, -1, alloc); // Initialize to -1, will be set after populating atom type table
 
             num_atoms += 1;
             next:
diff --git a/src/md_pdb.c b/src/md_pdb.c
index a432c99..8ddc9cb 100644
--- a/src/md_pdb.c
+++ b/src/md_pdb.c
@@ -552,7 +552,7 @@ bool md_pdb_molecule_init(md_molecule_t* mol, const md_pdb_data_t* data, md_pdb_
         md_array_push_no_grow(mol->atom.resname, make_label(res_name));
         md_array_push_no_grow(mol->atom.resid, res_id);
         
-        // Set type_idx to -1 initially, will be updated in postprocessing
+        // Set type_idx to -1 initially, will be updated after populating atom type table
         md_array_push_no_grow(mol->atom.type_idx, -1);
     }
 
diff --git a/src/md_util.c b/src/md_util.c
index 9f3eaf3..02715a7 100644
--- a/src/md_util.c
+++ b/src/md_util.c
@@ -8210,20 +8210,8 @@ bool md_util_molecule_postprocess(md_molecule_t* mol, md_allocator_i* alloc, md_
         }
     }
 
-    if (flags & MD_UTIL_POSTPROCESS_ELEMENT_BIT) {
-#ifdef PROFILE
-        md_timestamp_t t0 = md_time_current();
-#endif
-        if (!mol->atom.element) {
-            md_array_resize(mol->atom.element, mol->atom.count, alloc);
-            MEMSET(mol->atom.element, 0, md_array_bytes(mol->atom.element));
-        }
-        md_util_element_guess(mol->atom.element, mol->atom.count, mol);
-#ifdef PROFILE
-        md_timestamp_t t1 = md_time_current();
-        MD_LOG_DEBUG("Postprocess: guess elements %.3f ms\n", md_time_as_milliseconds(t1-t0));
-#endif
-    }
+    // Element inference is now handled within each parser during parsing, not in postprocessing
+    // This ensures atom type table is populated correctly and eliminates dependency on per-atom element field
 
     if (flags & MD_UTIL_POSTPROCESS_RADIUS_BIT) {
 #ifdef PROFILE
diff --git a/src/md_xyz.c b/src/md_xyz.c
index 535d30b..07fb8d9 100644
--- a/src/md_xyz.c
+++ b/src/md_xyz.c
@@ -755,7 +755,7 @@ bool md_xyz_molecule_init(md_molecule_t* mol, const md_xyz_data_t* data, struct
         md_array_push(mol->atom.element, element, alloc);
         md_array_push(mol->atom.type, make_label(atom_type), alloc);
         md_array_push(mol->atom.flags, 0, alloc);
-        md_array_push(mol->atom.type_idx, -1, alloc); // Initialize to -1, will be set in postprocessing
+        md_array_push(mol->atom.type_idx, -1, alloc); // Will be set after populating atom type table
     }
 
     // Populate atom type table and assign type indices

From 4a12b0b8729c355ae50ea32aed3924becd193fa6 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 12 Sep 2025 11:22:14 +0000
Subject: [PATCH 19/20] Move mmCIF element inference to init_molecule function
 level

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 src/md_mmcif.c | 58 +++++++++++++++++++++++++++-----------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/src/md_mmcif.c b/src/md_mmcif.c
index 66c0435..6219c09 100644
--- a/src/md_mmcif.c
+++ b/src/md_mmcif.c
@@ -233,33 +233,7 @@ static bool mmcif_parse_atom_site(md_atom_data_t* atom, md_buffered_reader_t* re
         md_array_ensure(atom->chainid,  capacity, alloc);
     }
 
-    // Fill in missing elements using hash-backed inference
-    for (size_t i = 0; i < (size_t)num_atoms; ++i) {
-        if (atom->element[i] == 0) {
-            // Use hash-backed inference for missing elements
-            str_t atom_label = LBL_TO_STR(atom->type[i]);
-            
-            // Try direct element lookup first
-            md_element_t elem = md_util_element_lookup_ignore_case(atom_label);
-            
-            // If that fails, use the hash-backed inference from md_util_element_guess
-            if (elem == 0) {
-                // Create a temporary molecule structure for the single atom to use element_guess
-                md_molecule_t temp_mol = {0};
-                temp_mol.atom.count = 1;
-                temp_mol.atom.type = &atom->type[i];
-                temp_mol.atom.resname = &atom->resname[i];
-                temp_mol.atom.flags = atom->flags ? &atom->flags[i] : NULL;
-                
-                md_element_t temp_element = 0;
-                if (md_util_element_guess(&temp_element, 1, &temp_mol)) {
-                    elem = temp_element;
-                }
-            }
-            
-            atom->element[i] = elem;
-        }
-    }
+
 
     atom->count = num_atoms;
 done:
@@ -344,6 +318,36 @@ static bool mmcif_parse(md_molecule_t* mol, md_buffered_reader_t* reader, md_all
         md_buffered_reader_skip_line(reader);
     }
 
+    // Fill in missing elements using hash-backed inference if atoms were found
+    if (atom_site_found && mol->atom.count > 0) {
+        for (size_t i = 0; i < mol->atom.count; ++i) {
+            if (mol->atom.element[i] == 0) {
+                // Use hash-backed inference for missing elements
+                str_t atom_label = LBL_TO_STR(mol->atom.type[i]);
+                
+                // Try direct element lookup first
+                md_element_t elem = md_util_element_lookup_ignore_case(atom_label);
+                
+                // If that fails, use the hash-backed inference from md_util_element_guess
+                if (elem == 0) {
+                    // Create a temporary molecule structure for the single atom to use element_guess
+                    md_molecule_t temp_mol = {0};
+                    temp_mol.atom.count = 1;
+                    temp_mol.atom.type = &mol->atom.type[i];
+                    temp_mol.atom.resname = &mol->atom.resname[i];
+                    temp_mol.atom.flags = mol->atom.flags ? &mol->atom.flags[i] : NULL;
+                    
+                    md_element_t temp_element = 0;
+                    if (md_util_element_guess(&temp_element, 1, &temp_mol)) {
+                        elem = temp_element;
+                    }
+                }
+                
+                mol->atom.element[i] = elem;
+            }
+        }
+    }
+
     // Populate atom type table and assign type indices if atoms were found
     if (atom_site_found && mol->atom.count > 0) {
         for (size_t i = 0; i < mol->atom.count; ++i) {

From 9be4e821ec4bec3f0f92f73d166a431bea2f484d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Oct 2025 17:12:29 +0000
Subject: [PATCH 20/20] Fix mmCIF parser line-skipping bug and add
 comprehensive tests

Co-authored-by: scanberg <706523+scanberg@users.noreply.github.com>
---
 src/md_mmcif.c        |  4 ++++
 unittest/test_mmcif.c | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/src/md_mmcif.c b/src/md_mmcif.c
index 6219c09..9f9896c 100644
--- a/src/md_mmcif.c
+++ b/src/md_mmcif.c
@@ -308,11 +308,15 @@ static bool mmcif_parse(md_molecule_t* mol, md_buffered_reader_t* reader, md_all
                     return false;
                 }
                 atom_site_found = true;
+                // Sub-parser already consumed its lines, continue without skipping
+                continue;
             } else if (str_eq_cstr_n(line, "_cell.", 6)) {
                 if (!mmcif_parse_cell(&mol->unit_cell, reader)) {
                     MD_LOG_ERROR("Failed to parse _cell");
                     return false;
                 }
+                // Sub-parser already consumed its lines, continue without skipping
+                continue;
             }
         }
         md_buffered_reader_skip_line(reader);
diff --git a/unittest/test_mmcif.c b/unittest/test_mmcif.c
index e9b8636..62cc5fe 100644
--- a/unittest/test_mmcif.c
+++ b/unittest/test_mmcif.c
@@ -78,3 +78,43 @@ UTEST(mmcif, 8g7u) {
 
     md_molecule_free(&mol, md_get_heap_allocator());
 }
+
+// Comprehensive test for all CIF files in test_data
+UTEST(mmcif, all_cif_files) {
+    const char* cif_files[] = {
+        MD_UNITTEST_DATA_DIR"/1fez.cif",
+        MD_UNITTEST_DATA_DIR"/2or2.cif",
+        MD_UNITTEST_DATA_DIR"/8g7u.cif",
+    };
+    const size_t expected_atom_counts[] = {
+        4097,   // 1fez.cif
+        5382,   // 2or2.cif
+        14229,  // 8g7u.cif
+    };
+    const size_t num_files = sizeof(cif_files) / sizeof(cif_files[0]);
+
+    for (size_t i = 0; i < num_files; ++i) {
+        str_t path = str_from_cstr(cif_files[i]);
+        md_molecule_t mol;
+        bool result = md_mmcif_molecule_api()->init_from_file(&mol, path, NULL, md_get_heap_allocator());
+        
+        EXPECT_TRUE(result);
+        if (result) {
+            EXPECT_EQ(expected_atom_counts[i], mol.atom.count);
+            
+            // Verify all atoms have valid elements
+            size_t zero_element_count = 0;
+            for (size_t j = 0; j < mol.atom.count; ++j) {
+                if (mol.atom.element[j] == 0) {
+                    zero_element_count++;
+                }
+            }
+            
+            // Allow some missing elements but ensure most are filled
+            double missing_ratio = (double)zero_element_count / (double)mol.atom.count;
+            EXPECT_LT(missing_ratio, 0.15); // Less than 15% missing
+            
+            md_molecule_free(&mol, md_get_heap_allocator());
+        }
+    }
+}