# Extract atomic weights and masses

This notebook extracts atomic weights and masses from files downloaded from the CIAAW website.
Links to the source pages are given in the sections below.

In [None]:
import html.parser

import pandas as pd

## Atomic Weights

Extract the standard atomic weights from an HTML file.

In [None]:
# Minimal HTML table scraper driven by a few boolean state flags.
# Only the table whose id attribute is "mytable" is read.
class WeightsParser(html.parser.HTMLParser):
    """Collect the text found in the <td> cells of the table with id "mytable".

    After feeding HTML, ``rows`` holds one list per ``</tr>`` seen inside the
    target table. Each list contains one stripped string per text chunk found
    inside a ``<td>``. Cells are only recorded inside ``<tbody>``, so rows
    closed elsewhere in the table (e.g. in a header) show up as empty lists.

    Raises:
        NotImplementedError: if a ``<table>`` opens inside the target table.
    """

    def __init__(self):
        super().__init__()
        # Finished rows, and the row currently being filled.
        self.rows = []
        self._row = []
        # Parser position flags.
        self._in_table = False
        self._in_table_body = False
        self._in_td = False

    def handle_starttag(self, tag, attrs):
        """Route an opening tag depending on whether the target table is open."""
        if not self._in_table:
            self._handle_starttag_outside_table(tag, attrs)
            return
        self._handle_starttag_in_table(tag, attrs)

    def _handle_starttag_in_table(self, tag, attrs):
        """Track <tbody>, <tr> and <td> openings inside the target table."""
        if tag == "table":
            raise NotImplementedError("Nested table")
        if tag == "tbody":
            self._in_table_body = True
            return

        # Rows and cells only count once the table body has started.
        if self._in_table_body:
            if tag == "td":
                self._in_td = True
            elif tag == "tr":
                self._start_row()

    def _handle_starttag_outside_table(self, tag, attrs):
        """Enter the target table when its opening tag is encountered."""
        if tag != "table":
            return
        if ('id', 'mytable') in attrs:
            self._in_table = True

    def handle_endtag(self, tag):
        """Closing tags are only of interest while inside the target table."""
        if not self._in_table:
            return
        self._handle_endtag_in_table(tag)

    def _handle_endtag_in_table(self, tag):
        """Undo the state change made by the matching opening tag."""
        if tag == "td":
            self._in_td = False
        elif tag == "tr":
            self._end_row()
        elif tag == "tbody":
            self._in_table_body = False
        elif tag == "table":
            self._in_table = False

    def handle_data(self, data):
        """Record stripped text, but only while a <td> is open."""
        if not self._in_td:
            return
        self._row.append(data.strip())

    def _start_row(self):
        """Begin collecting a fresh row."""
        self._row = []

    def _end_row(self):
        """Store the current row and reset the working buffer."""
        finished, self._row = self._row, []
        self.rows.append(finished)


def parse_weight(s):
    """Split a "value ± uncertainty" string into a pair of floats.

    Args:
        s: Cell text, either of the form "<value>±<uncertainty>" or the
            single em dash placeholder.

    Returns:
        ``(value, uncertainty)`` as floats, or ``(None, None)`` when ``s`` is
        the em dash placeholder.

    Raises:
        ValueError: if ``s`` does not contain exactly one '±' or either part
            is not a valid float.
    """
    placeholder = '—'  # em dash, U+2014 (bytes \xe2\x80\x94 in UTF-8)
    if s != placeholder:
        value, error = s.split('±')
        return tuple(map(float, (value, error)))
    return None, None

Set the correct filename in the cell below.
The HTML file should have been downloaded from the "Abridged Standard Atomic Weights" page at
https://www.ciaaw.org/abridged-atomic-weights.htm

In [None]:
with open("IUPAC_abridged_weights.html") as f:
    raw_html = f.read()

parser = WeightsParser()
parser.feed(raw_html)
atoms = [
    (row[1], int(row[0]), *parse_weight(row[3]))
    for row in parser.rows
    if row
]

In [None]:
# Write the extracted weights as CSV, preceded by a '#' provenance comment.
with open('atomic_weights.csv', 'w', encoding='utf-8') as f:
    # The comment line must end with a newline; without it the column header
    # is appended to the comment and lost to any reader that skips '#' lines.
    f.write("# Numbers extracted using tools/atomic_weights.ipynb from https://www.ciaaw.org/abridged-atomic-weights.htm\n")
    f.write("Element,Z,Atomic Weight [Da],Uncertainty [Da]\n")
    for (symbol, z, weight, error) in atoms:
        # Elements without a value are written as empty CSV fields.
        if weight is None:
            weight = ''
        if error is None:
            error = ''
        f.write(f'{symbol},{z},{weight},{error}\n')

## Atomic Masses

Set the correct filename in the cell below.
The file should have been downloaded from the "Atomic Masses" page at
https://www.ciaaw.org/atomic-masses.htm
That page offers a link to download a CSV file.

In [None]:
# Row 2 (0-based) of the downloaded file supplies the column names.
masses_path = "IUPAC_atomic_masses.csv"
df = pd.read_csv(masses_path, header=2)
# Replace the 'Year/link' column by a 'year' column holding only the digits
# that directly precede '</a>' in each entry (kept as strings).
year_link = df.pop('Year/link')
df['year'] = year_link.str.extract(r'>(\d+)</a>')
df

Select the most recent entry for each isotope:

In [None]:
# For every nuclide keep only the row(s) carrying that nuclide's maximum 'year'.
per_nuclide = df.groupby('nuclide')
latest = per_nuclide.apply(
    lambda group: group[group['year'] == group['year'].max()],
    include_groups=False,
)
# apply() leaves a two-level index; keep only its first element (the nuclide).
latest.index = latest.index.map(lambda key: key[0])
# Several rows sharing the latest year would duplicate a nuclide; fail loudly.
assert latest.index.is_unique
latest

In [None]:
# Write the selected masses as CSV, preceded by a '#' provenance comment.
with open('atomic_masses.csv', 'w') as f:
    # The comment line must end with a newline; without it the header row
    # emitted by to_csv is appended to the comment line.
    f.write("# Numbers extracted using tools/atomic_weights.ipynb from https://www.ciaaw.org/atomic-masses.htm\n")
    # to_csv defaults to os.linesep as line terminator. Because f is a
    # text-mode handle that already translates '\n', request '\n' explicitly
    # to avoid doubled carriage returns on Windows.
    latest.to_csv(f,
                  columns=['mass', 'uncertainty'],
                  index_label="Isotope",
                  header=["Atomic Mass [Da]", "Uncertainty [Da]"],
                  lineterminator="\n")