# Import packages and functions

In [1]:
import sys
# force the notebook to look for files in the upper level directory
sys.path.insert(1, '../')

In [2]:
from glob import glob
from io import StringIO

import pandas as pd
from requests_html import HTML

from data.nist_html_parser import get_ionization_lookup

# Process the HTML files in the raw_nist_html folder
## Initialize an empty output dataframe

In [3]:
# start from a dataframe with no rows and no columns; the parsed lookup tables are accumulated into it later
df_output = pd.DataFrame(data=None)

## Go through every HTML file in the raw_nist_html folder

In [4]:
# Collect one lookup dataframe per HTML file and concatenate them once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a dataframe inside a loop is quadratic.
lookup_frames = []
# sorted() makes the processing order deterministic regardless of the file system
for file_path in sorted(glob("../data/ionization_energy/raw_nist_html/*.html")):
    # open the file and read in as HTML
    with open(file_path) as html_file:
        html = HTML(html=html_file.read())

    # look for table elements that carry the bgcolor attribute
    energy_tables = html.find("table[bgcolor]")
    if not energy_tables:
        # if the HTML file does not contain the energy table, move on to the next file
        continue

    # parse the first matching table as a pandas dataframe
    # (wrapped in StringIO because passing a literal HTML string to read_html is deprecated)
    df_parsed = pd.read_html(StringIO(energy_tables[0].html))[0]
    # convert the dataframe into an ionization energy lookup dataframe and keep it for the final concat
    lookup_frames.append(get_ionization_lookup(df_parsed))

# build the output dataframe in a single step; rebuilding it from scratch keeps this cell idempotent on re-run,
# and the empty-dataframe fallback covers the case where no file yielded an energy table
df_output = pd.concat(lookup_frames, ignore_index=True) if lookup_frames else pd.DataFrame()

Here's a printout of the output dataframe

In [5]:
# order the rows by element symbol, then by oxidation state, and renumber the index from 0
sort_keys = ["element", "v"]
df_output = df_output.sort_values(by=sort_keys, ignore_index=True)
df_output

Unnamed: 0,element,v,iv,iv_p1
0,Ac,1,5.380226,11.75000
1,Ac,2,11.750000,17.43100
2,Ac,3,17.431000,44.80000
3,Ag,1,7.576234,21.48440
4,Ag,2,21.484400,34.80000
...,...,...,...,...
334,Zn,2,17.964390,39.72330
335,Zr,1,6.634126,13.13000
336,Zr,2,13.130000,23.17000
337,Zr,3,23.170000,34.41836


## Lookup table codebook

The following table shows the explanation for each column in the output dataframe

|column name|explanation|
|:----------|:----------|
|element    |the element's symbol|
|v          |the oxidation state|
|iv         |the $v^{\text{th}}$ ionization energy in $eV$|
|iv_p1      |the $(v+1)^{\text{th}}$ ionization energy in $eV$|

# Save the dataframe as an excel spreadsheet

In [6]:
# write the lookup table to an Excel workbook, leaving out the dataframe index column
df_output.to_excel(
    "../data/ionization_energy/ionization_energy.xlsx",
    index=False,
)

# Data citation
The original HTML files were scraped using the [nist_web_scraper](https://github.com/rpw199912j/mit_model_code/tree/master/data/nist_web_scaper.py) from the [NIST Atomic Spectra Database Ionization Energies Form](https://physics.nist.gov/PhysRefData/ASD/ionEnergy.html) website. The formal data citation is as follows.

Kramida, A., Ralchenko, Yu., Reader, J., and NIST ASD Team (2019). NIST Atomic Spectra Database (ver. 5.7.1), [Online]. Available: https://physics.nist.gov/asd [2020, August 2]. National Institute of Standards and Technology, Gaithersburg, MD. DOI: https://doi.org/10.18434/T4W30F