<a href="https://colab.research.google.com/github/profteachkids/CHE2064/blob/master/OutlierDataModelAntoineEquation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from lxml import html
import requests
import jax
import jax.numpy as jnp
import numpy as np
from scipy.optimize import minimize
from plotly.subplots import make_subplots
import plotly.io as pio
from collections import deque
# Make Plotly's dark theme the default template for figures in this notebook.
pio.templates.default='plotly_dark'

# JAX defaults to 32-bit floats; switch to 64-bit (double precision).
# `jax.config` is reached through the already-imported `jax` package rather than
# `from jax.config import config`, because that submodule import fails on newer
# JAX releases. The `config` name is kept as an alias for any code that uses it.
config = jax.config
config.update("jax_enable_x64", True)

In [2]:
# Provenance: the page the data was originally taken from. Kept for reference;
# the visible code never requests it.
original_url='http://www.ddbst.com/en/EED/PCP/VAP_C174.php'

# Snapshot of that page stored in the course repository; this is what gets downloaded.
url = 'https://raw.githubusercontent.com/profteachkids/CHE2064/master/data/DDBWaterVaporPressure.html'

# Fail fast: the timeout prevents an indefinite hang, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the data table.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content
tree = html.fromstring(content)

In [3]:
# Both columns are read from the rows of the same table; only the <td> index differs.
# Column 1 feeds T_strings and column 2 feeds P_strings (text nodes, still strings here).
TABLE_ROWS_XPATH = '//*[@id="online-ddb-form"]/div[2]/table[3]/tbody/tr'
T_strings, P_strings = (
    tree.xpath(f'{TABLE_ROWS_XPATH}/td[{column}]/text()') for column in (1, 2)
)

In [4]:
# Precision note:
# - Parsing the strings as np.float32 (single precision) loses enough precision that
#   numerical (finite-difference) derivatives fail; JAX automatic derivatives can
#   overcome that.
# - Parsing as np.float64 (double precision) avoids the precision loss, so numerical
#   derivatives work and JAX automatic derivatives are no longer strictly necessary.
# It is still a useful demonstration of how round-off error can be a challenge: in more
# demanding problems automatic derivatives are necessary even with double precision.

# Row 0 holds the T_strings values and row 1 the P_strings values, parsed as float64.
table_strings = np.array([T_strings, P_strings])
data = table_strings.astype(np.float64)

In [5]:
# Split the two rows of `data`: first row -> T, second row -> raw pressure values.
T, Pv_table = data
# The factor 1000 is presumably a kPa -> Pa conversion (the plot axis is labelled Pa);
# verify against the data source. The fit works on log10 of the scaled pressure.
log_Pv = np.log10(Pv_table * 1000)

In [6]:
# Inject synthetic outliers: replace a random subset of log_Pv with noisy values drawn
# from a normal distribution centred on the original value, with a standard deviation
# equal to a fixed fraction of that value.
OUTLIER_SEED = 0          # fixed seed so Restart & Run All reproduces the same outliers
N_OUTLIERS = 10           # number of points to corrupt
OUTLIER_REL_SCALE = 0.05  # noise standard deviation as a fraction of the value

rng = np.random.default_rng(OUTLIER_SEED)
# Never request more distinct indices than there are data points.
bad_idx = rng.choice(log_Pv.size, size=min(N_OUTLIERS, log_Pv.size), replace=False)
# abs(): the scale of a normal distribution must be non-negative, whatever the sign of log_Pv.
# NOTE: log_Pv is modified in place, so re-running this cell alone perturbs the
# already-perturbed points again; re-run the cell that builds log_Pv first.
log_Pv[bad_idx] = rng.normal(loc=log_Pv[bad_idx],
                             scale=OUTLIER_REL_SCALE*np.abs(log_Pv[bad_idx]))

In [7]:
# Scatter plot of the (outlier-contaminated) data: hollow markers with a blue outline.
fig = make_subplots(rows=1,cols=1)
fig.add_scatter(x=T, y=log_Pv, mode='markers', marker_size=8, marker_color='rgba(0,0,0,0)',
                marker_line_color='rgb(0,0,255)', marker_line_width=1, name='Data')
# Raw string: `\ ` is a LaTeX spacing command, not a Python escape sequence.
# The stray character after the closing `$` of the y-axis title has been removed.
fig.update_layout(width=800, height=500,
                  xaxis_title='T (K)', yaxis_title=r'$Log_{10} P\ (Pa)$')
fig.show()

In [8]:
def model(a, b, c, T):
  """Evaluate the three-parameter Antoine form ``a - b/(T + c)``.

  Args:
    a, b, c: the three equation coefficients.
    T: value(s) at which to evaluate; a scalar or an array (evaluated elementwise).

  Returns:
    ``a - b/(T + c)``, with the same shape as ``T`` after broadcasting.
  """
  shifted_T = T + c
  return a - b / shifted_T

In [11]:
# Starting point for the optimizer, one value per model coefficient (a, b, c).
a_guess, b_guess, c_guess = 10.0, 2000.0, 0.0
x0 = [a_guess, b_guess, c_guess]

def sqerror(x, T, log_Pv):
  """Sum of squared residuals between the model and the data.

  Args:
    x: parameter vector; x[0], x[1], x[2] are passed to ``model`` as a, b, c.
    T: values at which the model is evaluated.
    log_Pv: observed values compared against the model output.

  Returns:
    Scalar sum over all points of (model - observed)**2, computed with jax.numpy
    so that it can be differentiated with jax.grad.
  """
  a, b, c = x[0], x[1], x[2]
  residuals = model(a, b, c, T) - log_Pv
  return jnp.sum(residuals**2)

# Iterative outlier rejection: fit, drop the points whose absolute deviation from the
# fit is too large, and refit on the survivors until a pass discards nothing.
N_SIGMA = 2  # retain points with |deviation| <= mean + N_SIGMA * std of the absolute deviations

# The gradient function does not change between passes, so build it once.
grad_sqerror = jax.grad(sqerror)

# Work on copies so the full data set (T, log_Pv) stays available for plotting.
T_retained = np.copy(T)
log_Pv_retained = np.copy(log_Pv)
while True:
    res = minimize(sqerror, x0=x0, args=(T_retained, log_Pv_retained),
                   tol=1e-10, jac=grad_sqerror)
    model_Pv = model(res.x[0],res.x[1],res.x[2], T_retained)

    # Retain only the points that lie within N_SIGMA standard deviations
    # of the mean absolute deviation between model and data.
    abs_devs = np.abs(model_Pv - log_Pv_retained)
    mean_deviation = np.mean(abs_devs)
    std_deviation = np.std(abs_devs)
    # `<=` rather than `<`: when every deviation is identical (std == 0, e.g. an exact
    # fit) a strict comparison would throw away all points instead of keeping them.
    retain_idx = np.flatnonzero(abs_devs <= mean_deviation + N_SIGMA*std_deviation)
    # Progress: points kept by this pass vs. points that entered it.
    print(len(retain_idx), T_retained.size)
    if len(retain_idx) == T_retained.size:
        # Nothing was rejected: `res` holds the fit to the final retained set.
        break
    T_retained = T_retained[retain_idx]
    log_Pv_retained = log_Pv_retained[retain_idx]



24 26
23 24
21 23
19 21
16 19
16 16


In [12]:
# Display the optimizer result of the last fit performed by the rejection loop.
# Inspect `success` and `message` before trusting `x`.
res

      fun: 0.00036402294838814545
 hess_inv: array([[7.77417699e+00, 6.04015750e+03, 6.13237171e+02],
       [6.04015750e+03, 4.76261564e+06, 4.89610877e+05],
       [6.13237171e+02, 4.89610877e+05, 5.10538552e+04]])
      jac: DeviceArray([ 1.27269795e-08, -3.91119348e-11,  2.23554403e-10], dtype=float64)
  message: 'Desired error not necessarily achieved due to precision loss.'
     nfev: 138
      nit: 39
     njev: 126
   status: 2
  success: False
        x: array([  10.21768753, 1761.74077418,  -35.52411587])

In [13]:
# Overlay the fitted curve on the existing data figure, evaluated on 100 evenly spaced
# points between the first and last entries of T.
T_model = jnp.linspace(T[0], T[-1], 100)
a_fit, b_fit, c_fit = res.x
antoine_curve = model(a_fit, b_fit, c_fit, T_model)
fig.add_scatter(x=T_model, y=antoine_curve, mode='lines', line_width=1, name='Antoine')
fig.show()